# IPL Win Probability Predictor

This notebook trains a logistic regression model to predict the win probability of the batting team in the second innings of an IPL match.

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

## 1. Data Loading

In [None]:
# Load the per-match summary table and the ball-by-ball delivery log.
match_df, delivery = (
    pd.read_csv(csv_path) for csv_path in ('matches.csv', 'deliveries.csv')
)

In [None]:
# Quick sanity check: (rows, columns) of the match and delivery tables.
(match_df.shape, delivery.shape)

## 2. Feature Engineering

In [None]:
# Build the chase target for every match: first-innings total + 1.
# 'total_runs' is selected *before* aggregating so that only that column is
# summed. Calling .sum() on the whole group also aggregates the text columns
# of the delivery table (string concatenation, or a TypeError) on pandas
# versions where numeric_only defaults to False.
total_score_df = (
    delivery.groupby(['match_id', 'inning'])['total_runs']
    .sum()
    .reset_index()
)

# .copy() yields an independent frame, so the increment below does not write
# into a filtered view (avoids SettingWithCopyWarning).
total_score_df = total_score_df[total_score_df['inning'] == 1].copy()
total_score_df['total_runs'] += 1  # Target score is total + 1

# Merge with match dataframe ('id' in the match table pairs with 'match_id')
match_df = match_df.merge(
    total_score_df[['match_id', 'total_runs']],
    left_on='id',
    right_on='match_id',
)

In [None]:
# Franchises kept for modelling; matches involving any other side are
# dropped by the filter below.
teams = [
    'Sunrisers Hyderabad',
    'Mumbai Indians',
    'Royal Challengers Bangalore',
    'Kolkata Knight Riders',
    'Kings XI Punjab',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Delhi Capitals',
]

# Legacy franchise name -> current franchise name.
team_renames = {
    'Delhi Daredevils': 'Delhi Capitals',
    'Deccan Chargers': 'Sunrisers Hyderabad',
}

# Standardize team names in EVERY column that carries one. team1/team2 are
# only used for filtering and are dropped below; the values the model actually
# sees come from 'winner' and from the delivery table's batting/bowling
# columns, so those must be renamed too. Otherwise the legacy names survive
# as separate categories in the training data.
for team_col in ['team1', 'team2', 'winner']:
    match_df[team_col] = match_df[team_col].replace(team_renames)
for team_col in ['batting_team', 'bowling_team']:
    delivery[team_col] = delivery[team_col].replace(team_renames)

# Filter for active teams (both sides must be in the list)
match_df = match_df[match_df['team1'].isin(teams) & match_df['team2'].isin(teams)]

# Include only non-DL matches
match_df = match_df[match_df['dl_applied'] == 0]

# Select relevant columns
match_df = match_df[['match_id', 'city', 'winner', 'total_runs']]

In [None]:
# Attach match-level columns to every delivery, then keep only the chase
# (second innings). Both tables have a 'total_runs' column, so the merge
# suffixes them: total_runs_x (from match_df) and total_runs_y (from delivery).
delivery_df = pd.merge(match_df, delivery, on='match_id')
delivery_df = delivery_df.loc[delivery_df['inning'].eq(2)]

In [None]:
# Running chase state after each delivery:
#   current_score - cumulative runs scored so far in the match's chase
#   runs_left     - target (total_runs_x) minus current_score
#   balls_left    - 126 - (over * 6 + ball)
#                   NOTE(review): this formula presumes 'over' is 1-based - confirm
delivery_df = delivery_df.assign(
    current_score=lambda df: df.groupby('match_id')['total_runs_y'].cumsum(),
    runs_left=lambda df: df['total_runs_x'] - df['current_score'],
    balls_left=lambda df: 126 - (df['over'] * 6 + df['ball']),
)

In [None]:
# Calculate wickets left.
# A non-null 'player_dismissed' entry marks a wicket on that ball, so the
# column is recoded to an integer flag: 1 = dismissal, 0 = none.
delivery_df['player_dismissed'] = delivery_df['player_dismissed'].notna().astype('int')

# Running count of dismissals within each match, subtracted from 10.
wickets = delivery_df.groupby('match_id')['player_dismissed'].cumsum().values
delivery_df['wickets'] = 10 - wickets

In [None]:
# Calculate run rates (runs per six balls).
#   cur_run_rate - runs scored so far over the balls already used (120 - balls_left)
#   req_run_rate - runs still needed over the balls remaining
delivery_df = delivery_df.assign(
    cur_run_rate=lambda df: df['current_score'].mul(6).div(120 - df['balls_left']),
    req_run_rate=lambda df: df['runs_left'].mul(6).div(df['balls_left']),
)

In [None]:
def result(row):
    """Return 1 if the row's batting team is also its match winner, else 0."""
    batting_side_won = row['batting_team'] == row['winner']
    if batting_side_won:
        return 1
    return 0

# Label each delivery: 1 when the batting (chasing) side went on to win the
# match, else 0. The vectorised comparison produces the same labels as
# applying result() row by row, without a Python-level call per delivery.
delivery_df['result'] = (
    delivery_df['batting_team'] == delivery_df['winner']
).astype(int)

In [None]:
# Select final features; 'result' is the label and stays the last column.
final_df = delivery_df[[
    'batting_team',
    'bowling_team',
    'city',
    'runs_left',
    'balls_left',
    'wickets',
    'total_runs_x',
    'cur_run_rate',
    'req_run_rate',
    'result',
]]

# Shuffle samples. The shuffle is seeded so that reruns give the same row
# order; an unseeded shuffle would make the later seeded split irreproducible.
final_df = final_df.sample(frac=1, random_state=1)

# Remove rows with missing or infinite values. Division by zero in the run
# rate columns yields +/-inf, which dropna() on its own would keep, so
# infinities are converted to NaN first.
final_df = final_df.replace([np.inf, -np.inf], np.nan).dropna()
final_df = final_df[final_df['balls_left'] != 0]

## 3. Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label; 'result' is the label column.
X = final_df.drop(columns=['result'])
y = final_df['result']

# Hold out 20% of the rows for evaluation (fixed seed for the split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Preprocessing: one-hot encode the three categorical columns (dropping the
# first level of each) and pass all remaining columns through unchanged.
trf = ColumnTransformer(
    transformers=[
        (
            'trf',
            OneHotEncoder(sparse_output=False, drop='first'),
            ['batting_team', 'bowling_team', 'city'],
        ),
    ],
    remainder='passthrough',
)

# Full model: preprocessing followed by a liblinear logistic regression.
pipe = Pipeline(
    steps=[
        ('step1', trf),
        ('step2', LogisticRegression(solver='liblinear')),
    ],
)

# Fit on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split.
y_pred = pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

## 4. Export Model

In [None]:
# Persist the fitted pipeline. The context manager closes the file handle
# (flushing the pickle to disk) even if serialisation raises, instead of
# leaving an unclosed file object behind.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)