# 🏏 IPL In-Play Match Winner Prediction
This notebook builds a machine learning model to predict the match winner after every ball using IPL data.

In [None]:
# Install required packages
%pip install xgboost lightgbm




In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


## Step 1: Load and Merge Data

In [22]:
matches = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IPL/matches.csv")
deliveries_raw = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/IPL/deliveries.csv")

# ---- Home Team Mapping ----
home_venues = {
    "Chennai Super Kings": ["MA Chidambaram Stadium"],
    "Mumbai Indians": ["Wankhede Stadium"],
    "Kolkata Knight Riders": ["Eden Gardens"],
    "Royal Challengers Bangalore": ["M Chinnaswamy Stadium"],
    "Sunrisers Hyderabad": ["Rajiv Gandhi International Stadium, Uppal"],
    "Delhi Capitals": ["Arun Jaitley Stadium", "Feroz Shah Kotla"],
    "Punjab Kings": ["Punjab Cricket Association IS Bindra Stadium"],
    "Rajasthan Royals": ["Sawai Mansingh Stadium"],
    "Gujarat Titans": ["Narendra Modi Stadium"],
    "Lucknow Super Giants": ["Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium"]
}
venue_to_home_team = {}
for team, venues in home_venues.items():
    for v in venues:
        venue_to_home_team[v] = team
matches["venue_home_team"] = matches["venue"].map(venue_to_home_team)

# Use ONLY first innings to calculate match target
first_innings = deliveries_raw[deliveries_raw["inning"] == 1]
targets = first_innings.groupby("match_id")["total_runs"].sum().reset_index()
targets.columns = ["match_id", "target"]

# Use 2nd innings for prediction
second_innings = deliveries_raw[deliveries_raw["inning"] == 2]

# Merge everything
df = second_innings.merge(matches[["id", "venue", "winner", "venue_home_team"]],
                          left_on="match_id", right_on="id", how="left")
df = df.merge(targets, on="match_id", how="left")


In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 2: Feature Engineering (Ball-by-Ball)

In [23]:
# ---- Feature Engineering ----
df['total_runs_so_far'] = df.groupby('match_id')['total_runs'].cumsum()
df['balls_so_far'] = df.groupby('match_id').cumcount() + 1
df['balls_left'] = 120 - df['balls_so_far']
df['current_run_rate'] = df['total_runs_so_far'] * 6 / df['balls_so_far']
df['runs_left'] = df['target'] - df['total_runs_so_far']
df['required_run_rate'] = df['runs_left'] * 6 / df['balls_left']
df['wickets_so_far'] = df.groupby('match_id')['is_wicket'].cumsum()
df['wickets_left'] = 10 - df['wickets_so_far']
df['run_rate_diff'] = df['current_run_rate'] - df['required_run_rate']
df['is_home_team'] = (df['batting_team'] == df['venue_home_team']).astype(int)

# Clean
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['required_run_rate'], inplace=True)

# ---- Target ----
df['batting_team_won'] = (df['batting_team'] == df['winner']).astype(int)



In [24]:
def get_match_phase(over):
    if over <= 6:
        return 'Powerplay'
    elif over <= 15:
        return 'Middle'
    else:
        return 'Death'
df['match_phase'] = df['over'].apply(get_match_phase)

# One-Hot Encode the 'match_phase' column
phase_dummies = pd.get_dummies(df['match_phase'], prefix='phase', dtype=int)
df = pd.concat([df, phase_dummies], axis=1)


df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Now, fill any and all NaN values with 0. This PRESERVES your data.
df.fillna(0, inplace=True)


In [27]:
# --- 2. Create the 'wicket_pressure' feature ---
# This feature multiplies the required run rate by how many wickets have been lost.
# (11 - wickets_left) gives us wickets lost (1 for the first wicket, 10 for the last).
# We add 1 to the denominator to avoid dividing by zero if wickets_left is 0.
df['wicket_pressure'] = (df['required_run_rate'] * (11 - df['wickets_left']))

# Let's look at some examples to see how it behaves
print("\nWicket Pressure Feature Examples:")
print(df[['required_run_rate', 'wickets_left', 'wicket_pressure']].tail(5))


Wicket Pressure Feature Examples:
        required_run_rate  wickets_left  wicket_pressure
125736           0.315789             8         0.947368
125737           0.214286             8         0.642857
125738           0.109091             8         0.327273
125739           0.000000             8         0.000000
125740          -0.113208             8        -0.339623


In [25]:
# Create the 'danger_index' feature
# This feature is designed to be extremely sensitive to the number of wickets left.
df['danger_index'] = df['required_run_rate'] / (df['wickets_left'] + 0.1)

## Step 3: Encode Categorical Variables

Encoding complete using persistent encoders.


In [28]:
for col in ['batting_team', 'bowling_team', 'venue', 'venue_home_team']:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
# Final feature list
features_final = [
    'batting_team', 'bowling_team', 'venue',
    'balls_so_far', 'balls_left',
    'total_runs_so_far', 'runs_left',
    'current_run_rate', 'required_run_rate',
    'wickets_left', 'run_rate_diff', 'is_home_team', 'phase_Middle',
    'phase_Death', 'wicket_pressure', 'danger_index'
]
X = df[features_final]
y = df['batting_team_won']



## Step 4: Train-Test Split and Model Training

In [29]:
# New, better code
X = df[features_final]
y = df['batting_team_won']

# Use the entire dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [30]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)
y = y[X.index]  # Align target with filtered features


models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier()
}

for name, model in models.items():
    print(f"\n🔍 Model: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(report)


🔍 Model: Random Forest
Accuracy: 0.9732
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     11996
           1       0.97      0.98      0.97     13028

    accuracy                           0.97     25024
   macro avg       0.97      0.97      0.97     25024
weighted avg       0.97      0.97      0.97     25024


🔍 Model: XGBoost
Accuracy: 0.9734
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     11996
           1       0.97      0.97      0.97     13028

    accuracy                           0.97     25024
   macro avg       0.97      0.97      0.97     25024
weighted avg       0.97      0.97      0.97     25024


🔍 Model: LightGBM
[LightGBM] [Info] Number of positive: 52111, number of negative: 47985
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n

In [31]:
import pickle

# Assume LightGBM was best
best_model = models["XGBoost"]

# Save the trained model
with open("xgb_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Best model saved as xgb_model.pkl")

Best model saved as xgb_model.pkl
