<a href="https://colab.research.google.com/github/Sarfarazzzzz/IPL_Prediction/blob/main/ipl_inplay_prediction_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏏 IPL In-Play Match Winner Prediction
This notebook trains machine learning models that predict, after every ball of the second innings (the run chase), whether the batting team will go on to win, using IPL ball-by-ball data.

In [None]:
# Install the gradient-boosting libraries imported below (xgboost, lightgbm).
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install xgboost lightgbm


In [None]:
import warnings

import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')


## Step 1: Load and Merge Data

In [None]:
# Google Drive must be mounted before the CSVs under /content/drive can be read.
# Mounting here keeps this cell runnable on a fresh kernel (Restart -> Run All).
from google.colab import drive
drive.mount('/content/drive')

DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/IPL"
matches = pd.read_csv(f"{DATA_DIR}/matches.csv")
deliveries_raw = pd.read_csv(f"{DATA_DIR}/deliveries.csv")

# ---- Home Team Mapping ----
# Team -> list of venue names treated as that team's home ground.
# NOTE(review): venues are matched by exact string; any spelling variant in
# matches.csv that is not listed here maps to NaN — confirm against the data.
home_venues = {
    "Chennai Super Kings": ["MA Chidambaram Stadium"],
    "Mumbai Indians": ["Wankhede Stadium"],
    "Kolkata Knight Riders": ["Eden Gardens"],
    "Royal Challengers Bangalore": ["M Chinnaswamy Stadium"],
    "Sunrisers Hyderabad": ["Rajiv Gandhi International Stadium, Uppal"],
    "Delhi Capitals": ["Arun Jaitley Stadium", "Feroz Shah Kotla"],
    "Punjab Kings": ["Punjab Cricket Association IS Bindra Stadium"],
    "Rajasthan Royals": ["Sawai Mansingh Stadium"],
    "Gujarat Titans": ["Narendra Modi Stadium"],
    "Lucknow Super Giants": ["Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium"]
}
# Invert to venue -> home team so it can be mapped onto the matches table.
venue_to_home_team = {
    venue: team
    for team, venues in home_venues.items()
    for venue in venues
}
matches["venue_home_team"] = matches["venue"].map(venue_to_home_team)

# Use ONLY the first innings to calculate the match target.
# The chasing side needs one run MORE than the first-innings total to win,
# hence the +1 (a bare sum would be the score to tie, not the target).
first_innings = deliveries_raw[deliveries_raw["inning"] == 1]
targets = (
    first_innings.groupby("match_id")["total_runs"].sum()
    .add(1)
    .rename("target")
    .reset_index()
)

# Use 2nd innings for prediction
second_innings = deliveries_raw[deliveries_raw["inning"] == 2]

# Merge everything: one row per second-innings delivery, enriched with the
# match-level venue / winner / home-team columns and the chase target.
df = second_innings.merge(matches[["id", "venue", "winner", "venue_home_team"]],
                          left_on="match_id", right_on="id", how="left")
df = df.merge(targets, on="match_id", how="left")


In [None]:
# Mount Google Drive at /content/drive. The CSV paths used in this notebook
# live under that mount point, so this must have run before they are read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Step 2: Feature Engineering (Ball-by-Ball)

In [None]:
# ---- Feature Engineering ----
df['total_runs_so_far'] = df.groupby('match_id')['total_runs'].cumsum()
df['balls_so_far'] = df.groupby('match_id').cumcount() + 1
df['balls_left'] = 120 - df['balls_so_far']
df['current_run_rate'] = df['total_runs_so_far'] * 6 / df['balls_so_far']
df['runs_left'] = df['target'] - df['total_runs_so_far']
df['required_run_rate'] = df['runs_left'] * 6 / df['balls_left']
df['wickets_so_far'] = df.groupby('match_id')['is_wicket'].cumsum()
df['wickets_left'] = 10 - df['wickets_so_far']
df['run_rate_diff'] = df['current_run_rate'] - df['required_run_rate']
df['is_home_team'] = (df['batting_team'] == df['venue_home_team']).astype(int)

# Clean
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(subset=['required_run_rate'], inplace=True)

# ---- Target ----
df['batting_team_won'] = (df['batting_team'] == df['winner']).astype(int)


## Step 3: Encode Categorical Variables

In [None]:
# Integer-encode the categorical columns in place. Each fitted encoder is kept
# in `label_encoders` so a code can be mapped back to its name (and a name to
# its code) when building inputs for prediction.
# NOTE: run this once per freshly built df — re-running would encode the
# integer codes again as strings.
label_encoders = {}
for col in ['batting_team', 'bowling_team', 'venue', 'venue_home_team']:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

# Final feature list
features_final = [
    'batting_team', 'bowling_team', 'venue',
    'balls_so_far', 'balls_left',
    'total_runs_so_far', 'runs_left',
    'current_run_rate', 'required_run_rate',
    'wickets_left', 'run_rate_diff', 'is_home_team'
]
# Explicit copy so later changes to X never touch (or warn about) df.
X = df[features_final].copy()
y = df['batting_team_won']



## Step 4: Train-Test Split and Model Training

In [None]:
# Work on a random subset of deliveries to keep training fast; cap the sample
# at the number of available rows so the cell cannot fail on a small dataset.
SAMPLE_SIZE = 5000
df_sampled = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
X_sampled = df_sampled[features_final]
y_sampled = df_sampled['batting_team_won']

# Split by match, not by ball: every delivery of a match carries the same
# label, so a ball-level random split would put the same match in both train
# and test and leak the outcome. test_size is the share of matches held out.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(
    splitter.split(X_sampled, y_sampled, groups=df_sampled['match_id'])
)
X_train, X_test = X_sampled.iloc[train_pos], X_sampled.iloc[test_pos]
y_train, y_test = y_sampled.iloc[train_pos], y_sampled.iloc[test_pos]



In [None]:
# Tidy the full feature matrix / labels. Reassigning (instead of inplace=True)
# avoids mutating a slice of df. Note the models below are fitted on
# X_train / y_train, not on X / y.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # Align target with filtered features


# Candidate classifiers, all seeded for reproducible results.
# `use_label_encoder` is omitted: it is deprecated in recent XGBoost releases.
# verbose=-1 silences LightGBM's per-fit info log.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1)
}

# Fit each model on the training split and report held-out performance.
for name, model in models.items():
    print(f"\n🔍 Model: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))



🔍 Model: Random Forest
Accuracy: 0.8100
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       483
           1       0.81      0.83      0.82       517

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000


🔍 Model: XGBoost
Accuracy: 0.8370
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       483
           1       0.83      0.86      0.84       517

    accuracy                           0.84      1000
   macro avg       0.84      0.84      0.84      1000
weighted avg       0.84      0.84      0.84      1000


🔍 Model: LightGBM
[LightGBM] [Info] Number of positive: 2070, number of negative: 1930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

In [None]:
def _chase_snapshot(balls_so_far, total_runs_so_far, runs_left, wickets_left):
    """Return one feature row for a hand-made chase scenario.

    Team and venue codes and the home-team flag are fixed (3, 5, 2, 1); the
    ball/rate features are derived from the four arguments, assuming a
    120-ball innings.
    """
    remaining_balls = 120 - balls_so_far
    current_rr = total_runs_so_far * 6 / balls_so_far
    required_rr = runs_left * 6 / remaining_balls
    return {
        "batting_team": 3,
        "bowling_team": 5,
        "venue": 2,
        "balls_so_far": balls_so_far,
        "balls_left": remaining_balls,
        "total_runs_so_far": total_runs_so_far,
        "runs_left": runs_left,
        "current_run_rate": current_rr,
        "required_run_rate": required_rr,
        "wickets_left": wickets_left,
        "run_rate_diff": current_rr - required_rr,
        "is_home_team": 1,
    }


# Three snapshots of the same chase: early, halfway and near the end.
test_cases = pd.DataFrame([
    _chase_snapshot(balls_so_far=12, total_runs_so_far=10, runs_left=150, wickets_left=9),   # 2 overs
    _chase_snapshot(balls_so_far=60, total_runs_so_far=70, runs_left=90, wickets_left=7),    # 10 overs
    _chase_snapshot(balls_so_far=108, total_runs_so_far=140, runs_left=20, wickets_left=5),  # 18 overs
])

In [None]:
# Score the hand-made scenarios with every trained model.
# Columns are selected in training order so each feature lines up with the
# column the model was fitted on.
scenario_features = test_cases[features_final]
for name, model in models.items():
    preds = model.predict(scenario_features)
    probs = model.predict_proba(scenario_features)
    # Look up which predict_proba column is class 1 (batting team wins)
    # instead of assuming it is the second one.
    win_col = list(model.classes_).index(1)
    print(f"\n🔍 {name}")
    # Iterate the column and arrays together by position: iterrows() would
    # upcast the ball count to float ("12.0") and hand back index labels,
    # which are not safe to use as positions into the numpy arrays.
    for ball, win_prob, pred in zip(test_cases['balls_so_far'], probs[:, win_col], preds):
        print(f"Ball {ball} → Win Prob: {win_prob:.2f} | Prediction: {'WIN' if pred == 1 else 'LOSE'}")


🔍 Random Forest
Ball 12.0 → Win Prob: 0.59 | Prediction: WIN
Ball 60.0 → Win Prob: 0.66 | Prediction: WIN
Ball 108.0 → Win Prob: 0.67 | Prediction: WIN

🔍 XGBoost
Ball 12.0 → Win Prob: 0.68 | Prediction: WIN
Ball 60.0 → Win Prob: 0.92 | Prediction: WIN
Ball 108.0 → Win Prob: 0.94 | Prediction: WIN

🔍 LightGBM
Ball 12.0 → Win Prob: 0.71 | Prediction: WIN
Ball 60.0 → Win Prob: 0.85 | Prediction: WIN
Ball 108.0 → Win Prob: 0.75 | Prediction: WIN
