In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv("t20_data.csv")

In [4]:
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner
536650,1001349,Melbourne Cricket Ground,Sri Lanka,field,Sri Lanka,Australia,2,35,1,23,8.4,169.0,8.28866,Sri Lanka
349084,1478867,Harare Sports Club,New Zealand,field,New Zealand,Zimbabwe,2,10,1,17,3.333333,121.0,6.466019,New Zealand
469200,1199519,Tolerance Oval,Jersey,bat,Jersey,United Arab Emirates,1,49,3,43,6.125,,,Jersey


In [5]:
data["batting_team_wins"] = (data["winner"] == data["batting_team"]).astype(int)
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner,batting_team_wins
616065,1407715,Udayana Cricket Ground,Cambodia,bat,Cambodia,Indonesia,1,4,2,13,1.333333,,,Indonesia,0
362804,1265223,Gahanga International Cricket Stadium. Rwanda,Namibia,bat,Nigeria,Namibia,2,27,3,66,2.454545,157.0,14.444444,Namibia,0
925366,682903,Sylhet Stadium,United Arab Emirates,bat,United Arab Emirates,Netherlands,1,86,4,69,7.166667,,,Netherlands,0


In [6]:
# Fill missing numeric with -1
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(-1)

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Categorical encoding
categorical_cols = ["venue", "batting_team", "bowling_team", "toss_winner", "toss_decision", "winner"]
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

In [9]:
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner,batting_team_wins
785501,1483942,5,4,1,70,4,1,19,0,10,5.7,-1.0,-1.0,65,1
48621,1482828,216,27,1,59,27,1,100,3,62,9.090909,-1.0,-1.0,55,1
441102,1321265,377,93,1,47,94,1,90,4,72,7.5,-1.0,-1.0,45,1


In [10]:
df_inn1 = data[data["innings"] == 1].copy()
df_inn2 = data[data["innings"] == 2].copy()

# Features for innings 1 → drop target-related
features_inn1 = [col for col in df_inn1.columns if col not in [
    "match_id", "winner", "batting_team_wins", 
    "target_runs", "runs_required", "required_rr"
]]

# Features for innings 2 → keep all
features_inn2 = [col for col in df_inn2.columns if col not in [
    "match_id", "winner", "batting_team_wins"
]]

In [11]:
from sklearn.model_selection import GroupShuffleSplit

# Split so same match doesn't leak between train/test
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx1, test_idx1 = next(gss1.split(df_inn1[features_inn1], df_inn1["batting_team_wins"], groups=df_inn1["match_id"]))

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx2, test_idx2 = next(gss2.split(df_inn2[features_inn2], df_inn2["batting_team_wins"], groups=df_inn2["match_id"]))

X1_train, X1_test = df_inn1[features_inn1].iloc[train_idx1], df_inn1[features_inn1].iloc[test_idx1]
y1_train, y1_test = df_inn1["batting_team_wins"].iloc[train_idx1], df_inn1["batting_team_wins"].iloc[test_idx1]

X2_train, X2_test = df_inn2[features_inn2].iloc[train_idx2], df_inn2[features_inn2].iloc[test_idx2]
y2_train, y2_test = df_inn2["batting_team_wins"].iloc[train_idx2], df_inn2["batting_team_wins"].iloc[test_idx2]

print("Train size:", X1_train.shape, "Test size:", X1_test.shape)
print("Train size:", X2_train.shape, "Test size:", X2_test.shape)

Train size: (439135, 11) Test size: (109610, 11)
Train size: (374039, 12) Test size: (94765, 12)


In [12]:
from lightgbm import LGBMClassifier

# First innings model
model_inn1 = LGBMClassifier(
    objective="binary",
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=1000,
    random_state=42
)

model_inn1.fit(
    X1_train, y1_train,
    eval_set=[(X1_test, y1_test)],
    eval_metric="auc",
)

# Second innings model
model_inn2 = LGBMClassifier(
    objective="binary",
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=1000,
    random_state=42
)

model_inn2.fit(
    X2_train, y2_train,
    eval_set=[(X2_test, y2_test)],
    eval_metric="auc",
)


[LightGBM] [Info] Number of positive: 218554, number of negative: 220581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1195
[LightGBM] [Info] Number of data points in the train set: 439135, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497692 -> initscore=-0.009232
[LightGBM] [Info] Start training from score -0.009232
[LightGBM] [Info] Number of positive: 169417, number of negative: 204622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1687
[LightGBM] [Info] Number of data points in the train set: 374039, number of used features: 11
[LightGBM] [I

In [13]:
from sklearn.metrics import roc_auc_score, accuracy_score

# First innings
y1_pred_proba = model_inn1.predict_proba(X1_test)[:, 1]
y1_pred = (y1_pred_proba >= 0.5).astype(int)
print("First Innings AUC:", roc_auc_score(y1_test, y1_pred_proba))
print("First Innings Accuracy:", accuracy_score(y1_test, y1_pred))

# Second innings
y2_pred_proba = model_inn2.predict_proba(X2_test)[:, 1]
y2_pred = (y2_pred_proba >= 0.5).astype(int)
print("Second Innings AUC:", roc_auc_score(y2_test, y2_pred_proba))
print("Second Innings Accuracy:", accuracy_score(y2_test, y2_pred))

First Innings AUC: 0.7241065552913248
First Innings Accuracy: 0.6604415655505884
Second Innings AUC: 0.8977234340106379
Second Innings Accuracy: 0.8136548303698623


In [None]:
import pickle

model_bundle = {
    "model_inn1": model_inn1,
    "model_inn2": model_inn2,
    "label_encoders": label_encoders,
    "features_inn1": features_inn1,
    "features_inn2": features_inn2
}

with open("pipe.pkl", "wb") as f:
    pickle.dump(model_bundle, f)

print("✅ Saved two-model setup to t20i_dual_models.pkl")


✅ Saved two-model setup to t20i_dual_models.pkl


In [91]:
data.head()

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,batting_team_wins
0,1442989,258,89,1,41,90,1,0,0,1,0.0,-1.0,-1.0,0
1,1442989,258,89,1,41,90,1,1,0,2,1.0,-1.0,-1.0,0
2,1442989,258,89,1,41,90,1,1,0,3,1.0,-1.0,-1.0,0
3,1442989,258,89,1,41,90,1,2,0,4,2.0,-1.0,-1.0,0
4,1442989,258,89,1,41,90,1,3,0,5,3.0,-1.0,-1.0,0
