In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("t20_data.csv.zip", compression="zip")

In [3]:
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner
137944,1394775,"UKM-YSD Cricket Oval, Bangi",Hong Kong,field,Thailand,Hong Kong,1,42,1,52,4.666667,,,Thailand
235634,1457227,University of Doha for Science and Technology,United Arab Emirates,bat,United Arab Emirates,Thailand,1,45,2,33,7.5,,,United Arab Emirates
878058,1282738,"Desert Springs Cricket Ground, Almeria",Germany,field,Jersey,Germany,1,38,1,40,5.428571,,,Jersey


In [4]:
data["batting_team_wins"] = (data["winner"] == data["batting_team"]).astype(int)
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner,batting_team_wins
10135,1477669,"Botswana Cricket Association Oval 2, Gaborone",Eswatini,field,Sierra Leone,Eswatini,1,89,3,54,9.888889,,,Sierra Leone,1
254583,571150,Dubai International Cricket Stadium,Pakistan,field,Australia,Pakistan,1,140,2,87,9.333333,,,Australia,1
163499,1344512,"Buffalo Park, East London",South Africa,bat,South Africa,West Indies,1,96,2,89,6.4,,,South Africa,1


In [5]:
# Fill missing numeric with -1
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(-1)

In [7]:
mask_pp = data['balls_faced'] >= 36
if mask_pp.any():
    pp_data = data[data['balls_faced'] <= 36]
    data.loc[mask_pp, 'pp_runs'] = pp_data['runs_so_far'].max()
    data.loc[mask_pp, 'pp_wickets'] = pp_data['wickets_so_far'].max()

# Middle overs: 6–15 overs (balls 36–95)
mask_mid = data['balls_faced'] >= 90
if mask_mid.any():
    mid_data = data[(data['balls_faced'] >= 36) & (data['balls_faced'] <= 90)]
    data.loc[mask_mid, 'mid_runs'] = mid_data['runs_so_far'].max() - data['pp_runs']
    data.loc[mask_mid, 'mid_wickets'] = mid_data['wickets_so_far'].max() - data['pp_wickets']

# Death overs: 16–20 overs (balls 96–119)
mask_death = data['balls_faced'] >= 96
if mask_death.any():
    death_data = data[data['balls_faced'] >= 96]
    data.loc[mask_death, 'death_runs'] = death_data['runs_so_far'].max() - data['pp_runs'] - data['mid_runs'].fillna(0)
    data.loc[mask_death, 'death_wickets'] = death_data['wickets_so_far'].max() - data['pp_wickets'] - data['mid_wickets'].fillna(0)

data.sample(5)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,...,target_runs,required_run_rate,winner,batting_team_wins,pp_runs,pp_wickets,mid_runs,mid_wickets,death_runs,death_wickets
502595,1432442,Dubai International Cricket Stadium,South Africa,field,South Africa,Australia,2,27,1,28,...,135.0,7.043478,South Africa,1,,,,,,
990581,1438092,"Scott Page Field, Vinor",Croatia,bat,Gibraltar,Croatia,2,74,5,96,...,128.0,13.5,Croatia,0,116.0,9.0,128.0,1.0,100.0,1.0
943047,1419917,"Bayuemas Oval, Kuala Lumpur",Nepal,field,Bhutan,Nepal,1,49,2,95,...,-1.0,-1.0,Nepal,0,116.0,9.0,128.0,1.0,,
462712,1421079,Dubai International Cricket Stadium,United Arab Emirates,field,Scotland,United Arab Emirates,1,72,4,60,...,-1.0,-1.0,Scotland,1,116.0,9.0,,,,
995942,895817,Adelaide Oval,Australia,field,Australia,India,2,129,6,96,...,189.0,15.0,India,0,116.0,9.0,128.0,1.0,100.0,1.0


In [17]:
for phase in ['pp_runs', 'mid_runs', 'death_runs']:
    data[f"{phase}_known"] = data[phase].notna().astype(int)

In [18]:
data.sample(5)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,...,batting_team_wins,pp_runs,pp_wickets,mid_runs,mid_wickets,death_runs,death_wickets,pp_runs_known,mid_runs_known,death_runs_known
910577,1200428,372,65,0,81,66,2,9,1,13,...,1,,,,,,,0,0,0
412489,1214766,5,33,0,33,70,1,78,2,83,...,1,116.0,9.0,,,,,1,0,0
424611,1478866,139,86,1,87,67,2,9,0,7,...,0,,,,,,,0,0,0
12974,1310180,216,76,0,24,77,2,167,9,118,...,0,116.0,9.0,128.0,1.0,100.0,1.0,1,1,1
866200,1157374,91,70,0,1,71,2,60,2,42,...,0,116.0,9.0,,,,,1,0,0


In [28]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['venue', 'toss_winner', 'toss_decision', 'batting_team', 'bowling_team']:
    le = LabelEncoder()
    
    # Fit on all unique values in the FULL dataset (df), not just training split
    all_values = data[col].dropna().unique().tolist()
    le.fit(all_values)
    
    # Transform the full column now
    data[col] = le.transform(data[col])
    
    # Save encoder
    label_encoders[col] = le


In [29]:
data.sample(3)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,...,batting_team_wins,pp_runs,pp_wickets,mid_runs,mid_wickets,death_runs,death_wickets,pp_runs_known,mid_runs_known,death_runs_known
251562,1488321,321,42,0,43,12,1,113,6,110,...,0,116.0,9.0,128.0,1.0,100.0,1.0,1,1,1
451465,1282273,131,34,0,101,35,2,69,0,40,...,1,116.0,9.0,,,,,1,0,0
827348,1188784,17,104,0,3,82,1,19,0,38,...,1,116.0,9.0,,,,,1,0,0


In [30]:
data_inn1 = data[data["innings"] == 1].copy()
data_inn2 = data[data["innings"] == 2].copy()

# Features for innings 1 → drop target-related
features_inn1 = [col for col in data_inn1.columns if col not in [
    "match_id", "winner", "batting_team_wins", 
    "target_runs", "runs_required", "required_rr"
]]

# Features for innings 2 → keep all
features_inn2 = [col for col in data_inn2.columns if col not in [
    "match_id", "winner", "batting_team_wins"
]]

In [31]:
from sklearn.model_selection import GroupShuffleSplit

# Split so same match doesn't leak between train/test
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx1, test_idx1 = next(gss1.split(data_inn1[features_inn1], data_inn1["batting_team_wins"], groups=data_inn1["match_id"]))

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx2, test_idx2 = next(gss2.split(data_inn2[features_inn2], data_inn2["batting_team_wins"], groups=data_inn2["match_id"]))

X1_train, X1_test = data_inn1[features_inn1].iloc[train_idx1], data_inn1[features_inn1].iloc[test_idx1]
y1_train, y1_test = data_inn1["batting_team_wins"].iloc[train_idx1], data_inn1["batting_team_wins"].iloc[test_idx1]

X2_train, X2_test = data_inn2[features_inn2].iloc[train_idx2], data_inn2[features_inn2].iloc[test_idx2]
y2_train, y2_test = data_inn2["batting_team_wins"].iloc[train_idx2], data_inn2["batting_team_wins"].iloc[test_idx2]

print("Train size:", X1_train.shape, "Test size:", X1_test.shape)
print("Train size:", X2_train.shape, "Test size:", X2_test.shape)

Train size: (439135, 20) Test size: (109610, 20)
Train size: (374039, 21) Test size: (94765, 21)


In [32]:
from lightgbm import LGBMClassifier

# First innings model
model_inn1 = LGBMClassifier(
    objective="binary",
    learning_rate=0.002,
    num_leaves=31,
    n_estimators=1200,
    random_state=42
)

model_inn1.fit(
    X1_train, y1_train,
    eval_set=[(X1_test, y1_test)],
    eval_metric="auc",
)


[LightGBM] [Info] Number of positive: 218554, number of negative: 220581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1222
[LightGBM] [Info] Number of data points in the train set: 439135, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497692 -> initscore=-0.009232
[LightGBM] [Info] Start training from score -0.009232


In [33]:
# Second innings model
model_inn2 = LGBMClassifier(
    objective="binary",
    learning_rate=0.002,
    num_leaves=31,
    n_estimators=1000,
    random_state=42
)

model_inn2.fit(
    X2_train, y2_train,
    eval_set=[(X2_test, y2_test)],
    eval_metric="auc",
)

[LightGBM] [Info] Number of positive: 169417, number of negative: 204622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1713
[LightGBM] [Info] Number of data points in the train set: 374039, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.452939 -> initscore=-0.188801
[LightGBM] [Info] Start training from score -0.188801


In [34]:
from sklearn.metrics import roc_auc_score, accuracy_score

# First innings
y1_pred_proba = model_inn1.predict_proba(X1_test)[:, 1]
y1_pred = (y1_pred_proba >= 0.5).astype(int)
print("First Innings AUC:", roc_auc_score(y1_test, y1_pred_proba))
print("First Innings Accuracy:", accuracy_score(y1_test, y1_pred))

# Second innings
y2_pred_proba = model_inn2.predict_proba(X2_test)[:, 1]
y2_pred = (y2_pred_proba >= 0.5).astype(int)
print("Second Innings AUC:", roc_auc_score(y2_test, y2_pred_proba))
print("Second Innings Accuracy:", accuracy_score(y2_test, y2_pred))

First Innings AUC: 0.7322900785089685
First Innings Accuracy: 0.6660249977191862
Second Innings AUC: 0.9121468636193957
Second Innings Accuracy: 0.8271091647760249


In [35]:
import pickle

model_bundle = {
    "model_inn1": model_inn1,
    "model_inn2": model_inn2,
    "label_encoders": label_encoders,
    "features_inn1": features_inn1,
    "features_inn2": features_inn2
}

with open("pipe.pkl", "wb") as f:
    pickle.dump(model_bundle, f)

print("✅ Saved two-model setup to pipe.pkl")


✅ Saved two-model setup to pipe.pkl


In [27]:
data.columns

Index(['match_id', 'venue', 'toss_winner', 'toss_decision', 'batting_team',
       'bowling_team', 'innings', 'runs_so_far', 'wickets_so_far',
       'balls_faced', 'run_rate', 'target_runs', 'required_run_rate', 'winner',
       'batting_team_wins', 'pp_runs', 'pp_wickets', 'mid_runs', 'mid_wickets',
       'death_runs', 'death_wickets', 'pp_runs_known', 'mid_runs_known',
       'death_runs_known'],
      dtype='object')

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

param_grid_logistic = {
    'C': [0.01, 0.1, 1],       # Regularization strength
    'penalty': ['l2'],            # Norm used in the penalization
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}
inn1_model_logistic = LogisticRegression(max_iter=1000)
inn1_grid_search_logistic = GridSearchCV(inn1_model_logistic, param_grid_logistic, cv=3, scoring='roc_auc', n_jobs=-1)
inn1_grid_search_logistic.fit(X1_train, y1_train)

inn2_model_logistic = LogisticRegression(max_iter=1000)
inn2_grid_search_logistic = GridSearchCV(inn2_model_logistic, param_grid_logistic, cv=3, scoring='roc_auc', n_jobs=-1)
inn2_grid_search_logistic.fit(X2_train, y2_train)

print("Best Parameters (inning 1):", inn1_grid_search_logistic.best_params_)
print("Best Cross-validation Accuracy (inning 1):", inn1_grid_search_logistic.best_score_)

print("Best Parameters (inning 2):", inn2_grid_search_logistic.best_params_)
print("Best Cross-validation Accuracy (inning 2):", inn2_grid_search_logistic.best_score_)




Best Parameters (inning 1): {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}
Best Cross-validation Accuracy (inning 1): 0.7328448976686911
Best Parameters (inning 2): {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Cross-validation Accuracy (inning 2): 0.9111782693816367


In [None]:
# First innings
y1_pred_proba_logistic = inn1_grid_search_logistic.predict_proba(X1_test)[:, 1]
y1_pred_logistic = (y1_pred_proba_logistic >= 0.5).astype(int)
print("First Innings AUC:", roc_auc_score(y1_test, y1_pred_proba_logistic))
print("First Innings Accuracy:", accuracy_score(y1_test, y1_pred_logistic))

# Best Parameters (inning 1): {'C': 0.01, 'penalty': 'l2', 'solver': 'sag'}

# Second innings
y2_pred_proba_logistic = inn2_grid_search_logistic.predict_proba(X2_test)[:, 1]
y2_pred_logistic = (y2_pred_proba_logistic >= 0.5).astype(int)
print("Second Innings AUC:", roc_auc_score(y2_test, y2_pred_proba_logistic))
print("Second Innings Accuracy:", accuracy_score(y2_test, y2_pred_logistic))

# Best Parameters (inning 2): {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

First Innings AUC: 0.7216775825908632
First Innings Accuracy: 0.6606514004196697
Second Innings AUC: 0.9113077265293417
Second Innings Accuracy: 0.8232153221125943


In [51]:
import xgboost as xgb

param_grid_xgb = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

inn1_model_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
inn1_grid_search_xgb = GridSearchCV(inn1_model_xgb, param_grid_xgb, cv=3, scoring='roc_auc', n_jobs=-1)
inn1_grid_search_xgb.fit(X1_train, y1_train)

inn2_model_xgb = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
inn2_grid_search_xgb = GridSearchCV(inn2_model_xgb, param_grid_xgb, cv=3, scoring='roc_auc', n_jobs=-1)
inn2_grid_search_xgb.fit(X2_train, y2_train)

print("Best Parameters (inning 1):", inn1_grid_search_xgb.best_params_)
print("Best Cross-validation Accuracy (inning 1):", inn1_grid_search_xgb.best_score_)

print("Best Parameters (inning 2):", inn2_grid_search_xgb.best_params_)
print("Best Cross-validation Accuracy (inning 2):", inn2_grid_search_xgb.best_score_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Parameters (inning 1): {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}
Best Cross-validation Accuracy (inning 1): 0.7601113976241507
Best Parameters (inning 2): {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.7}
Best Cross-validation Accuracy (inning 2): 0.9156274799659169


In [52]:
# First innings
y1_pred_proba_xgb = inn1_grid_search_xgb.predict_proba(X1_test)[:, 1]
y1_pred_xgb = (y1_pred_proba_xgb >= 0.5).astype(int)
print("First Innings AUC:", roc_auc_score(y1_test, y1_pred_proba_xgb))
print("First Innings Accuracy:", accuracy_score(y1_test, y1_pred_xgb))

# Best Parameters (inning 1): {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.7}

# Second innings
y2_pred_proba_xgb = inn2_grid_search_xgb.predict_proba(X2_test)[:, 1]
y2_pred_xgb = (y2_pred_proba_xgb >= 0.5).astype(int)
print("Second Innings AUC:", roc_auc_score(y2_test, y2_pred_proba_xgb))
print("Second Innings Accuracy:", accuracy_score(y2_test, y2_pred_xgb))

# Best Parameters (inning 2): {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.7}

First Innings AUC: 0.7623782905767983
First Innings Accuracy: 0.6896086123528875
Second Innings AUC: 0.9144360739700117
Second Innings Accuracy: 0.8263177333403683
