In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
data = pd.read_csv("t20_data_clean.csv.zip", compression="zip")

In [45]:
data.sample(3)

Unnamed: 0,match_id,dates,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner
383316,287865,2007-09-16,New Wanderers Stadium,India,field,India,New Zealand,2,45,0,22,10.8,191.0,8.938776,New Zealand
583499,1457218,2024-11-19,University of Doha for Science and Technology,Bhutan,field,United Arab Emirates,Bhutan,1,82,2,58,8.2,,,United Arab Emirates
704564,1483070,2025-05-10,"Marina Ground, Corfu",Greece,field,Greece,Germany,2,40,5,62,3.58209,167.0,13.137931,Germany


In [46]:
data["dates"] = pd.to_datetime(data["dates"], errors="coerce")
data["year"] = data["dates"].dt.year

In [47]:
data["batting_team_wins"] = (data["winner"] == data["batting_team"]).astype(int)

In [48]:
# Fill missing numeric with -1
numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols] = data[numeric_cols].fillna(-1)
data.drop(columns=["dates"], inplace=True)

In [49]:
data.sample(5)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner,year,batting_team_wins
121475,1289273,"Edgbaston, Birmingham",England,bat,England,New Zealand,1,46,3,42,6.571429,-1.0,-1.0,New Zealand,2022,0
465597,1430812,"Pembroke Cricket Club, Sandymount, Dublin",Sri Lanka,field,Sri Lanka,Ireland,2,83,1,60,8.3,146.0,6.3,Sri Lanka,2024,1
629997,1486228,"Ballpark Ground, Graz",Austria,bat,Slovenia,Austria,2,20,1,21,5.0,245.0,13.636364,Austria,2025,0
712441,1384589,Marsa Sports Club,Malta,field,Luxembourg,Malta,1,55,2,55,5.409836,-1.0,-1.0,Malta,2023,0
435451,1370905,"United Cricket Club Ground, Windhoek",Hong Kong,bat,Hong Kong,Uganda,1,16,2,35,2.666667,-1.0,-1.0,Uganda,2023,0


In [50]:
data_inn1 = data[data["innings"] == 1].copy()
data_inn2 = data[data["innings"] == 2].copy()

In [51]:
# get final score of innings 1 per match
first_innings_final = (
    data_inn1[data_inn1["innings"] == 1]
    .groupby("match_id")["runs_so_far"]
    .max()
    .reset_index()
    .rename(columns={"runs_so_far": "final_1st_score"})
)

data_inn1 = data_inn1.merge(first_innings_final, on="match_id", how="left")
data_inn1.sample(2)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,run_rate,target_runs,required_run_rate,winner,year,batting_team_wins,final_1st_score
432111,533297,R Premadasa Stadium,West Indies,bat,West Indies,Australia,1,8,0,10,4.0,-1.0,-1.0,West Indies,2012,1,205
384408,1407098,"United Cricket Club Ground, Windhoek",Tanzania,bat,Tanzania,Nigeria,1,99,4,91,6.1875,-1.0,-1.0,Nigeria,2023,0,139


In [52]:
venue_stats = data_inn1[data_inn1["innings"] == 1].groupby("venue").agg(
    avg_1st_score=("final_1st_score", "mean"),
    median_1st_score=("final_1st_score", "median"),
    std_1st_score=("final_1st_score", "std"),
    win_rate_bat_first=("batting_team_wins", "mean")
).reset_index()

data_inn1 = data_inn1.merge(venue_stats, on="venue", how="left")
data_inn1.sample(2)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,...,target_runs,required_run_rate,winner,year,batting_team_wins,final_1st_score,avg_1st_score,median_1st_score,std_1st_score,win_rate_bat_first
22244,1485343,"Stars Arena Hofstade, Zemst",Belgium,bat,Belgium,Portugal,1,98,7,75,...,-1.0,-1.0,Portugal,2025,0,162,131.511921,131.0,36.201311,0.290323
512042,1344789,Al Amerat Cricket Ground Oman Cricket (Ministr...,Bahrain,field,Canada,Bahrain,1,24,3,34,...,-1.0,-1.0,Bahrain,2022,0,160,150.518426,147.0,36.610471,0.446809


In [53]:
data_inn1["score_vs_avg"] = data_inn1["runs_so_far"] - data_inn1["avg_1st_score"]
data_inn1.sample(2)

Unnamed: 0,match_id,venue,toss_winner,toss_decision,batting_team,bowling_team,innings,runs_so_far,wickets_so_far,balls_faced,...,required_run_rate,winner,year,batting_team_wins,final_1st_score,avg_1st_score,median_1st_score,std_1st_score,win_rate_bat_first,score_vs_avg
324678,1486091,"Simar Cricket Ground, Rome",Spain,field,Germany,Spain,1,101,4,100,...,-1.0,Germany,2025,1,139,138.777686,128.0,36.163591,0.535048,-37.777686
531765,1450466,"Sikh Union Club Ground, Nairobi",Rwanda,bat,Rwanda,Kenya,1,86,7,98,...,-1.0,Rwanda,2024,1,110,140.08008,149.0,28.020445,0.501001,-54.08008


In [54]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in ['venue', 'toss_winner', 'toss_decision', 'batting_team', 'bowling_team']:
    le = LabelEncoder()
    
    # Fit on all unique values in the FULL dataset (df), not just training split
    all_values = data_inn1[col].dropna().unique().tolist()
    le.fit(all_values)
    
    # Transform the full column now
    data_inn1[col] = le.transform(data_inn1[col])
    
    # Save encoder
    label_encoders[col] = le


In [55]:
label_encoders = {}
for col in ['venue', 'toss_winner', 'toss_decision', 'batting_team', 'bowling_team']:
    le = LabelEncoder()
    
    # Fit on all unique values in the FULL dataset (df), not just training split
    all_values = data_inn2[col].dropna().unique().tolist()
    le.fit(all_values)
    
    # Transform the full column now
    data_inn2[col] = le.transform(data_inn2[col])
    
    # Save encoder
    label_encoders[col] = le

In [65]:
# Features for innings 1 → drop target-related
features_inn1 = [col for col in data_inn1.columns if col not in [
    "match_id", "winner", "batting_team_wins", 
    "target_runs", "runs_required", "required_rr", 'final_1st_score'
]]

# Features for innings 2 → keep all
features_inn2 = [col for col in data_inn2.columns if col not in [
    "match_id", "winner", "batting_team_wins"
]]

In [66]:
features_inn1

['venue',
 'toss_winner',
 'toss_decision',
 'batting_team',
 'bowling_team',
 'innings',
 'runs_so_far',
 'wickets_so_far',
 'balls_faced',
 'run_rate',
 'required_run_rate',
 'year',
 'avg_1st_score',
 'median_1st_score',
 'std_1st_score',
 'win_rate_bat_first',
 'score_vs_avg']

In [67]:
features_inn2

['venue',
 'toss_winner',
 'toss_decision',
 'batting_team',
 'bowling_team',
 'innings',
 'runs_so_far',
 'wickets_so_far',
 'balls_faced',
 'run_rate',
 'target_runs',
 'required_run_rate',
 'year']

In [68]:
venue_stats.to_csv("venue_stats.csv", index=False)

In [69]:
from sklearn.model_selection import GroupShuffleSplit

# Split so same match doesn't leak between train/test
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx1, test_idx1 = next(gss1.split(data_inn1[features_inn1], data_inn1["batting_team_wins"], groups=data_inn1["match_id"]))

gss2 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx2, test_idx2 = next(gss2.split(data_inn2[features_inn2], data_inn2["batting_team_wins"], groups=data_inn2["match_id"]))

X1_train, X1_test = data_inn1[features_inn1].iloc[train_idx1], data_inn1[features_inn1].iloc[test_idx1]
y1_train, y1_test = data_inn1["batting_team_wins"].iloc[train_idx1], data_inn1["batting_team_wins"].iloc[test_idx1]

X2_train, X2_test = data_inn2[features_inn2].iloc[train_idx2], data_inn2[features_inn2].iloc[test_idx2]
y2_train, y2_test = data_inn2["batting_team_wins"].iloc[train_idx2], data_inn2["batting_team_wins"].iloc[test_idx2]

print("Train size:", X1_train.shape, "Test size:", X1_test.shape)
print("Train size:", X2_train.shape, "Test size:", X2_test.shape)

Train size: (439135, 17) Test size: (109610, 17)
Train size: (374039, 13) Test size: (94765, 13)


In [70]:
from lightgbm import LGBMClassifier

# First innings model
model_inn1 = LGBMClassifier(
    objective="binary",
    learning_rate=0.002,
    num_leaves=31,
    n_estimators=1200,
    random_state=42
)

model_inn1.fit(
    X1_train, y1_train,
    eval_set=[(X1_test, y1_test)],
    eval_metric="auc",
)


[LightGBM] [Info] Number of positive: 218554, number of negative: 220581
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003245 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2333
[LightGBM] [Info] Number of data points in the train set: 439135, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497692 -> initscore=-0.009232
[LightGBM] [Info] Start training from score -0.009232


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.002
,n_estimators,1200
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [71]:
# Second innings model
model_inn2 = LGBMClassifier(
    objective="binary",
    learning_rate=0.002,
    num_leaves=31,
    n_estimators=1000,
    random_state=42
)

model_inn2.fit(
    X2_train, y2_train,
    eval_set=[(X2_test, y2_test)],
    eval_metric="auc",
)

[LightGBM] [Info] Number of positive: 169417, number of negative: 204622
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1709
[LightGBM] [Info] Number of data points in the train set: 374039, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.452939 -> initscore=-0.188801
[LightGBM] [Info] Start training from score -0.188801


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.002
,n_estimators,1000
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [72]:
from sklearn.metrics import roc_auc_score, accuracy_score

# First innings
y1_pred_proba = model_inn1.predict_proba(X1_test)[:, 1]
y1_pred = (y1_pred_proba >= 0.5).astype(int)
print("First Innings AUC:", roc_auc_score(y1_test, y1_pred_proba))
print("First Innings Accuracy:", accuracy_score(y1_test, y1_pred))

# Second innings
y2_pred_proba = model_inn2.predict_proba(X2_test)[:, 1]
y2_pred = (y2_pred_proba >= 0.5).astype(int)
print("Second Innings AUC:", roc_auc_score(y2_test, y2_pred_proba))
print("Second Innings Accuracy:", accuracy_score(y2_test, y2_pred))

First Innings AUC: 0.7894081299348782
First Innings Accuracy: 0.7147523036219323
Second Innings AUC: 0.9123204485598415
Second Innings Accuracy: 0.8279428058882499


In [73]:
import pickle

model_bundle = {
    "model_inn1": model_inn1,
    "model_inn2": model_inn2,
    "label_encoders": label_encoders,
    "features_inn1": features_inn1,
    "features_inn2": features_inn2
}

with open("pipe_new.pkl", "wb") as f:
    pickle.dump(model_bundle, f)

print("✅ Saved two-model setup to pipe_new.pkl")


✅ Saved two-model setup to pipe_new.pkl
