In [1]:
# Predicting Batsman Runs in ODI Cricket Matches Using Machine Learning

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMRegressor
import time

In [2]:
# 1. Load Data
match_data_path = r"C:\\Users\\revan\\Desktop\\Assignments\\Trimester 6\\Sports Analytics\\ODI Men's Cricket Match Data (2002–2023) data\\ODI_Match_Data.csv"
match_info_path = r"C:\\Users\\revan\\Desktop\\Assignments\\Trimester 6\\Sports Analytics\\ODI Men's Cricket Match Data (2002–2023) data\\ODI_Match_info.csv"

match_data = pd.read_csv(match_data_path, low_memory=False)
match_info = pd.read_csv(match_info_path, low_memory=False)

In [3]:
# 2. Merge Data
merged_df = pd.merge(match_data, match_info, left_on='match_id', right_on='id', how='left')
merged_df.drop(columns=['season_y', 'venue_y', 'id'], inplace=True)
merged_df.rename(columns={'season_x': 'season', 'venue_x': 'venue'}, inplace=True)

In [4]:
# 3. Total Runs per Batsman per Match
batsman_match_df = (
    merged_df.groupby(['match_id', 'striker'])['runs_off_bat']
    .sum()
    .reset_index()
    .rename(columns={'striker': 'batsman', 'runs_off_bat': 'total_runs'})
)

In [5]:
# 4. Match Context Merge
match_context = merged_df.drop_duplicates(subset='match_id')[
    ['match_id', 'season', 'city', 'venue', 'date', 'batting_team', 'bowling_team',
     'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied']
]

batsman_full_df = pd.merge(batsman_match_df, match_context, on='match_id', how='left')

In [6]:
# 5. Add Form Features
match_dates = merged_df[['match_id', 'date']].drop_duplicates()
batsman_match_df = pd.merge(batsman_match_df, match_dates, on='match_id', how='left')
batsman_match_df['date'] = pd.to_datetime(batsman_match_df['date'])
batsman_match_df = batsman_match_df.sort_values(by=['batsman', 'date'])

batsman_match_df['form_avg_3'] = batsman_match_df.groupby('batsman')['total_runs'].shift(1).rolling(window=3).mean()
batsman_match_df['form_avg_5'] = batsman_match_df.groupby('batsman')['total_runs'].shift(1).rolling(window=5).mean()

batsman_full_df = pd.merge(
    batsman_full_df,
    batsman_match_df[['match_id', 'batsman', 'form_avg_3', 'form_avg_5']],
    on=['match_id', 'batsman'],
    how='left'
)

In [7]:
# 6. Add Batting Position
batting_orders = merged_df.groupby(['match_id', 'batting_team', 'striker']).first().reset_index()
batting_orders['batting_position'] = batting_orders.groupby(['match_id', 'batting_team']).cumcount() + 1

batsman_full_df = pd.merge(
    batsman_full_df,
    batting_orders[['match_id', 'striker', 'batting_position']],
    left_on=['match_id', 'batsman'],
    right_on=['match_id', 'striker'],
    how='left'
)
batsman_full_df.drop(columns='striker', inplace=True)

In [8]:
# 7. Add Innings Number
innings_df = merged_df[['match_id', 'striker', 'innings']].drop_duplicates()
batsman_full_df = pd.merge(
    batsman_full_df,
    innings_df,
    left_on=['match_id', 'batsman'],
    right_on=['match_id', 'striker'],
    how='left'
)
batsman_full_df.drop(columns='striker', inplace=True)

In [9]:
# 8. Add Match Pressure
match_targets = merged_df[merged_df['innings'] == 1].groupby('match_id')['runs_off_bat'].sum().reset_index()
match_targets.rename(columns={'runs_off_bat': '1st_innings_total'}, inplace=True)
batsman_full_df = pd.merge(batsman_full_df, match_targets, on='match_id', how='left')
batsman_full_df['chasing'] = (batsman_full_df['innings'] == 2).astype(int)
batsman_full_df['pressure_score'] = batsman_full_df['chasing'] * batsman_full_df['1st_innings_total']

In [10]:
# 9. Encode + Prepare Features
categorical_cols = [
    'batsman', 'batting_team', 'bowling_team', 'venue', 'city', 'toss_winner', 'toss_decision'
]

encoded_df = pd.get_dummies(batsman_full_df, columns=categorical_cols, drop_first=True)

encoded_df['season'] = batsman_full_df['season']
encoded_df['form_avg_3'] = batsman_full_df['form_avg_3']
encoded_df['form_avg_5'] = batsman_full_df['form_avg_5']
encoded_df['batting_position'] = batsman_full_df['batting_position']
encoded_df['innings'] = batsman_full_df['innings']
encoded_df['pressure_score'] = batsman_full_df['pressure_score']

In [11]:
# 10. Train-Test Split
X = encoded_df.drop(columns=['total_runs', 'match_id', 'date', 'team1', 'team2', 'result'])
y = encoded_df['total_runs']

X = X.dropna()
y = y.loc[X.index]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanitize + Deduplicate Columns
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

In [12]:
# 11. Train LightGBM Model
# Ensure all columns are float, int, or bool — fix 'season' in-place
X_train['season'] = X_train['season'].astype(str).str.extract(r'(\d{4})').astype(float)
X_test['season'] = X_test['season'].astype(str).str.extract(r'(\d{4})').astype(float)

start = time.time()
model = LGBMRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
end = time.time()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2680
[LightGBM] [Info] Number of data points in the train set: 27520, number of used features: 823
[LightGBM] [Info] Start training from score 25.642260


In [13]:
# 12. Evaluate Model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"\n📊 MAE: {mae:.2f}")
print(f"📉 RMSE: {rmse:.2f}")
print(f"📈 R² Score: {r2:.2f}")
print(f"⏱️ Training time: {end - start:.2f} seconds")


📊 MAE: 19.45
📉 RMSE: 26.41
📈 R² Score: 0.16
⏱️ Training time: 1.11 seconds




In [14]:
import joblib

# Save trained model
joblib.dump(model, "batsman_run_predictor.pkl")

# Pull metadata from the original dataframe BEFORE encoding
model_metadata = {
    "batsman_list": sorted(batsman_full_df['batsman'].unique().tolist()),
    "team_list": sorted(set(batsman_full_df['batting_team'].unique()).union(set(batsman_full_df['bowling_team'].unique()))),
    "venue_list": sorted(batsman_full_df['venue'].dropna().unique().tolist()),
    "city_list": sorted(batsman_full_df['city'].dropna().unique().tolist()),
    "categorical_features": ['batsman', 'batting_team', 'bowling_team', 'venue', 'city', 'toss_winner', 'toss_decision'],
    "feature_columns": X_train.columns.tolist()
}

joblib.dump(model_metadata, "model_metadata.pkl")


['model_metadata.pkl']