In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Use raw string (r"") to avoid issues with backslashes
match_data_path = r"C:\Users\revan\Desktop\Assignments\Trimester 6\Sports Analytics\ODI Men's Cricket Match Data (2002–2023) data\ODI_Match_Data.csv"
match_info_path = r"C:\Users\revan\Desktop\Assignments\Trimester 6\Sports Analytics\ODI Men's Cricket Match Data (2002–2023) data\ODI_Match_info.csv"

# Load the CSV files
match_data = pd.read_csv(match_data_path, low_memory=False)
match_info = pd.read_csv(match_info_path, low_memory=False)

# Preview the data
print("Match Data:", match_data.shape)
print("Match Info:", match_info.shape)


Match Data: (1265103, 23)
Match Info: (2379, 18)


In [3]:
print("Match Data Columns:")
print(match_data.columns.tolist())

Match Data Columns:
['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed', 'cricsheet_id']


In [4]:
print("\nMatch Info Columns:")
print(match_info.columns.tolist())


Match Info Columns:
['id', 'season', 'city', 'date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match', 'venue', 'umpire1', 'umpire2', 'umpire3']


In [5]:
# Merge datasets
merged_df = pd.merge(match_data, match_info, left_on='match_id', right_on='id', how='left')

# Drop duplicate/conflicting columns
merged_df.drop(columns=['season_y', 'venue_y', 'id'], inplace=True)

# Optional: Rename for clarity
merged_df.rename(columns={'season_x': 'season', 'venue_x': 'venue'}, inplace=True)

# Sanity check
print(merged_df.shape)
merged_df.head()


(1265103, 38)


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,umpire1,umpire2,umpire3
0,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.1,India,Australia,RD Gaikwad,Shubman Gill,...,field,D/L,1,India,99,0,SS Iyer,J Madanagopal,HDPK Dharmasena,KN Ananthapadmanabhan
1,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.2,India,Australia,RD Gaikwad,Shubman Gill,...,field,D/L,1,India,99,0,SS Iyer,J Madanagopal,HDPK Dharmasena,KN Ananthapadmanabhan
2,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.3,India,Australia,RD Gaikwad,Shubman Gill,...,field,D/L,1,India,99,0,SS Iyer,J Madanagopal,HDPK Dharmasena,KN Ananthapadmanabhan
3,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.4,India,Australia,RD Gaikwad,Shubman Gill,...,field,D/L,1,India,99,0,SS Iyer,J Madanagopal,HDPK Dharmasena,KN Ananthapadmanabhan
4,1389389,2023/24,2023-09-24,"Holkar Cricket Stadium, Indore",1,0.5,India,Australia,RD Gaikwad,Shubman Gill,...,field,D/L,1,India,99,0,SS Iyer,J Madanagopal,HDPK Dharmasena,KN Ananthapadmanabhan


In [6]:
print(merged_df.columns.tolist())


['match_id', 'season', 'start_date', 'venue', 'innings', 'ball', 'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed', 'cricsheet_id', 'city', 'date', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match', 'umpire1', 'umpire2', 'umpire3']


### Batsman Form

In [7]:
# Step 1: Create batsman-match dataset
batsman_match_df = (
    merged_df.groupby(['match_id', 'striker'])['runs_off_bat']
    .sum()
    .reset_index()
    .rename(columns={'striker': 'batsman', 'runs_off_bat': 'total_runs'})
)


In [8]:
# Step 2: Extract one row per match (match-level features)
match_context = merged_df.drop_duplicates(subset='match_id')[
    ['match_id', 'season', 'city', 'venue', 'date', 'batting_team', 'bowling_team', 
     'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied']
]


In [9]:
# Step 3: Merge aggregated batsman performance with match context
batsman_full_df = pd.merge(batsman_match_df, match_context, on='match_id', how='left')

In [10]:
print(batsman_full_df.columns.tolist())

['match_id', 'batsman', 'total_runs', 'season', 'city', 'venue', 'date', 'batting_team', 'bowling_team', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied']


In [34]:
# Attach match date
match_dates = merged_df[['match_id', 'date']].drop_duplicates()
batsman_match_df = pd.merge(batsman_match_df, match_dates, on='match_id', how='left')

# Ensure date is datetime
batsman_match_df['date'] = pd.to_datetime(batsman_match_df['date'])


In [36]:
# Sort by batsman and date
batsman_match_df = batsman_match_df.sort_values(by=['batsman', 'date'])

# Rolling averages (form over last 3 and 5 matches)
batsman_match_df['form_avg_3'] = (
    batsman_match_df.groupby('batsman')['total_runs']
    .shift(1)  # exclude current match
    .rolling(window=3)
    .mean()
)

batsman_match_df['form_avg_5'] = (
    batsman_match_df.groupby('batsman')['total_runs']
    .shift(1)
    .rolling(window=5)
    .mean()
)


In [38]:
# Merge by match_id + batsman
batsman_full_df = pd.merge(
    batsman_full_df,
    batsman_match_df[['match_id', 'batsman', 'form_avg_3', 'form_avg_5']],
    on=['match_id', 'batsman'],
    how='left'
)


In [40]:
print(batsman_full_df.columns.tolist())

['match_id', 'batsman', 'total_runs', 'season', 'city', 'venue', 'date', 'batting_team', 'bowling_team', 'team1', 'team2', 'toss_winner', 'toss_decision', 'result', 'dl_applied', 'form_avg_3', 'form_avg_5']


### 🔧 Feature Engineering: Categorical Encoding

In [12]:
# Convert season like '2018/19' → 2018 (numeric)
batsman_full_df['season'] = batsman_full_df['season'].apply(lambda x: str(x).split('/')[0]).astype(int)


In [42]:
categorical_cols = [
    'batsman',
    'batting_team',
    'bowling_team',
    'venue',
    'city',
    'toss_winner',
    'toss_decision'
]

In [44]:
# Encode
encoded_df = pd.get_dummies(batsman_full_df, columns=categorical_cols, drop_first=True)

# Add back numerical columns
encoded_df['season'] = batsman_full_df['season']
encoded_df['form_avg_3'] = batsman_full_df['form_avg_3']
encoded_df['form_avg_5'] = batsman_full_df['form_avg_5']

### 📊 Model Training: Predicting Batsman Runs (Random Forest Regressor)

In [54]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.5 MB 1.5 MB/s eta 0:00:01
   ----- ---------------------------------- 0.2/1.5 MB 1.9 MB/s eta 0:00:01
   -------- ------------------------------- 0.3/1.5 MB 2.5 MB/s eta 0:00:01
   --------------- ------------------------ 0.6/1.5 MB 3.2 MB/s eta 0:00:01
   ----------------------- ---------------- 0.9/1.5 MB 3.9 MB/s eta 0:00:01
   -------------------------------- ------- 1.2/1.5 MB 4.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.6 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [46]:
# Drop irrelevant or ID-based columns
X = encoded_df.drop(columns=['total_runs', 'match_id', 'date', 'team1', 'team2', 'result'])
y = encoded_df['total_runs']

# Drop rows with missing form data (only first few matches per batsman will have NaNs)
X = X.dropna()
y = y.loc[X.index]



In [48]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)


In [49]:
# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"📊 MAE: {mae:.2f}")
print(f"📉 RMSE: {rmse:.2f}")
print(f"📈 R² Score: {r2:.2f}")

📊 MAE: 19.77
📉 RMSE: 27.98
📈 R² Score: 0.04




In [60]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

# ✅ Step 1: Sanitize column names
X_train.columns = X_train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
X_test.columns = X_test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# ✅ Step 2: Remove duplicated columns (core fix)
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

# ✅ Step 3: Train the model
start = time.time()

model = LGBMRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

end = time.time()

# ✅ Step 4: Predict and evaluate
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"📊 MAE: {mae:.2f}")
print(f"📉 RMSE: {rmse:.2f}")
print(f"📈 R² Score: {r2:.2f}")
print(f"⏱️ Training time: {end - start:.2f} seconds")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2163
[LightGBM] [Info] Number of data points in the train set: 27511, number of used features: 818
[LightGBM] [Info] Start training from score 25.707753
📊 MAE: 20.07
📉 RMSE: 27.22
📈 R² Score: 0.09
⏱️ Training time: 0.88 seconds


