In [1]:
import sys
print(sys.executable)


C:\Program Files\Python310\python.exe


In [2]:
import sys
! "C:\Program Files\Python310\python.exe" -m pip install lightgbm



Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: C:\Program Files\Python310\python.exe -m pip install --upgrade pip





In [3]:
# Import required libraries for data handling, evaluation metrics and regression models

import pandas as pd
import numpy as np
import joblib

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [4]:
# Load the processed dataset into dataframe and display first few rows for verification

df = pd.read_csv("dataset.csv", low_memory=False)
print(df.head())


       id          batter  season  match_date             venue  \
0  548346  A Ashish Reddy    2012  2012-04-29  Wankhede Stadium   
1  548346  A Ashish Reddy    2012  2012-04-29  Wankhede Stadium   
2  548346  A Ashish Reddy    2012  2012-04-29  Wankhede Stadium   
3  548346  A Ashish Reddy    2012  2012-04-29  Wankhede Stadium   
4  548346  A Ashish Reddy    2012  2012-04-29  Wankhede Stadium   

     bowling_team  match_runs  rolling_avg_5  pvt_avg         bowler  \
0  Mumbai Indians        10.0           10.0     1.08        A Nehra   
1  Mumbai Indians        10.0           10.0     1.08       AB Dinda   
2  Mumbai Indians        10.0           10.0     1.08     AD Mathews   
3  Mumbai Indians        10.0           10.0     1.08     AD Russell   
4  Mumbai Indians        10.0           10.0     1.08  Anureet Singh   

    pvp_avg  career_avg  next_match_runs batting_team  match_wickets  \
0  0.777778    1.428571             10.0          NaN            NaN   
1  1.285714    1.428

In [5]:
print(df.columns.tolist())


['id', 'batter', 'season', 'match_date', 'venue', 'bowling_team', 'match_runs', 'rolling_avg_5', 'pvt_avg', 'bowler', 'pvp_avg', 'career_avg', 'next_match_runs', 'batting_team', 'match_wickets', 'rolling_wkt_5', 'career_wkts', 'next_match_wickets']


In [6]:
# Select input features and target variable for predicting next match runs
X = df[['rolling_avg_5', 'pvt_avg', 'career_avg']]
y = df['next_match_runs']


In [7]:
# Split data sequentially into 80% training and 20% testing to preserve time order

split = int(len(df) * 0.8)

X_train = X[:split]
X_test  = X[split:]

y_train = y[:split]
y_test  = y[split:]

print("Train:", X_train.shape)
print("Test :", X_test.shape)
print(X_train.head())



Train: (1657713, 3)
Test : (414429, 3)
   rolling_avg_5  pvt_avg  career_avg
0           10.0     1.08    1.428571
1           10.0     1.08    1.428571
2           10.0     1.08    1.428571
3           10.0     1.08    1.428571
4           10.0     1.08    1.428571


In [8]:
# Check number of missing values in each input feature before model training
print(X.isnull().sum())


rolling_avg_5    12448
pvt_avg          12448
career_avg       12448
dtype: int64


In [9]:
# Replace any remaining missing feature and target values with 0 to avoid model errors
X = X.fillna(0)
y = y.fillna(0)



In [10]:
split = int(len(df) * 0.8)

X_train = X[:split]
X_test  = X[split:]

y_train = y[:split]
y_test  = y[split:]


In [11]:
baseline_pred = X_test['rolling_avg_5']
baseline_pred = baseline_pred.fillna(0)


In [12]:
# Use rolling average of last 5 innings as simple baseline prediction and check for nulls

temp = pd.DataFrame({
    "y_true": y_test,
    "y_pred": baseline_pred
})

# Drop rows where ANY is NaN
temp = temp.dropna()

y_test_clean = temp["y_true"]
baseline_clean = temp["y_pred"]

print("Rows after removing NaN:", len(temp))


Rows after removing NaN: 414429


In [13]:
# Import evaluation metrics and numpy for error calculations

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


In [14]:
# Compute baseline error metrics to compare ML models against simple rolling average

baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_pred))
baseline_mae  = mean_absolute_error(y_test, baseline_pred)
baseline_r2   = r2_score(y_test, baseline_pred)


print("===== Baseline Performance =====")
print("RMSE:", baseline_rmse)
print("MAE :", baseline_mae)
print("R2  :", baseline_r2)


===== Baseline Performance =====
RMSE: 19.439248910389875
MAE : 14.513658230159251
R2  : 0.3160524512862455


In [15]:
# Remove any remaining NaN from target and baseline
y = y.fillna(0)
baseline_pred = baseline_pred.fillna(0)


In [16]:
# Train Random Forest with 200 trees and depth 8
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=8,
    random_state=42
)

rf.fit(X_train, y_train)

# Predict using Random Forest
rf_pred = rf.predict(X_test)


In [17]:
# Evaluate Random Forest model
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae  = mean_absolute_error(y_test, rf_pred)
rf_r2   = r2_score(y_test, rf_pred)

print("===== Random Forest Performance =====")
print("RMSE:", rf_rmse)
print("MAE :", rf_mae)
print("R2  :", rf_r2)


===== Random Forest Performance =====
RMSE: 19.5760929396015
MAE : 14.399778469441909
R2  : 0.30638915923263266


In [18]:
# Train XGBoost regressor for comparison
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)

xgb.fit(X_train, y_train)

# Predict using XGBoost
xgb_pred = xgb.predict(X_test)


In [19]:
imp = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

print(imp)


         Feature  Importance
0  rolling_avg_5    0.829935
1        pvt_avg    0.128591
2     career_avg    0.041474


In [20]:
# Evaluate XGBoost model
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
xgb_mae  = mean_absolute_error(y_test, xgb_pred)
xgb_r2   = r2_score(y_test, xgb_pred)

print("===== XGBoost Performance =====")
print("RMSE:", xgb_rmse)
print("MAE :", xgb_mae)
print("R2  :", xgb_r2)


===== XGBoost Performance =====
RMSE: 19.388576645022813
MAE : 14.269668271653547
R2  : 0.3196134943535459


In [21]:
# Compare performance of all models in one place
print("===== MODEL COMPARISON =====")
print("Baseline R2 :", baseline_r2)
print("RandomForest R2 :", rf_r2)
print("XGBoost R2 :", xgb_r2)


===== MODEL COMPARISON =====
Baseline R2 : 0.3160524512862455
RandomForest R2 : 0.30638915923263266
XGBoost R2 : 0.3196134943535459


In [22]:
# Choose best model based on R2 score
best_model = rf if rf_r2 > xgb_r2 else xgb
print("Best Model:", "Random Forest" if rf_r2 > xgb_r2 else "XGBoost")


Best Model: XGBoost


In [23]:
# Save trained best model for future use
import joblib
joblib.dump(best_model, "batsman_runs_predictor.pkl")


['batsman_runs_predictor.pkl']

In [24]:
import joblib

joblib.dump(rf, "batsman_rf_model.pkl")
print("Model Saved ✔")


Model Saved ✔


In [25]:
# Predict next match runs for first test sample
sample = X_test.iloc[0:1]
print("Predicted Runs:", best_model.predict(sample))


Predicted Runs: [28.157497]
