In [None]:
import os
import sys
from pathlib import Path

PROJECT_DIR = Path(os.getcwd()).parent
sys.path.insert(0, str(PROJECT_DIR))

print("PROJECT_DIR:", PROJECT_DIR)


PROJECT_DIR: c:\Users\Abhishek Karyagol\OneDrive\TaÌ€i liÃªÌ£u\infosys\AI_Cricket_Player_Performance_Prediction


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib

from src.config import PROCESSED_DIR, PROJECT_DIR


In [3]:
data_path = PROCESSED_DIR / "batsman_match_features.csv"
df = pd.read_csv(data_path)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (16515, 14)


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,548346,A Ashish Reddy,10,10,2012-04-29,2012,Wankhede Stadium,Mumbai Indians,Deccan Chargers,Mumbai Indians,0.0,0.0,0.0,0.0
1,548352,A Ashish Reddy,3,3,2012-05-04,2012,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,Chennai Super Kings,10.0,10.0,10.0,0.0
2,548359,A Ashish Reddy,8,8,2012-05-08,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Kings XI Punjab,Kings XI Punjab,6.5,6.5,6.5,0.0
3,548373,A Ashish Reddy,10,4,2012-05-18,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Rajasthan Royals,Deccan Chargers,7.0,7.0,7.0,8.0
4,548376,A Ashish Reddy,4,5,2012-05-20,2012,"Rajiv Gandhi International Stadium, Uppal",Deccan Chargers,Royal Challengers Bangalore,Deccan Chargers,7.75,7.75,7.75,9.0


In [4]:
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df = df.dropna(subset=["date"])  # drop rows where date missing

print("After date cleanup:", df.shape)
df[["date", "batter", "runs"]].head()


After date cleanup: (16515, 14)


Unnamed: 0,date,batter,runs
0,2012-04-29,A Ashish Reddy,10
1,2012-05-04,A Ashish Reddy,3
2,2012-05-08,A Ashish Reddy,8
3,2012-05-18,A Ashish Reddy,10
4,2012-05-20,A Ashish Reddy,4


In [5]:
df = df.sort_values("date").reset_index(drop=True)
df.tail()


Unnamed: 0,match_id,batter,runs,balls_faced,date,season,venue,team1,team2,winner,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
16510,1426312,VR Iyer,52,27,2024-05-26,2024,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,38.0,26.1,26.541667,6.0
16511,1426312,H Klaasen,16,17,2024-05-26,2024,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,33.6,28.6,31.516129,29.0
16512,1426312,TM Head,0,1,2024-05-26,2024,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,34.2,43.4,32.166667,23.5
16513,1426312,RA Tripathi,9,16,2024-05-26,2024,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,31.2,25.9,24.206522,28.2
16514,1426312,Nithish Kumar Reddy,13,10,2024-05-26,2024,"MA Chidambaram Stadium, Chepauk, Chennai",Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,29.4,29.0,29.0,10.0


In [6]:
split_index = int(len(df) * 0.8)

train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

print("Train date range:", train_df["date"].min(), "->", train_df["date"].max())
print("Test date range:", test_df["date"].min(), "->", test_df["date"].max())


Train shape: (13212, 14)
Test shape: (3303, 14)
Train date range: 2008-04-18 00:00:00 -> 2022-04-02 00:00:00
Test date range: 2022-04-02 00:00:00 -> 2024-05-26 00:00:00


In [8]:
y_true = test_df["runs"].values
y_pred_baseline = test_df["runs_last_10_avg"].values

mae = mean_absolute_error(y_true, y_pred_baseline)
rmse = np.sqrt(mean_squared_error(y_true, y_pred_baseline))
r2 = r2_score(y_true, y_pred_baseline)

print("ðŸ“Œ BASELINE (runs_last_10_avg)")
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)


ðŸ“Œ BASELINE (runs_last_10_avg)
MAE: 16.295270653422335
RMSE: 21.856160098021196
R2: 0.058503043907369046


In [9]:
target = "runs"

feature_cols = [
    "batter",
    "venue",
    "team1",
    "team2",
    "runs_last_5_avg",
    "runs_last_10_avg",
    "career_runs_avg",
    "venue_runs_avg",
]

X_train = train_df[feature_cols]
y_train = train_df[target]

X_test = test_df[feature_cols]
y_test = test_df[target]

X_train.head()


Unnamed: 0,batter,venue,team1,team2,runs_last_5_avg,runs_last_10_avg,career_runs_avg,venue_runs_avg
0,BB McCullum,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,0.0,0.0,0.0,0.0
1,P Kumar,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,0.0,0.0,0.0,0.0
2,Z Khan,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,0.0,0.0,0.0,0.0
3,CL White,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,0.0,0.0,0.0,0.0
4,SB Joshi,M Chinnaswamy Stadium,Royal Challengers Bangalore,Kolkata Knight Riders,0.0,0.0,0.0,0.0


In [10]:
cat_cols = ["batter", "venue", "team1", "team2"]
num_cols = ["runs_last_5_avg", "runs_last_10_avg", "career_runs_avg", "venue_runs_avg"]

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)


In [11]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", rf_model)
])

model.fit(X_train, y_train)
print("âœ… RandomForest training complete")


âœ… RandomForest training complete


In [12]:
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("ðŸ”¥ RANDOM FOREST RESULTS")
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)


ðŸ”¥ RANDOM FOREST RESULTS
MAE: 15.8889706327581
RMSE: 22.00582927753341
R2: 0.04556430655783017


In [13]:
baseline_mae = mean_absolute_error(y_test, test_df["runs_last_10_avg"])
baseline_rmse = np.sqrt(mean_squared_error(y_test, test_df["runs_last_10_avg"]))
baseline_r2 = r2_score(y_test, test_df["runs_last_10_avg"])

results = pd.DataFrame({
    "Model": ["Baseline (runs_last_10_avg)", "RandomForest"],
    "MAE": [baseline_mae, mae],
    "RMSE": [baseline_rmse, rmse],
    "R2": [baseline_r2, r2]
})

results


Unnamed: 0,Model,MAE,RMSE,R2
0,Baseline (runs_last_10_avg),16.295271,21.85616,0.058503
1,RandomForest,15.888971,22.005829,0.045564


In [14]:
models_dir = PROJECT_DIR / "models"
models_dir.mkdir(exist_ok=True)

model_path = models_dir / "rf_runs_model.joblib"
joblib.dump(model, model_path)

print("âœ… Model saved to:", model_path)


âœ… Model saved to: C:\Users\Abhishek Karyagol\OneDrive\TaÌ€i liÃªÌ£u\infosys\AI_Cricket_Player_Performance_Prediction\models\rf_runs_model.joblib
