In [187]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
import lightgbm as lgb
import shap

In [188]:
BASE_DIR = Path().resolve().parent

DATA_DIR = BASE_DIR / "data" / "processed"
MODEL_DIR = BASE_DIR / "models"

MODEL_DIR.mkdir(exist_ok=True)

print("Base:", BASE_DIR)

Base: C:\Users\Abhishek Karyagol\OneDrive\TaÃÄi li√™Ã£u\AI_Cricket_Player_Performance_Prediction


In [189]:
batter_df = pd.read_csv(DATA_DIR / "dataset.csv")
bowler_df = pd.read_csv(DATA_DIR / "bowler_dataset.csv")

batter_df["date"] = pd.to_datetime(batter_df["date"])
bowler_df["date"] = pd.to_datetime(bowler_df["date"])

print("Batter:", batter_df.shape)
print("Bowler:", bowler_df.shape)

Batter: (15842, 17)
Bowler: (12448, 15)


In [192]:
batter_df = batter_df.sort_values("date").reset_index(drop=True)
bowler_df = bowler_df.sort_values("date").reset_index(drop=True)

print("Sorting complete ‚úÖ")
print("Batter shape:", batter_df.shape)
print("Bowler shape:", bowler_df.shape)

Sorting complete ‚úÖ
Batter shape: (15842, 17)
Bowler shape: (12448, 15)


In [193]:
# BATTER FEATURES

CAT_BAT = ["batter","venue","team1","team2"]

NUM_BAT = [
    "runs_last_5_avg",
    "runs_last_10_avg",
    "career_runs_avg",
    "career_sr",
    "venue_runs_avg",
    "pvt_runs_avg",
    "pvp_runs_avg"
]

TARGET_BAT = "target_next_runs"


# BOWLER FEATURES

NUM_WKT = [
    "overs",
    "runs",
    "economy",
    "wickets_last_5",
    "career_wickets_avg",
    "venue_wickets_avg"
]

TARGET_WKT = "target_next_wickets"
print("Categorical:", CAT_BAT)
print("Numerical (Batter):", NUM_BAT)
print("Numerical (Bowler):", NUM_WKT)

Categorical: ['batter', 'venue', 'team1', 'team2']
Numerical (Batter): ['runs_last_5_avg', 'runs_last_10_avg', 'career_runs_avg', 'career_sr', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg']
Numerical (Bowler): ['overs', 'runs', 'economy', 'wickets_last_5', 'career_wickets_avg', 'venue_wickets_avg']


In [194]:
# Batter
split_b = int(len(batter_df)*0.8)

train_b = batter_df.iloc[:split_b]
test_b  = batter_df.iloc[split_b:]


# Bowler
split_w = int(len(bowler_df)*0.8)

train_w = bowler_df.iloc[:split_w]
test_w  = bowler_df.iloc[split_w:]

In [196]:
print("Train Batter:", train_b.shape)
print("Test Batter :", test_b.shape)

print("Train Bowler:", train_w.shape)
print("Test Bowler :", test_w.shape)

Train Batter: (12673, 17)
Test Batter : (3169, 17)
Train Bowler: (9958, 15)
Test Bowler : (2490, 15)


In [195]:
# Batter baseline (10-match avg)
baseline_b = test_b["runs_last_10_avg"]

# Bowler baseline
baseline_w = test_w["wickets_last_5"]

In [197]:
print("Baseline Batter:")
print(baseline_b.head())

print("\nBaseline Bowler:")
print(baseline_w.head())

Baseline Batter:
12673    13.3
12674    20.3
12675    23.3
12676    21.9
12677    26.0
Name: runs_last_10_avg, dtype: float64

Baseline Bowler:
9958    1.0
9959    0.2
9960    1.2
9961    0.8
9962    2.0
Name: wickets_last_5, dtype: float64


In [198]:
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_BAT),
    ("num", StandardScaler(), NUM_BAT)
])

print("‚úÖ Preprocessor Created")

print("\nCategorical Features:", CAT_BAT)
print("Numerical Features:", NUM_BAT)

‚úÖ Preprocessor Created

Categorical Features: ['batter', 'venue', 'team1', 'team2']
Numerical Features: ['runs_last_5_avg', 'runs_last_10_avg', 'career_runs_avg', 'career_sr', 'venue_runs_avg', 'pvt_runs_avg', 'pvp_runs_avg']


In [199]:
rf_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

print("‚è≥ Training Random Forest...")

rf_pipe.fit(
    train_b[CAT_BAT + NUM_BAT],
    train_b[TARGET_BAT]
)

print("‚úÖ Random Forest Trained")

‚è≥ Training Random Forest...
‚úÖ Random Forest Trained


In [200]:
xgb_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        random_state=42
    ))
])

print("‚è≥ Training XGBoost...")

xgb_pipe.fit(
    train_b[CAT_BAT + NUM_BAT],
    train_b[TARGET_BAT]
)

print("‚úÖ XGBoost Trained")

‚è≥ Training XGBoost...
‚úÖ XGBoost Trained


In [201]:
lgb_pipe = Pipeline([
    ("prep", preprocessor),
    ("model", lgb.LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        random_state=42
    ))
])

print("‚è≥ Training LightGBM...")

lgb_pipe.fit(
    train_b[CAT_BAT + NUM_BAT],
    train_b[TARGET_BAT]
)

print("‚úÖ LightGBM Trained")

‚è≥ Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2266
[LightGBM] [Info] Number of data points in the train set: 12673, number of used features: 248
[LightGBM] [Info] Start training from score 20.065888
‚úÖ LightGBM Trained


In [203]:
def eval_model(name, y_true, y_pred):

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)

    print(f"\nüìä {name} Results")
    print("RMSE:", rmse)
    print("MAE :", mae)
    print("R2  :", r2)

In [204]:
print("üìà Evaluating Models...")

# Baseline
eval_model(
    "Baseline (10-match avg)",
    test_b[TARGET_BAT],
    baseline_b
)


# RF
print("‚û°Ô∏è Predicting RF...")
pred_rf = rf_pipe.predict(test_b[CAT_BAT + NUM_BAT])

eval_model("Random Forest", test_b[TARGET_BAT], pred_rf)


# XGB
print("‚û°Ô∏è Predicting XGB...")
pred_xgb = xgb_pipe.predict(test_b[CAT_BAT + NUM_BAT])

eval_model("XGBoost", test_b[TARGET_BAT], pred_xgb)


# LGBM
print("‚û°Ô∏è Predicting LGBM...")
pred_lgb = lgb_pipe.predict(test_b[CAT_BAT + NUM_BAT])

eval_model("LightGBM", test_b[TARGET_BAT], pred_lgb)

print("‚úÖ Evaluation Complete")

üìà Evaluating Models...

üìä Baseline (10-match avg) Results
RMSE: 22.116833073240443
MAE : 16.630627181976188
R2  : 0.053143169108287314
‚û°Ô∏è Predicting RF...

üìä Random Forest Results
RMSE: 21.868702864801
MAE : 15.988090880403913
R2  : 0.0742696890890695
‚û°Ô∏è Predicting XGB...

üìä XGBoost Results
RMSE: 21.798720348119822
MAE : 16.15637483909778
R2  : 0.08018510871639783
‚û°Ô∏è Predicting LGBM...

üìä LightGBM Results
RMSE: 21.721563150655996
MAE : 16.015751916950826
R2  : 0.08668500754416797
‚úÖ Evaluation Complete




In [205]:
best_model = xgb_pipe   # Change if others better

joblib.dump(
    best_model,
    MODEL_DIR / "xgb_runs_model.joblib"
)

print("üíæ Best Model Saved: xgb_runs_model.joblib")

üíæ Best Model Saved: xgb_runs_model.joblib


In [206]:
Xw_train = train_w[NUM_WKT]
yw_train = train_w[TARGET_WKT]

Xw_test = test_w[NUM_WKT]
yw_test = test_w[TARGET_WKT]


xgb_wkt = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    random_state=42
)

print("‚è≥ Training Wickets Model...")

xgb_wkt.fit(Xw_train, yw_train)

print("‚úÖ Wickets Model Trained")


pred_w = xgb_wkt.predict(Xw_test)

eval_model("Wickets XGB", yw_test, pred_w)


joblib.dump(
    xgb_wkt,
    MODEL_DIR / "xgb_wickets_model.joblib"
)

print("üíæ Wickets Model Saved")

‚è≥ Training Wickets Model...
‚úÖ Wickets Model Trained

üìä Wickets XGB Results
RMSE: 1.0887956196479545
MAE : 0.8525750793112689
R2  : 0.005482629862046284
üíæ Wickets Model Saved
