In [None]:
import pandas as pd
import joblib
import shap
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
#Verifying if the versions are compatible
import shap, xgboost

# Versions this notebook expects (taken from the original inline notes).
EXPECTED_VERSIONS = {"shap": "0.49.1", "xgboost": "1.7.6"}

# Report each installed version next to the expected one so a mismatch is
# visible at a glance instead of requiring a manual comparison.
for module in (shap, xgboost):
    installed = module.__version__
    expected = EXPECTED_VERSIONS[module.__name__]
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{module.__name__} {installed}: {status}")

In [None]:
#Reading the dataset
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv(r"C:\Users\asind\cricket_ai_project\data\processed\dataset.csv")

#Print the columns
# Listed only after the load: referencing df before it exists raises a
# NameError on a fresh kernel (Restart & Run All).
print(df.columns.tolist())

# Feature matrix (four average-based columns) and the regression target.
X = df[['rolling_avg_5','venue_avg','pvt_avg','career_avg']]
y = df['next_match_runs']

In [None]:
#Train-Test datasplit
# Ordered, unshuffled 80/20 split: rows before the cut-off train the models,
# rows from the cut-off onward are held out for testing.
split = int(len(df) * 0.8)

X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = y.iloc[:split]
y_test = y.iloc[split:]

In [None]:
#Baseline Prediction
# Naive baseline: reuse the rolling_avg_5 feature of each test row as its
# prediction.
baseline_pred = X_test.loc[:, 'rolling_avg_5']
print(baseline_pred)

In [None]:
#Baseline performance Metrics
# Score the naive baseline on the held-out rows; RMSE is the square root of
# the mean squared error.
baseline_rmse, baseline_mae, baseline_r2 = (
    np.sqrt(mean_squared_error(y_test, baseline_pred)),
    mean_absolute_error(y_test, baseline_pred),
    r2_score(y_test, baseline_pred),
)

for label, value in (
    ("Baseline RMSE:", baseline_rmse),
    ("Baseline MAE:", baseline_mae),
    ("Baseline r2:", baseline_r2),
):
    print(label, value)

In [None]:
#RandomForest Prediction
# 200 trees capped at depth 8, with a fixed random_state.
rf = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42)

# fit() returns the fitted estimator, so training and prediction are chained.
rf_pred = rf.fit(X_train, y_train).predict(X_test)

In [None]:
#XGBoost Prediction
# 300 boosting rounds at learning rate 0.05 and depth 5; each tree is built
# on 80% of the rows and 80% of the columns, with a fixed random_state.
xgb = XGBRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=5,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42,
)

# fit() returns the fitted estimator, so training and prediction are chained.
xgb_pred = xgb.fit(X_train, y_train).predict(X_test)

In [None]:
#LightGBM Prediction
# Same round count, learning rate and depth as the XGBoost model above.
# random_state is pinned to 42 to match the Random Forest and XGBoost models,
# so all three candidates are seeded consistently.
lgb = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)

lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)

In [None]:
#Evaluating three models
def evaluate(y_true, y_pred, name):
    """Print RMSE, MAE and R2 for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    name : str
        Heading printed above the metrics.

    Returns
    -------
    None
        The metrics are only written to stdout.
    """
    print(f"\n{name}")
    # Each metric is computed lazily, right before its line is printed.
    metric_table = (
        ("RMSE:", lambda: np.sqrt(mean_squared_error(y_true, y_pred))),
        ("MAE:", lambda: mean_absolute_error(y_true, y_pred)),
        ("R2:", lambda: r2_score(y_true, y_pred)),
    )
    for label, compute in metric_table:
        print(label, compute())

# Report the same three metrics for every fitted model, in training order.
for model_name, model_pred in (
    ("Random Forest", rf_pred),
    ("XGBoost", xgb_pred),
    ("LightGBM", lgb_pred),
):
    evaluate(y_test, model_pred, model_name)

In [None]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Candidate XGBoost settings: 3 depths x 2 learning rates x 2 round counts,
# i.e. 12 combinations, each scored with 3-fold cross-validation.
param_grid = {
    'max_depth': [3,5,7],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [200,300]
}

# NOTE(review): the train/test split is positional (first 80% / last 20%).
# If the rows are time-ordered, a TimeSeriesSplit may be a better fit for
# `cv` than plain 3-fold; confirm against the dataset.
grid = GridSearchCV(
    estimator=XGBRegressor(
        random_state=42,
        objective='reg:squarederror'
    ),
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',  # negated RMSE: higher is better
    refit=True,  # retrain the winning configuration on all of X_train
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best Parameters:",grid.best_params_)
# best_score_ is the negated RMSE, so flip the sign for reporting.
print("Best RMSE:",-grid.best_score_)

# With refit=True the search has already retrained the best configuration on
# the full training set, so best_estimator_ is ready to use; fitting it again
# would only repeat that training.
best_xgb = grid.best_estimator_

In [None]:
#SHAP for impact on model output
# Compute SHAP values for the held-out rows with the tuned model, then draw
# the summary plot of those values against the test features.
explainer = shap.TreeExplainer(model=best_xgb)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, features=X_test)

# Alternative (left disabled): build the explainer from the underlying
# booster rather than the scikit-learn wrapper.
#booster = best_xgb.get_booster()
#explainer = shap.TreeExplainer(booster)
#shap_values = explainer.shap_values(X_test)
#shap.summary_plot(shap_values, X_test)


In [None]:
#Saving the model
# Persist the tuned estimator with joblib; the file is written relative to
# the current working directory.
model_path = "xgb_model.joblib"
joblib.dump(best_xgb, model_path)
print("xgb_model.joblib saved successfully")