In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib
import seaborn as sns
import matplotlib.pyplot as plt


In [11]:
ml_df = pd.read_csv("../data/processed/ml_data.csv")
ml_df.head()


Unnamed: 0,match_type,winner,season_year,is_modern_ipl,team1_toss,team2_toss,team1_Deccan Chargers,team1_Delhi Capitals,team1_Delhi Daredevils,team1_Gujarat Lions,...,toss_winner_Mumbai Indians,toss_winner_Pune Warriors,toss_winner_Punjab Kings,toss_winner_Rajasthan Royals,toss_winner_Rising Pune Supergiant,toss_winner_Rising Pune Supergiants,toss_winner_Royal Challengers Bangalore,toss_winner_Royal Challengers Bengaluru,toss_winner_Sunrisers Hyderabad,toss_decision_field
0,4,8,2007,0,1,0,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,4,0,2007,0,0,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,3,2007,0,0,1,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
3,4,16,2007,0,1,0,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,4,8,2007,0,0,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
X = ml_df.drop('winner', axis=1)
y = ml_df['winner']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [13]:
X_train.dtypes


match_type                                 int64
season_year                                int64
is_modern_ipl                              int64
team1_toss                                 int64
team2_toss                                 int64
                                           ...  
toss_winner_Rising Pune Supergiants         bool
toss_winner_Royal Challengers Bangalore     bool
toss_winner_Royal Challengers Bengaluru     bool
toss_winner_Sunrisers Hyderabad             bool
toss_decision_field                         bool
Length: 117, dtype: object

In [14]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.5504587155963303

Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.43      0.53        28
           1       0.00      0.00      0.00         6
           2       0.56      0.56      0.56         9
           3       0.56      0.38      0.45        13
           4       0.00      0.00      0.00         3
           5       0.62      0.83      0.71         6
           6       0.37      0.39      0.38        18
           7       0.00      0.00      0.00         1
           8       0.50      0.69      0.58        26
           9       0.62      1.00      0.77         5
          10       0.58      0.76      0.66        29
          11       0.00      0.00      0.00         2
          12       0.20      0.20      0.20         5
          13       0.48      0.45      0.47        22
          14       1.00      0.50      0.67         2
          15       0.00      0.00      0.00         1
       

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.44954128440366975

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.36      0.38        28
           1       0.00      0.00      0.00         6
           2       0.33      0.11      0.17         9
           3       0.50      0.46      0.48        13
           4       0.00      0.00      0.00         3
           5       0.56      0.83      0.67         6
           6       0.31      0.44      0.36        18
           7       0.00      0.00      0.00         1
           8       0.46      0.62      0.52        26
           9       0.38      0.60      0.46         5
          10       0.46      0.45      0.46        29
          11       0.00      0.00      0.00         2
          12       0.25      0.20      0.22         5
          13       0.50      0.55      0.52        22
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         1
          16

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gb))


In [None]:
import joblib
joblib.dump(gb, "../models/win_predictor.pkl")
print("Model saved at ../models/win_predictor.pkl")


Model saved at ../models/win_predictor.pkl


In [None]:
import json

with open("../models/feature_columns.json", "w") as f:
    json.dump(X_train.columns.tolist(), f, indent=4)

print("feature_columns.json saved.")

feature_columns.json saved.
