## Imports

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

ModuleNotFoundError: No module named 'tensorflow'

In [26]:
df = pd.read_csv(r'D:\data-science-end\notebook\data\ipl_fully_standardized_features.csv')

In [27]:
df.describe()

Unnamed: 0,team1_avg_elo,team2_avg_elo,team1_avg_form,team2_avg_form,team2_batsmen_avg_elo,team2_batsmen_avg_form,team1_last_5_wins,team2_last_5_wins,team1_vs_team2_matches,team1_vs_team2_wins,team2_vs_team1_wins,head_to_head_wins,head_to_head_losses,team1_won
count,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0,1095.0
mean,1601.918177,1600.469559,0.18921,0.189872,1591.469173,0.165474,2.424658,2.451142,19.085845,9.389041,9.614612,13.714155,13.890411,0.506849
std,91.201659,91.498735,0.020136,0.020299,93.213757,0.029489,1.13408,1.142952,10.535998,5.746652,5.849996,5.652775,5.814064,0.500182
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,1582.8,1582.853535,0.178571,0.179362,1562.75,0.146505,2.0,2.0,9.0,4.0,5.0,11.0,11.0,0.0
50%,1611.272727,1607.363636,0.191046,0.191868,1597.666667,0.169061,2.0,2.0,21.0,10.0,10.0,15.0,15.0,1.0
75%,1629.909091,1627.904545,0.201749,0.202243,1625.975,0.185558,3.0,3.0,29.0,14.0,14.0,17.0,18.0,1.0
max,1726.5,1743.571429,0.251447,0.247839,1749.0,0.25096,5.0,5.0,37.0,23.0,23.0,26.0,26.0,1.0


## Some feature engineering

In [99]:
def create_features(df):
    # Create a copy of the dataframe
    df_new = df.copy()
    
    # # # Calculate ELO difference
    # df_new['elo_difference'] = df_new['team1_avg_elo'] - df_new['team2_avg_elo']
    
    # # # Calculate form difference
    # df_new['form_difference'] = df_new['team1_avg_form'] - df_new['team2_avg_form']
    
    # # Calculate win rate in head-to-head matches
    # df_new['head_to_head_win_rate'] = df_new['team1_vs_team2_wins'] / df_new['team1_vs_team2_matches']
    
    # # Calculate recent performance difference
    # df_new['recent_performance_diff'] = df_new['team1_last_5_wins'] - df_new['team2_last_5_wins']
    
    # # Win rate calculation
    # df_new['team1_win_rate'] = df_new['head_to_head_wins'] / (df_new['head_to_head_wins'] + df_new['head_to_head_losses'])
    
    # # Calculate ratio of elo ratings
    # df_new['elo_ratio'] = df_new['team1_avg_elo'] / df_new['team2_avg_elo']
    
    # # Calculate form ratio
    # df_new['form_ratio'] = df_new['team1_avg_form'] / (df_new['team2_avg_form'] + 0.0001)  # Adding small value to avoid division by zero
    
    # One-hot encode teams and venues
    team1_encoded = pd.get_dummies(df_new['team1'], prefix='team1')
    team2_encoded = pd.get_dummies(df_new['team2'], prefix='team2')
    venues_encoded = pd.get_dummies(df_new['venue'], prefix='venue')
    
    # Combine all features
    df_encoded = pd.concat([df_new.reset_index(drop=True), 
                           team1_encoded.reset_index(drop=True), 
                           team2_encoded.reset_index(drop=True), 
                           venues_encoded.reset_index(drop=True)], axis=1)
    
    # Drop original string columns
    df_encoded = df_encoded.drop(['team1', 'team2', 'venue'], axis=1)
    
    return df_encoded

In [100]:
enhanced_data = create_features(df)
print("\nMissing values in enhanced data:")
print(enhanced_data.isnull().sum())
enhanced_data = enhanced_data.fillna(0)
print("\nMissing values in enhanced data after filling NAs:")
print(enhanced_data.isnull().sum())
print("\nEnhanced dataset shape:", enhanced_data.shape)
X=enhanced_data.drop('team1_won', axis=1)
y = enhanced_data['team1_won']


Missing values in enhanced data:
team1_avg_elo                                 0
team2_avg_elo                                 0
team1_avg_form                                0
team2_avg_form                                0
team2_batsmen_avg_elo                         0
                                             ..
venue_Sheikh Zayed Stadium                    0
venue_St George's Park                        0
venue_SuperSport Park                         0
venue_Vidarbha Cricket Association Stadium    0
venue_Wankhede Stadium                        0
Length: 73, dtype: int64

Missing values in enhanced data after filling NAs:
team1_avg_elo                                 0
team2_avg_elo                                 0
team1_avg_form                                0
team2_avg_form                                0
team2_batsmen_avg_elo                         0
                                             ..
venue_Sheikh Zayed Stadium                    0
venue_St George's Park   

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Creating a function to evaluate the models:-

In [103]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))
    
    return {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [104]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

## Time to train all these models!

In [105]:
results = {}
for name, model in models.items():
    results[name] = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, name)

# Hyperparameter tuning for the best model
best_model_name = max(results, key=lambda x: results[x]['f1'])
print(f"\nBest model: {best_model_name} with accuracy: {results[best_model_name]['accuracy']:.4f}")


Logistic Regression Results:
Accuracy: 0.6164
Precision: 0.6792
Recall: 0.5902
F1 Score: 0.6316

Confusion Matrix:
[[63 34]
 [50 72]]

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.65      0.60        97
           1       0.68      0.59      0.63       122

    accuracy                           0.62       219
   macro avg       0.62      0.62      0.62       219
weighted avg       0.63      0.62      0.62       219


Random Forest Results:
Accuracy: 0.5753
Precision: 0.6306
Recall: 0.5738
F1 Score: 0.6009

Confusion Matrix:
[[56 41]
 [52 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55        97
           1       0.63      0.57      0.60       122

    accuracy                           0.58       219
   macro avg       0.57      0.58      0.57       219
weighted avg       0.58      0.58      0.58       219


Gradient Boosting Results:
Accura

## Cross validation for better understanding of accuracy

In [106]:
from sklearn.model_selection import StratifiedKFold

def evaluate_model_cv(model, X, y, model_name, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
    
    print(f"\n{model_name} Cross-Validation Results:")
    print(f"Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    
    model.fit(X, y)
    
    return {
        'model': model,
        'accuracy': cv_scores.mean(),
        'std': cv_scores.std()
    }

In [107]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
results = {}
for name, model in models.items():
    results[name] = evaluate_model_cv(model, X_scaled, y, name)

# Find the best model
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
print(f"\nBest model: {best_model_name} with accuracy: {results[best_model_name]['accuracy']:.4f}")


Logistic Regression Cross-Validation Results:
Accuracy: 0.6027 (±0.0161)

Random Forest Cross-Validation Results:
Accuracy: 0.6091 (±0.0146)

Gradient Boosting Cross-Validation Results:
Accuracy: 0.6301 (±0.0067)

SVM Cross-Validation Results:
Accuracy: 0.5644 (±0.0216)

Best model: Gradient Boosting with accuracy: 0.6301


In [108]:
if best_model_name == 'Logistic Regression':
    param_grid = {
        'C': [0.1, 1, 10],
        'penalty': ['l2', None],
        'solver': ['liblinear', 'lbfgs']
    }
    grid_model = LogisticRegression(random_state=42, max_iter=2000)
elif best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5]
    }
    grid_model = RandomForestClassifier(random_state=42)
elif best_model_name == 'Gradient Boosting':
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }
    grid_model = GradientBoostingClassifier(random_state=42)
elif best_model_name == 'SVM':
    param_grid = {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'linear']
    }
    grid_model = SVC(probability=True, random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(grid_model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_scaled, y)

In [None]:
print(f"\nBest parameters for {best_model_name}:")
print(grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# default parameters of gradient boosting produced best results


Best parameters for Gradient Boosting:
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best cross-validation score: 0.6137


## Underwhelming results so let's get to deep learning