In [2]:
## Print multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import pandas as pd
import numpy as np

train_data = pd.read_csv('train_data_processed.csv')
test_data = pd.read_csv('test_data_processed.csv')

In [4]:
train_data.head(3)

Unnamed: 0,winner_01,team_count_runs_last15,team_consistency_last10,team1_winp_team2_last5,team_bat_strenght_last15,team_bowl_strenght_last15,team_count_wickets_last15,team_discipline_last15,team_winp_vanue_last5
0,0,1.0,0.744186,100.0,0.987319,1.060542,1.647059,1.022727,0.50495
1,1,1.063492,0.986111,50.0,0.924142,1.09203,0.823529,0.412742,0.009901
2,1,0.866667,1.179688,0.0,0.998788,0.966072,1.105042,1.431193,0.02439


In [10]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

features = train_data.drop(columns=['winner_01']).columns

X = train_data[features]
y = train_data['winner_01']

# Imputer for Nan values #change this in features only
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)
test_data_imputed = imputer.transform(test_data[features])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameter grid for GBM
param_grid_gbm = {
    'n_estimators': [100, 250],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gbm = GradientBoostingClassifier(random_state=42)
grid_search_gbm = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_gbm.fit(X_train, y_train)

# Best model
best_gbm = grid_search_gbm.best_estimator_

# Predict on validation and test data
y_val_pred_gbm = best_gbm.predict(X_val)
y_test_pred_gbm = best_gbm.predict(test_data_imputed)

# Evaluate the model
accuracy_gbm = accuracy_score(y_val, y_val_pred_gbm)
print(f"GBM Validation Accuracy: {accuracy_gbm}")
print(classification_report(y_val, y_val_pred_gbm))

# predictions
test_data['gbm_predictions'] = y_test_pred_gbm

# # LightGBM model
param_grid_lgbm = {
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200]
}

grid_search_lgbm = GridSearchCV(lgb.LGBMClassifier(), param_grid_lgbm, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_lgbm.fit(X_train, y_train)

# Best model
best_lgbm = grid_search_lgbm.best_estimator_

# Predict on validation and test data
y_val_pred_lgbm = best_lgbm.predict(X_val)
y_test_pred_lgbm = best_lgbm.predict(test_data_imputed)

# Evaluate the model
accuracy_lgbm = accuracy_score(y_val, y_val_pred_lgbm)
print(f"LightGBM Validation Accuracy: {accuracy_lgbm}")
print(classification_report(y_val, y_val_pred_lgbm))

# predictions
test_data['lightgbm_predictions'] = y_test_pred_lgbm

# XGBoost model
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = xgb.XGBClassifier(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Best model
best_xgb = grid_search_xgb.best_estimator_

# Predict on validation and test data
y_val_pred_xgb = best_xgb.predict(X_val)
y_test_pred_xgb = best_xgb.predict(test_data_imputed)

# Evaluate the model
accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb)
print(f"XGBoost Validation Accuracy: {accuracy_xgb}")
print(classification_report(y_val, y_val_pred_xgb))

test_data['xgboost_predictions'] = y_test_pred_xgb

# CatBoost model
param_grid_cat = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [3, 5],
    'l2_leaf_reg': [1, 3, 5]
}

cat_model = CatBoostClassifier(random_seed=42, verbose=0)
grid_search_cat = GridSearchCV(estimator=cat_model, param_grid=param_grid_cat, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search_cat.fit(X_train, y_train)

# Best model
best_cat = grid_search_cat.best_estimator_

# Predict on validation and test data
y_val_pred_cat = best_cat.predict(X_val)
y_test_pred_cat = best_cat.predict(test_data_imputed)

# Evaluate the model
accuracy_cat = accuracy_score(y_val, y_val_pred_cat)
print(f"CatBoost Validation Accuracy: {accuracy_cat}")
print(classification_report(y_val, y_val_pred_cat))

# predictions
test_data['catboost_predictions'] = y_test_pred_cat

#save
test_data.to_csv('test_data_with_predictions.csv', index=False)


Fitting 3 folds for each of 32 candidates, totalling 96 fits


GBM Validation Accuracy: 0.5210526315789473
              precision    recall  f1-score   support

           0       0.50      0.58      0.54        91
           1       0.55      0.46      0.50        99

    accuracy                           0.52       190
   macro avg       0.52      0.52      0.52       190
weighted avg       0.52      0.52      0.52       190

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 372, number of negative: 386
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1568
[LightGBM] [Info] Number of data points in the train set: 758, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490765 -> initscore=-0.036944
[LightGBM] [Info] Start training from score -0.036944


LightGBM Validation Accuracy: 0.5210526315789473
              precision    recall  f1-score   support

           0       0.50      0.52      0.51        91
           1       0.54      0.53      0.53        99

    accuracy                           0.52       190
   macro avg       0.52      0.52      0.52       190
weighted avg       0.52      0.52      0.52       190

Fitting 3 folds for each of 64 candidates, totalling 192 fits


XGBoost Validation Accuracy: 0.5052631578947369
              precision    recall  f1-score   support

           0       0.48      0.53      0.51        91
           1       0.53      0.48      0.51        99

    accuracy                           0.51       190
   macro avg       0.51      0.51      0.51       190
weighted avg       0.51      0.51      0.51       190

Fitting 3 folds for each of 24 candidates, totalling 72 fits


CatBoost Validation Accuracy: 0.5842105263157895
              precision    recall  f1-score   support

           0       0.56      0.64      0.59        91
           1       0.62      0.54      0.57        99

    accuracy                           0.58       190
   macro avg       0.59      0.59      0.58       190
weighted avg       0.59      0.58      0.58       190

