In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the CSV and keep only the rows that have no missing values.
df = pd.read_csv('data6_&_odds.csv').dropna()

In [3]:
# Season-based boolean masks: 2007..2013 inclusive for training, strictly between
# 2013 and 2016 for validation, and 2016 onward for testing.
in_train = (df.season <= 2013) & (df.season >= 2007)
in_valid = (df.season > 2013) & (df.season < 2016)
in_test = df.season >= 2016

train_data = df.loc[in_train]
valid_data = df.loc[in_valid]
test_data = df.loc[in_test]
# Training and validation rows stacked together.
full_train_data = pd.concat([train_data, valid_data], axis=0)

# Wide variants: only the target column is removed, every other column is kept.
X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
# Columns removed from the model inputs (the target plus the columns listed below).
non_feature_columns = [
    "game_date_est", "season", "game_id", "home_team", "visitor_team",
    "home_team_id", "visitor_team_id", "home_team_wins",
    "conference", "conference_visitor",
]
X_train, y_train = train_data.drop(columns=non_feature_columns), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=non_feature_columns), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=non_feature_columns), test_data.home_team_wins


In [4]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print train and test diagnostics for an already-fitted classifier.

    For the training split and then the test split, prints a banner, the
    confusion matrix, the accuracy (four decimals) and the classification
    report. Nothing is returned.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_train, X_test : feature sets passed to ``model.predict``.
    y_train, y_test : true labels compared against the predictions.
    """
    # Same call order as before: test predictions first, then training predictions.
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    sections = (
        ("Training Results: \n===============================", y_train, y_train_pred),
        ("Testing Results: \n===============================", y_test, y_test_pred),
    )
    for banner, y_true, y_pred in sections:
        print(banner)
        report_text = classification_report(y_true, y_pred)
        print(f"Confusion Matrix:\n{confusion_matrix(y_true, y_pred)}")
        print(f"Accuracy Score:\n{accuracy_score(y_true, y_pred):.4f}")
        print(f"Classification Report:\n{report_text}")

### Combine the top 5 PyCaret models into a Stacking Classifier

In [14]:
# Base estimators of the stacking ensemble, as (name, estimator) pairs.
# The LGBMClassifier, KNeighborsClassifier, MLPClassifier and plain GaussianNB
# entries that previously sat here as commented-out code are not part of the
# ensemble and have been removed.
clf = [
    ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
    ('extra', ExtraTreesClassifier(
        bootstrap=False, ccp_alpha=0.0, class_weight=None,
        criterion='gini', max_depth=None,
        # 'sqrt' is what the deprecated 'auto' alias meant for classifiers;
        # newer scikit-learn releases reject 'auto'.
        max_features='sqrt',
        max_leaf_nodes=None, max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
        oob_score=False, random_state=7419, verbose=0, warm_start=False)),
    # `loss` is left at its default: the old name 'deviance' was renamed to
    # 'log_loss' and later removed, while the default is that same loss in
    # every release.
    ('gbc', GradientBoostingClassifier(
        ccp_alpha=0.0, criterion='friedman_mse', init=None,
        learning_rate=0.1, max_depth=3,
        max_features=None, max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100,
        n_iter_no_change=None,
        random_state=8807, subsample=1.0, tol=0.0001,
        validation_fraction=0.1, verbose=0,
        warm_start=False)),
    ('rf', RandomForestClassifier(
        bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
        criterion='entropy', max_depth=10, max_features='sqrt',
        max_leaf_nodes=None, max_samples=None,
        min_impurity_decrease=0.0002,
        min_samples_leaf=3, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
        oob_score=False, random_state=8807, verbose=0,
        warm_start=False)),
    # `base_estimator=None` was dropped: None is the default, and the keyword
    # itself was renamed to `estimator` and later removed.
    ('ada', AdaBoostClassifier(
        algorithm='SAMME', learning_rate=0.3,
        n_estimators=220, random_state=7419)),
]

# Meta-learner trained on the base estimators' cross-validated predictions.
# A fixed random_state makes its weight initialisation repeatable between runs.
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100,
                    solver='lbfgs', random_state=8807)

# Assemble the 2-stage stacking model (base estimators -> MLP); fitting and
# prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, cv=10,
                                    stack_method='auto', n_jobs=-1)


In [15]:
# Fit the stacking ensemble on the training split, then score the validation
# and test splits. perf_counter() is monotonic, so the elapsed time cannot be
# skewed by system clock adjustments.
start_time = time.perf_counter()
stacking_model.fit(X_train, y_train)
val_score = stacking_model.score(X_val, y_val)

# Predict the test split once and reuse the predictions for both the accuracy
# and the report; `score(X_test, y_test)` would run the whole ensemble's
# predict() a second time to compute the same accuracy.
preds = stacking_model.predict(X_test)
test_score = accuracy_score(y_test, preds)

# NOTE(review): assumes the sorted labels are 0 = home loss, 1 = home win — confirm.
target_names = ['home_loss', 'home_win']

# The message is Greek for "Total fit and predict time"; the measured span also
# covers the validation scoring above.
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("val score:", val_score)
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 253.56390523910522 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.55      0.59      1935
    home_win       0.70      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583

val score: 0.6995867768595041
test score: 0.6735762600916431


### Lasso Dataset

In [8]:
# Reduced feature set used for the "Lasso" experiments; the same ten columns
# are taken from each split. (Presumably chosen by a Lasso-based selection
# step not shown here — confirm.)
lasso_columns = [
    'elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_lasso = train_data[lasso_columns]
X_val_lasso = valid_data[lasso_columns]
X_test_lasso = test_data[lasso_columns]

# Targets are the same series as for the full-feature splits.
y_train_lasso = y_train
y_val_lasso = y_val
y_test_lasso = y_test

In [10]:
# Base estimators of the stacking ensemble, as (name, estimator) pairs.
# The LGBMClassifier, KNeighborsClassifier, MLPClassifier and plain GaussianNB
# entries that previously sat here as commented-out code are not part of the
# ensemble and have been removed.
clf = [
    ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
    ('extra', ExtraTreesClassifier(
        bootstrap=False, ccp_alpha=0.0, class_weight=None,
        criterion='gini', max_depth=None,
        # 'sqrt' is what the deprecated 'auto' alias meant for classifiers;
        # newer scikit-learn releases reject 'auto'.
        max_features='sqrt',
        max_leaf_nodes=None, max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
        oob_score=False, random_state=7419, verbose=0, warm_start=False)),
    # `loss` is left at its default: the old name 'deviance' was renamed to
    # 'log_loss' and later removed, while the default is that same loss in
    # every release.
    ('gbc', GradientBoostingClassifier(
        ccp_alpha=0.0, criterion='friedman_mse', init=None,
        learning_rate=0.1, max_depth=3,
        max_features=None, max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100,
        n_iter_no_change=None,
        random_state=8807, subsample=1.0, tol=0.0001,
        validation_fraction=0.1, verbose=0,
        warm_start=False)),
    ('rf', RandomForestClassifier(
        bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
        criterion='entropy', max_depth=10, max_features='sqrt',
        max_leaf_nodes=None, max_samples=None,
        min_impurity_decrease=0.0002,
        min_samples_leaf=3, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
        oob_score=False, random_state=8807, verbose=0,
        warm_start=False)),
    # `base_estimator=None` was dropped: None is the default, and the keyword
    # itself was renamed to `estimator` and later removed.
    ('ada', AdaBoostClassifier(
        algorithm='SAMME', learning_rate=0.3,
        n_estimators=220, random_state=7419)),
]

# Meta-learner trained on the base estimators' cross-validated predictions.
# A fixed random_state makes its weight initialisation repeatable between runs.
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150,
                    solver='lbfgs', random_state=8807)

# Assemble the 2-stage stacking model (base estimators -> MLP); fitting and
# prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, cv=10,
                                    stack_method='auto', n_jobs=-1)


In [11]:
# Fit the stacking ensemble on the reduced ("Lasso") feature set, then score
# the validation and test splits. perf_counter() is monotonic, so the elapsed
# time cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
stacking_model.fit(X_train_lasso, y_train_lasso)
val_score = stacking_model.score(X_val_lasso, y_val_lasso)

# Predict the test split once and reuse the predictions for both the accuracy
# and the report; `score(X_test_lasso, y_test_lasso)` would run the whole
# ensemble's predict() a second time to compute the same accuracy.
preds = stacking_model.predict(X_test_lasso)
test_score = accuracy_score(y_test_lasso, preds)

# NOTE(review): assumes the sorted labels are 0 = home loss, 1 = home win — confirm.
target_names = ['home_loss', 'home_win']

# The message is Greek for "Total fit and predict time"; the measured span also
# covers the validation scoring above.
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.perf_counter() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("val score:", val_score)
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 50.57762837409973 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.56      0.59      1935
    home_win       0.70      0.76      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583

val score: 0.7008264462809918
test score: 0.6751036439013747
