In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import StackingClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season < 2016) & (df.season >= 2007)]
test_data = df.loc[df.season >= 2016]

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins


### GridSearchCV

In [5]:
# create a stacking classifier
layer_one_estimators = [ ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
        # ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
        ]
        # ('ada', AdaBoostClassifier(learning_rate=0.1, n_estimators=100))
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
layer_two_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
        ('gnb', GaussianNB())
        ]

        
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)

layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada)

# make predictions with the 3-stage stacking model

stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two) 


In [6]:
start_time = time.time()
stacking_model.fit(X_train, y_train)

preds = stacking_model.predict(X_test)
test_score = stacking_model.score(X_test, y_test)


In [7]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 81.66969132423401 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.47      0.54      1935
    home_win       0.68      0.81      0.74      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.64      0.64      4583
weighted avg       0.66      0.67      0.66      4583

val score: 0.6942148760330579
test score 0.667248527165612


### RFECV Dataset

In [8]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [11]:
layer_one_estimators = [ 
        ('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100))
        ]   
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
layer_two_estimators = [
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=5, min_samples_split=10, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')


layer_two = StackingClassifier(estimators=layer_one_estimators, final_estimator=ada)

# make predictions with the 3-stage stacking model

stacking_model = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_two) 

In [12]:
start_time = time.time()
stacking_model.fit(X_train_rcv, y_train_rcv)

preds = stacking_model.predict(X_test_rcv)
test_score = stacking_model.score(X_test_rcv, y_test_rcv)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 33.23134708404541 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.56      0.59      1935
    home_win       0.70      0.77      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583

val score: 0.6975206611570248
test score: 0.6777220161466289


### Lasso Dataset

In [13]:
X_train_lasso = train_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [14]:
layer_one_estimators = [ 
        ('svm', SVC(random_state=42, C=100, gamma=0.001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        # ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
        ]   
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
layer_two_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')


layer_two = StackingClassifier(estimators=layer_one_estimators, final_estimator=ada)

# make predictions with the 3-stage stacking model

stacking_model = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_two) 

In [15]:
start_time = time.time()
stacking_model.fit(X_train_lasso, y_train_lasso)

preds = stacking_model.predict(X_test_lasso)
test_score = stacking_model.score(X_test_lasso, y_test_lasso)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 31.36198663711548 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.56      0.60      1935
    home_win       0.70      0.77      0.74      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.67      4583
weighted avg       0.68      0.68      0.68      4583

val score: 0.6979338842975207
test score: 0.6801221907047785


### SFS Forward Dataset

In [16]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train


X_test_for_sfs = test_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_test_for_sfs = y_test

In [17]:
layer_one_estimators = [ 
        ('svm', SVC(random_state=42, C=100, gamma=0.001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        # ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
        ]   
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
layer_two_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')


layer_two = StackingClassifier(estimators=layer_one_estimators, final_estimator=ada)

# make predictions with the 3-stage stacking model

stacking_model = StackingClassifier(estimators=layer_two_estimators, final_estimator=layer_two) 

In [18]:
start_time = time.time()
stacking_model.fit(X_train_for_sfs, y_train_for_sfs)

preds = stacking_model.predict(X_test_for_sfs)
test_score = stacking_model.score(X_test_for_sfs, y_test_for_sfs)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_for_sfs, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 29.624431848526 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.48      0.55      1935
    home_win       0.68      0.81      0.74      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.64      0.64      4583
weighted avg       0.67      0.67      0.66      4583

val score: 0.6917355371900826
test score: 0.6694305040366572
