## 3-Stage Stacking

In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import StackingClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [2]:
warnings.filterwarnings('ignore')


In [3]:
# Read the engineered games/odds table and keep only rows without missing values.
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [4]:
# Chronological split on the `season` column:
#   train:      2007 <= season <= 2013
#   validation: 2013 <  season <  2016
#   test:       season >= 2016
# Rows with season < 2007 fall into none of the three partitions.
season_col = df.season
train_data = df.loc[season_col.le(2013) & season_col.ge(2007)]
valid_data = df.loc[season_col.gt(2013) & season_col.lt(2016)]
test_data = df.loc[season_col.ge(2016)]
full_train_data = pd.concat([train_data, valid_data], axis=0)

# Variants that drop only the target column (identifier columns are still present).
target_column = ['home_team_wins']
X, y = train_data.drop(columns=target_column), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=target_column), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=target_column), test_data.home_team_wins

# Modelling matrices: the target plus the date/identifier/team/conference columns
# listed below are removed from every partition.
non_feature_columns = [
    "game_date_est", "season", "game_id",
    "home_team", "visitor_team",
    "home_team_id", "visitor_team_id",
    "home_team_wins",
    "conference", "conference_visitor",
]
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data.home_team_wins
X_val = valid_data.drop(columns=non_feature_columns)
y_val = valid_data.home_team_wins
X_test = test_data.drop(columns=non_feature_columns)
y_test = test_data.home_team_wins


### GridSearchCV

In [21]:
# Build (but do not fit) the 3-stage stacking classifier for the full feature set.
#   Stage 1 - base learners fitted on the input features: SVM, random forest.
#   Stage 2 - learners fitted on the stage-1 predictions: XGBoost, Gaussian NB.
#   Stage 3 - final meta-learner on the stage-2 predictions: AdaBoost.
# Each StackingClassifier uses 5-fold cross-validation (cv=5) to produce the
# predictions its final estimator is trained on.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100, random_state=42)),
        ]
layer_two_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),

# Candidate final estimators; only `ada` is wired in below, `mlp` is the alternative.
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs', random_state=42)
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)


In [22]:
# Fit on the training seasons and evaluate on the validation seasons.
# start_time is read again by the reporting cell to print the elapsed time.
start_time = time.time()
stacking_model.fit(X_train, y_train)

# Predict once and derive the accuracy from those predictions. This is the same
# value stacking_model.score(X_val, y_val) returns, without running the whole
# stack over the validation set a second and third time.
preds = stacking_model.predict(X_val)
val_score = accuracy_score(y_val, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score


In [23]:
# Display names for the two classes in the report.
# presumably 0 = home loss and 1 = home win — confirm against the home_team_wins encoding.
target_names = ['home_loss', 'home_win']

# NOTE: start_time was set in the previous cell, so the figure printed here also
# includes any pause between executing the two cells.
# The Greek label reads "Total fit and predict time".
elapsed_seconds = time.time() - start_time
print("Συνολικός χρόνος fit και predict: %s seconds" % elapsed_seconds)

validation_report = classification_report(y_val, preds, target_names=target_names)
print(validation_report)
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 104.21606087684631 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.63      0.63      1011
    home_win       0.74      0.75      0.74      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.69      0.69      2420
weighted avg       0.70      0.70      0.70      2420

val score: 0.697107438016529


### Univariate Feature Selection Dataset

In [24]:
# Feature subset for the univariate-selection experiment; the same columns are
# taken from the training and validation partitions, and the targets are reused.
uni_features = ['diff_curr_win_pct', 'diff_curr_away_record', 'odds_home', 'odds_away', 'elo_diff']

X_train_uni, y_train_uni = train_data.loc[:, uni_features], y_train
X_val_uni, y_val_uni = valid_data.loc[:, uni_features], y_val

In [31]:
# Build (but do not fit) the 3-stage stacking classifier for the univariate subset.
#   Stage 1 - base learners fitted on the input features: SVM, XGBoost.
#   Stage 2 - learners fitted on the stage-1 predictions: random forest, Gaussian NB.
#   Stage 3 - final meta-learner on the stage-2 predictions: AdaBoost.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=42)),
        ]
layer_two_estimators = [
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),

# Candidate final estimators; only `ada` is wired in below, `mlp` is the alternative.
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs', random_state=42)
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [32]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_uni, y_train_uni)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_uni)
val_score = accuracy_score(y_val_uni, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_uni, preds, target_names=target_names))
print("val score:", val_score)

Συνολικός χρόνος fit και predict: 157.25669360160828 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.52      0.58      1011
    home_win       0.70      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.67      2420
weighted avg       0.69      0.69      0.68      2420

val score: 0.6896694214876034


### ExtraTrees Classifier Dataset

In [33]:
# Feature subset for the ExtraTrees-selection experiment; one column list is
# applied to both the training and validation partitions, targets are reused.
extra_features = [
    'odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
    'diff_home_record_last_season', 'ROAD_RECORD_home',
    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home',
    'diff_curr_home_record', 'diff_curr_win_pct',
]

X_train_extra, y_train_extra = train_data.loc[:, extra_features], y_train
X_val_extra, y_val_extra = valid_data.loc[:, extra_features], y_val

In [42]:
# Build (but do not fit) the 3-stage stacking classifier for the ExtraTrees subset.
#   Stage 1 - base learners fitted on the input features: XGBoost, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, random forest.
#   Stage 3 - final meta-learner on the stage-2 predictions: MLP.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged. Note the stage order differs from the first two sections.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),

# Candidate final estimators; only `mlp` is wired in below, `ada` is the alternative.
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='lbfgs', random_state=42)
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=mlp, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [43]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_extra, y_train_extra)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_extra)
val_score = accuracy_score(y_val_extra, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_extra, preds, target_names=target_names))
print("val score:", val_score)

# Same validation predictions as `preds`; copied instead of predicting again.
y_fit = preds.copy()


Συνολικός χρόνος fit και predict: 24.33743691444397 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.60      0.62      1011
    home_win       0.73      0.76      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.693801652892562


### RFECV Dataset

In [44]:
# Feature subset for the RFECV experiment; one column list is applied to both the
# training and validation partitions, targets are reused.
rcv_features = ['missing_player_diff', 'odds_home', 'odds_away', 'visitor_elo']

X_train_rcv, y_train_rcv = train_data.loc[:, rcv_features], y_train
X_val_rcv, y_val_rcv = valid_data.loc[:, rcv_features], y_val

In [45]:
# Build (but do not fit) the 3-stage stacking classifier for the RFECV subset.
#   Stage 1 - base learners fitted on the input features: random forest, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, XGBoost.
#   Stage 3 - final meta-learner on the stage-2 predictions: MLP.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged.
# max_features='sqrt' replaces 'auto': for classifiers the two were equivalent, and
# 'auto' is rejected by scikit-learn >= 1.3.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5, min_samples_split=10, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),

# Candidate final estimators; only `mlp` is wired in below, `ada` is the alternative.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs', random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=mlp, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [46]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_rcv, y_train_rcv)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_rcv)
val_score = accuracy_score(y_val_rcv, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_rcv, preds, target_names=target_names))
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 22.538868188858032 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.57      0.61      1011
    home_win       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6954545454545454


### PCA

In [47]:
# Project the full feature set onto 30 principal components. The PCA is fitted on
# the training split only and then applied unchanged to the validation and test
# splits.
# random_state pins the projection for the case where scikit-learn's default
# 'auto' solver policy selects the randomized SVD solver.
# NOTE(review): the features are not standardized before PCA (StandardScaler is
# imported but never used), so large-scale columns may dominate the components —
# confirm whether this is intended.
n_components = 30
pca = PCA(n_components=n_components, random_state=42).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [52]:
# Build (but do not fit) the 3-stage stacking classifier for the PCA features.
#   Stage 1 - base learners fitted on the input features: XGBoost, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, random forest.
#   Stage 3 - final meta-learner on the stage-2 predictions: MLP.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=10, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),

# Candidate final estimators; only `mlp` is wired in below, `ada` is the alternative.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs', random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=mlp, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [53]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_pca, y_train)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_pca)
val_score = accuracy_score(y_val, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val, preds, target_names=target_names))
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 25.889492750167847 seconds
              precision    recall  f1-score   support

   home_loss       0.62      0.63      0.63      1011
    home_win       0.73      0.73      0.73      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6867768595041323


### Lasso Dataset

In [54]:
# Feature subset for the Lasso-selection experiment; one column list is applied to
# both the training and validation partitions, targets are reused.
lasso_features = [
    'elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_lasso, y_train_lasso = train_data.loc[:, lasso_features], y_train
X_val_lasso, y_val_lasso = valid_data.loc[:, lasso_features], y_val

In [55]:
# Build (but do not fit) the 3-stage stacking classifier for the Lasso subset.
#   Stage 1 - base learners fitted on the input features: XGBoost, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, random forest.
#   Stage 3 - final meta-learner on the stage-2 predictions: AdaBoost.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged.
# max_features='sqrt' replaces 'auto': for classifiers the two were equivalent, and
# 'auto' is rejected by scikit-learn >= 1.3.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=100, gamma=0.001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=4, min_samples_split=8, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidate, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),

# Candidate final estimators; only `ada` is wired in below, `mlp` is the alternative.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs', random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [56]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_lasso, y_train_lasso)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_lasso)
val_score = accuracy_score(y_val_lasso, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_lasso, preds, target_names=target_names))
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 21.658222675323486 seconds
              precision    recall  f1-score   support

   home_loss       0.66      0.58      0.62      1011
    home_win       0.72      0.79      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.70      2420

val score: 0.6995867768595041


### SFS Forward Dataset

In [57]:
# Feature subset for the forward sequential-feature-selection experiment; one
# column list is applied to both the training and validation partitions, and the
# targets are reused.
for_sfs_features = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
    'back2back_visitor', 'missing_players', 'missing_players_visitor',
    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
    'diff_home_record_last_season', 'diff_road_record_last_season',
    'diff_curr_win_pct',
]

X_train_for_sfs, y_train_for_sfs = train_data.loc[:, for_sfs_features], y_train
X_val_for_sfs, y_val_for_sfs = valid_data.loc[:, for_sfs_features], y_val


In [62]:
# Build (but do not fit) the 3-stage stacking classifier for the forward-SFS subset.
#   Stage 1 - base learners fitted on the input features: random forest, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, XGBoost.
#   Stage 3 - final meta-learner on the stage-2 predictions: AdaBoost.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=3, min_samples_split=8, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidates, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
#       ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),

# Candidate final estimators; only `ada` is wired in below, `mlp` is the alternative.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs', random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [63]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_for_sfs, y_train_for_sfs)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_for_sfs)
val_score = accuracy_score(y_val_for_sfs, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_for_sfs, preds, target_names=target_names))
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 30.913140535354614 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.58      0.62      1011
    home_win       0.72      0.77      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

val score: 0.6950413223140496


### SFS Backward Dataset

In [64]:
# Feature subset for the backward sequential-feature-selection experiment; one
# column list is applied to both the training and validation partitions, and the
# targets are reused.
back_sfs_features = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
    'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
    'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
    'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
    'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
    'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
    'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
    'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
    'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
    'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
    'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
    'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
    'diff_ast_last_7_games', 'diff_reb_last_3_games',
    'diff_win_pct_3_last_games',
]

X_train_back_sfs, y_train_back_sfs = train_data.loc[:, back_sfs_features], y_train
X_val_back_sfs, y_val_back_sfs = valid_data.loc[:, back_sfs_features], y_val

In [69]:
# Build (but do not fit) the 3-stage stacking classifier for the backward-SFS subset.
#   Stage 1 - base learners fitted on the input features: XGBoost, Gaussian NB.
#   Stage 2 - learners fitted on the stage-1 predictions: SVM, random forest.
#   Stage 3 - final meta-learner on the stage-2 predictions: AdaBoost.
# The list names now follow the data flow (layer one sees the input features).
# Previously the two names were swapped relative to their roles; the architecture
# built here is unchanged.
# Every stochastic estimator is given random_state=42, as the SVC already was, so
# re-running the notebook rebuilds the same model.
layer_one_estimators = [
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
        ]
layer_two_estimators = [
        ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100, random_state=42)),
        ]
# Disabled base-learner candidates, kept for reference:
#       ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
#       ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),

# Candidate final estimators; only `ada` is wired in below, `mlp` is the alternative.
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs', random_state=42)

# Stages 2 and 3: the inner stack acts as the final estimator of the outer stack.
layer_two = StackingClassifier(estimators=layer_two_estimators, final_estimator=ada, cv=5)

# Complete 3-stage model; fitting and prediction happen in the next cell.
stacking_model = StackingClassifier(estimators=layer_one_estimators, final_estimator=layer_two, cv=5)

In [70]:
# Time the fit plus a single prediction pass over the validation seasons.
start_time = time.time()
stacking_model.fit(X_train_back_sfs, y_train_back_sfs)

# One predict call feeds both the accuracy and the report; accuracy_score on these
# predictions is the value stacking_model.score() would return.
preds = stacking_model.predict(X_val_back_sfs)
val_score = accuracy_score(y_val_back_sfs, preds)

# Kept only for backward compatibility: this name was always computed on the
# validation split, so it is NOT a held-out test-set score.
test_score = val_score

target_names = ['home_loss', 'home_win']

# The Greek label reads "Total fit and predict time".
print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_val_back_sfs, preds, target_names=target_names))
print("val score:", val_score)


Συνολικός χρόνος fit και predict: 23.10113024711609 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.63      0.63      1011
    home_win       0.74      0.74      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420

val score: 0.6929752066115702
