In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import StackingClassifier, AdaBoostClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season < 2016) & (df.season >= 2007)]
test_data = df.loc[df.season >= 2016]

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins


In [14]:
def evaluate(model, X_train, X_test, y_train, y_test):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print("Training Results: \n===============================")
    clf_report = classification_report(y_train, y_train_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_train, y_train_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

    print("Testing Results: \n===============================")
    clf_report = classification_report(y_test, y_test_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

### Stacking Model with GridSearchCV

In [8]:
# create a voting classifier with hard voting
clf = [ ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('ada', AdaBoostClassifier(learning_rate=0.1, n_estimators=100))
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
        ('gnb', GaussianNB())]

        
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
# make predictions with the 2-stage stacking model

stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp) 


In [9]:
start_time = time.time()
stacking_model.fit(X_train, y_train)

preds = stacking_model.predict(X_test)
test_score = stacking_model.score(X_test, y_test)


In [10]:
target_names = ['home_loss', 'home_win']


print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 80.71528005599976 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.51      0.56      1935
    home_win       0.69      0.79      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.66      4583

val score: 0.6983471074380165
test score 0.6700850970979708


#### Univariate Feature Selection Dataset

In [11]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train


X_test_uni = test_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_test_uni = y_test

In [12]:
# create a stacking classifier
clf = [ ('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]
        
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
# make predictions with the 2-stage stacking model

stacking_model = StackingClassifier(estimators=clf, final_estimator=ada, stack_method='auto', n_jobs=-1) 


In [13]:
start_time = time.time()
stacking_model.fit(X_train_uni, y_train_uni)

preds = stacking_model.predict(X_test_uni)
test_score = stacking_model.score(X_test_uni, y_test_uni)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_uni, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 84.82153940200806 seconds
              precision    recall  f1-score   support

   home_loss       0.62      0.56      0.59      1935
    home_win       0.70      0.75      0.72      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.67      4583

val score: 0.6904958677685951
test score: 0.6685577132882391


#### ExtraTreesClassifier Dataset

In [14]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train


X_test_extra = test_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_test_extra = y_test

In [15]:
clf = [ ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('ada' , AdaBoostClassifier(learning_rate=0.1, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]

mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='lbfgs')
ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
# make predictions with the 2-stage stacking model

stacking_model = StackingClassifier(estimators=clf, final_estimator=ada, stack_method='auto', n_jobs=-1) 


In [16]:
start_time = time.time()
stacking_model.fit(X_train_extra, y_train_extra)

preds = stacking_model.predict(X_test_extra)
test_score = stacking_model.score(X_test_extra, y_test_extra)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_extra, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 22.68713641166687 seconds
              precision    recall  f1-score   support

   home_loss       0.62      0.55      0.58      1935
    home_win       0.70      0.76      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.67      4583

val score: 0.6983471074380165
test score: 0.6694305040366572


#### RFECV Dataset

In [17]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train


X_test_rcv = test_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_test_rcv = y_test

In [18]:
clf = [ ('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=5, min_samples_split=10, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')
# make predictions with the 2-stage stacking model

stacking_model = StackingClassifier(estimators=clf, final_estimator=ada, stack_method='auto', n_jobs=-1)

In [19]:
start_time = time.time()
stacking_model.fit(X_train_rcv, y_train_rcv)

preds = stacking_model.predict(X_test_rcv)
test_score = stacking_model.score(X_test_rcv, y_test_rcv)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_rcv, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 26.551005840301514 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.56      0.59      1935
    home_win       0.70      0.76      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583

val score: 0.6925619834710743
test score: 0.6748854462142702


#### PCA

In [20]:
n_components = 30
pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [21]:
clf = [ ('svm', SVC(random_state=42, C=10, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='sgd')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
        ('gnb', GaussianNB())]
        

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='sgd')

stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, stack_method='auto', n_jobs=-1)

In [22]:
start_time = time.time()
stacking_model.fit(X_train_pca, y_train)

preds = stacking_model.predict(X_test_pca)
test_score = stacking_model.score(X_test_pca, y_test)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test, preds, target_names=target_names))
print("test score", test_score)


Συνολικός χρόνος fit και predict: 33.09489583969116 seconds
              precision    recall  f1-score   support

   home_loss       0.64      0.41      0.50      1935
    home_win       0.66      0.83      0.73      2648

    accuracy                           0.65      4583
   macro avg       0.65      0.62      0.62      4583
weighted avg       0.65      0.65      0.64      4583

val score: 0.6962809917355371
test score 0.6530656775038185


#### Lasso Dataset

In [5]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train


X_test_lasso = test_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_test_lasso = y_test

In [10]:
clf = [ ('svm', SVC(C=10, gamma=0.01, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]


ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='sgd')

stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, stack_method='auto', n_jobs=-1)

In [11]:
start_time = time.time()
stacking_model.fit(X_train_lasso, y_train_lasso)

preds = stacking_model.predict(X_test_lasso)
test_score = stacking_model.score(X_test_lasso, y_test_lasso)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_lasso, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 28.10810136795044 seconds
              precision    recall  f1-score   support

   home_loss       0.65      0.54      0.59      1935
    home_win       0.70      0.78      0.74      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.68      0.68      0.68      4583

test score: 0.6812131791403011


In [15]:
evaluate(stacking_model, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso)

Training Results: 
Confusion Matrix:
[[2168 1627]
 [ 994 4531]]
Accuracy Score:
0.7188
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.57      0.62      3795
           1       0.74      0.82      0.78      5525

    accuracy                           0.72      9320
   macro avg       0.71      0.70      0.70      9320
weighted avg       0.72      0.72      0.71      9320

Testing Results: 
Confusion Matrix:
[[1045  890]
 [ 571 2077]]
Accuracy Score:
0.6812
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.54      0.59      1935
           1       0.70      0.78      0.74      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.68      0.68      0.68      4583



#### SFS Forward Dataset

In [26]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_test_for_sfs = test_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_test_for_sfs = y_test

In [27]:
clf = [ ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')

stacking_model = StackingClassifier(estimators=clf, final_estimator=ada, stack_method='auto', n_jobs=-1)

In [28]:
start_time = time.time()
stacking_model.fit(X_train_for_sfs, y_train_for_sfs)

preds = stacking_model.predict(X_test_for_sfs)
test_score = stacking_model.score(X_test_for_sfs, y_test_for_sfs)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_for_sfs, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 28.699158430099487 seconds
              precision    recall  f1-score   support

   home_loss       0.63      0.57      0.60      1935
    home_win       0.71      0.75      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583

val score: 0.6966942148760331
test score: 0.6751036439013747


#### SFS Backwards Dataset

In [29]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train


X_test_back_sfs = test_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_test_back_sfs = y_test

In [30]:
clf = [ ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
        # ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
        # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
        ('gnb', GaussianNB())]

ada = AdaBoostClassifier(learning_rate=0.1, n_estimators=100)
mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')

stacking_model = StackingClassifier(estimators=clf, final_estimator=mlp, stack_method='auto', n_jobs=-1)

In [31]:
start_time = time.time()
stacking_model.fit(X_train_back_sfs, y_train_back_sfs)

preds = stacking_model.predict(X_test_back_sfs)
test_score = stacking_model.score(X_test_back_sfs, y_test_back_sfs)

target_names = ['home_loss', 'home_win']

print("Συνολικός χρόνος fit και predict: %s seconds" % (time.time() - start_time))
print(classification_report(y_test_back_sfs, preds, target_names=target_names))
print("test score:", test_score)


Συνολικός χρόνος fit και predict: 34.22588634490967 seconds
              precision    recall  f1-score   support

   home_loss       0.61      0.55      0.58      1935
    home_win       0.70      0.75      0.72      2648

    accuracy                           0.66      4583
   macro avg       0.65      0.65      0.65      4583
weighted avg       0.66      0.66      0.66      4583

val score: 0.693801652892562
test score: 0.6644119572332533
