In [11]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings
warnings.filterwarnings('ignore')


In [2]:
# Read the combined games/odds table and keep only the rows without any missing value.
df = pd.read_csv('data6_&_odds.csv').dropna()

In [3]:
# Name of the label column.
TARGET_COL = 'home_team_wins'

# Columns removed (together with the label) to obtain the model input matrices.
# Kept in one place so the train / validation / test inputs cannot drift apart.
NON_FEATURE_COLS = [
    "game_date_est", "season", "game_id", "home_team", "visitor_team",
    "home_team_id", "visitor_team_id", TARGET_COL, "conference", "conference_visitor",
]


def _split_xy(frame, drop_cols):
    """Return ``(frame without drop_cols, frame[TARGET_COL])``."""
    return frame.drop(columns=drop_cols), frame[TARGET_COL]


# Season-based partition of `df`:
#   train: 2007 <= season <= 2013, validation: 2013 < season < 2016, test: season >= 2016.
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

# Frames that keep every column except the label.
X, y = _split_xy(train_data, [TARGET_COL])
valid_X, valid_y = _split_xy(valid_data, [TARGET_COL])
test_X, test_y = _split_xy(test_data, [TARGET_COL])

# Split our data: model inputs with the non-feature columns removed as well.
X_train, y_train = _split_xy(train_data, NON_FEATURE_COLS)
X_val, y_val = _split_xy(valid_data, NON_FEATURE_COLS)
X_test, y_test = _split_xy(test_data, NON_FEATURE_COLS)


In [4]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print classification metrics of a fitted model on two data splits.

    For the training split and then for the second split (labelled
    "Testing Results" in the output, whatever split is actually passed), the
    confusion matrix, the accuracy (4 decimals) and the classification report
    are printed.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Inputs of the training split.
        X_test: Inputs of the split to compare against.
        y_train: True labels matching ``X_train``.
        y_test: True labels matching ``X_test``.

    Returns:
        None. Everything is written to standard output.
    """
    # Same prediction order as before: second split first, then training split.
    held_out_pred = model.predict(X_test)
    fitted_pred = model.predict(X_train)

    sections = (
        ("Training Results: \n===============================", y_train, fitted_pred),
        ("Testing Results: \n===============================", y_test, held_out_pred),
    )
    for banner, truth, guess in sections:
        print(banner)
        report = classification_report(truth, guess)
        print(f"Confusion Matrix:\n{confusion_matrix(truth, guess)}")
        print(f"Accuracy Score:\n{accuracy_score(truth, guess):.4f}")
        print(f"Classification Report:\n{report}")

### Combine the top 5 PyCaret models into a Voting Classifier

In [14]:
# Hard-voting (majority rule) ensemble of Gaussian NB, Extra Trees, Gradient
# Boosting, Random Forest and AdaBoost. The LightGBM, KNN and MLP candidates
# are not part of this ensemble.
#
# Three keyword arguments are written in a version-portable form, because newer
# scikit-learn releases no longer accept the old spelling:
#   * ExtraTrees       max_features='auto'  -> 'sqrt' (what 'auto' meant for classifiers)
#   * GradientBoosting loss='deviance'      -> omitted (it was the default loss)
#   * AdaBoost         base_estimator=None  -> omitted (None was the default)
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
        ('extra', ExtraTreesClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight=None,
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7419, verbose=0, warm_start=False)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
        ('ada', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.3,
                                   n_estimators=220, random_state=7419)),
    ],
    voting='hard')


# Fit on the training split and predict the validation split.
voting_classifier_hard.fit(X_train, y_train)
y_pred_vch = voting_classifier_hard.predict(X_val)

In [15]:
evaluate(voting_classifier_hard, X_train, X_val, y_train, y_val)

Training Results: 
Confusion Matrix:
[[1994  790]
 [ 652 3464]]
Accuracy Score:
0.7910
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.73      2784
           1       0.81      0.84      0.83      4116

    accuracy                           0.79      6900
   macro avg       0.78      0.78      0.78      6900
weighted avg       0.79      0.79      0.79      6900

Testing Results: 
Confusion Matrix:
[[ 598  413]
 [ 334 1075]]
Accuracy Score:
0.6913
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.62      1011
           1       0.72      0.76      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [18]:
# Hard-voting (majority rule) ensemble of Gaussian NB, Extra Trees, Gradient
# Boosting, Random Forest and AdaBoost, this time scored on the test split.
# The LightGBM, KNN and MLP candidates are not part of this ensemble.
#
# Three keyword arguments are written in a version-portable form, because newer
# scikit-learn releases no longer accept the old spelling:
#   * ExtraTrees       max_features='auto'  -> 'sqrt' (what 'auto' meant for classifiers)
#   * GradientBoosting loss='deviance'      -> omitted (it was the default loss)
#   * AdaBoost         base_estimator=None  -> omitted (None was the default)
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
        ('extra', ExtraTreesClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight=None,
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7419, verbose=0, warm_start=False)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
        ('ada', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.3,
                                   n_estimators=220, random_state=7419)),
    ],
    voting='hard')


# Fit on the training split and predict the test split.
voting_classifier_hard.fit(X_train, y_train)
y_pred_vch = voting_classifier_hard.predict(X_test)

In [19]:
evaluate(voting_classifier_hard, X_train, X_test, y_train, y_test)

Training Results: 
Confusion Matrix:
[[1994  790]
 [ 652 3464]]
Accuracy Score:
0.7910
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.73      2784
           1       0.81      0.84      0.83      4116

    accuracy                           0.79      6900
   macro avg       0.78      0.78      0.78      6900
weighted avg       0.79      0.79      0.79      6900

Testing Results: 
Confusion Matrix:
[[1079  856]
 [ 645 2003]]
Accuracy Score:
0.6725
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.56      0.59      1935
           1       0.70      0.76      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583



In [10]:
# Hard-voting (majority rule) ensemble of LightGBM, Gradient Boosting and
# Random Forest. The KNN, MLP and Gaussian NB candidates are not part of it.
#
# Two keyword arguments are left out for portability across library versions:
#   * LightGBM         silent='warn'   -> omitted (deprecated, later removed from the wrapper)
#   * GradientBoosting loss='deviance' -> omitted (it was the default loss; newer
#                                         scikit-learn no longer accepts this spelling)
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('lightgbm', LGBMClassifier(
            boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
            importance_type='split', learning_rate=0.1, max_depth=-1,
            min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
            n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
            random_state=8807, reg_alpha=0.0, reg_lambda=0.0,
            subsample=1.0, subsample_for_bin=200000, subsample_freq=0)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
    ],
    voting='hard')


# Fit on the training split and predict the test split.
voting_classifier_hard.fit(X_train, y_train)
y_pred_vch = voting_classifier_hard.predict(X_test)

In [11]:
evaluate(voting_classifier_hard, X_train, X_test, y_train, y_test)

Training Results: 
Confusion Matrix:
[[2454  330]
 [ 406 3710]]
Accuracy Score:
0.8933
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2784
           1       0.92      0.90      0.91      4116

    accuracy                           0.89      6900
   macro avg       0.89      0.89      0.89      6900
weighted avg       0.89      0.89      0.89      6900

Testing Results: 
Confusion Matrix:
[[1045  890]
 [ 623 2025]]
Accuracy Score:
0.6699
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.54      0.58      1935
           1       0.69      0.76      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.67      0.67      0.67      4583



### Extra Trees Classifier Dataset

In [20]:
# Feature subset used by the `*_lasso` matrices; defined once so the three
# splits always select exactly the same columns.
# NOTE(review): the section heading says "Extra Trees" while the variables are
# suffixed `_lasso` - confirm which selector produced this list.
LASSO_FEATURES = ['elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
                  'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
                  'home_elo']

X_train_lasso = train_data[LASSO_FEATURES]
y_train_lasso = y_train

X_val_lasso = valid_data[LASSO_FEATURES]
y_val_lasso = y_val

X_test_lasso = test_data[LASSO_FEATURES]
y_test_lasso = y_test

In [23]:
# Soft-voting ensemble (class chosen from the combined predicted probabilities)
# of LightGBM, Gradient Boosting and Random Forest. The KNN, MLP and
# Gaussian NB candidates are not part of it.
#
# Two keyword arguments are left out for portability across library versions:
#   * LightGBM         silent='warn'   -> omitted (deprecated, later removed from the wrapper)
#   * GradientBoosting loss='deviance' -> omitted (it was the default loss; newer
#                                         scikit-learn no longer accepts this spelling)
voting_classifier_soft = VotingClassifier(
    estimators=[
        ('lightgbm', LGBMClassifier(
            boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
            importance_type='split', learning_rate=0.1, max_depth=-1,
            min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
            n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
            random_state=8807, reg_alpha=0.0, reg_lambda=0.0,
            subsample=1.0, subsample_for_bin=200000, subsample_freq=0)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
    ],
    voting='soft')


# Fit the soft-voting model on the `_lasso` training features and predict the
# matching test features. (`y_pred_vch` keeps its historical name.)
voting_classifier_soft.fit(X_train_lasso, y_train_lasso)
y_pred_vch = voting_classifier_soft.predict(X_test_lasso)

In [24]:
evaluate(voting_classifier_soft, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso)

Training Results: 
Confusion Matrix:
[[2001  783]
 [ 656 3460]]
Accuracy Score:
0.7914
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.72      0.74      2784
           1       0.82      0.84      0.83      4116

    accuracy                           0.79      6900
   macro avg       0.78      0.78      0.78      6900
weighted avg       0.79      0.79      0.79      6900

Testing Results: 
Confusion Matrix:
[[1170  765]
 [ 713 1935]]
Accuracy Score:
0.6775
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.60      0.61      1935
           1       0.72      0.73      0.72      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.67      0.67      4583
weighted avg       0.68      0.68      0.68      4583



In [28]:
# Soft-voting ensemble (class chosen from the combined predicted probabilities)
# of Gaussian NB, Extra Trees, Gradient Boosting, Random Forest and AdaBoost.
# The LightGBM, KNN and MLP candidates are not part of this ensemble.
#
# Three keyword arguments are written in a version-portable form, because newer
# scikit-learn releases no longer accept the old spelling:
#   * ExtraTrees       max_features='auto'  -> 'sqrt' (what 'auto' meant for classifiers)
#   * GradientBoosting loss='deviance'      -> omitted (it was the default loss)
#   * AdaBoost         base_estimator=None  -> omitted (None was the default)
voting_classifier_soft = VotingClassifier(
    estimators=[
        ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
        ('extra', ExtraTreesClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight=None,
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7419, verbose=0, warm_start=False)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
        ('ada', AdaBoostClassifier(algorithm='SAMME', learning_rate=0.3,
                                   n_estimators=220, random_state=7419)),
    ],
    voting='soft')


# Fit the soft-voting model on the `_lasso` training features and predict the
# matching test features. (`y_pred_vch` keeps its historical name.)
voting_classifier_soft.fit(X_train_lasso, y_train_lasso)
y_pred_vch = voting_classifier_soft.predict(X_test_lasso)

In [29]:
evaluate(voting_classifier_soft, X_train_lasso, X_test_lasso, y_train_lasso, y_test_lasso)

Training Results: 
Confusion Matrix:
[[2294  490]
 [ 510 3606]]
Accuracy Score:
0.8551
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82      2784
           1       0.88      0.88      0.88      4116

    accuracy                           0.86      6900
   macro avg       0.85      0.85      0.85      6900
weighted avg       0.86      0.86      0.86      6900

Testing Results: 
Confusion Matrix:
[[1202  733]
 [ 738 1910]]
Accuracy Score:
0.6790
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62      1935
           1       0.72      0.72      0.72      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.67      0.67      4583
weighted avg       0.68      0.68      0.68      4583



### SFS Backwards Dataset

In [32]:
# Feature subset used by the `*_back_sfs` matrices; defined once so the three
# splits always select exactly the same columns.
BACK_SFS_FEATURES = ['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']

X_train_back_sfs = train_data[BACK_SFS_FEATURES]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[BACK_SFS_FEATURES]

y_val_back_sfs = y_val

X_test_back_sfs = test_data[BACK_SFS_FEATURES]

y_test_back_sfs = y_test

In [6]:
# Hard-voting (majority rule) ensemble of LightGBM, Gradient Boosting and
# Random Forest, trained on the `_back_sfs` feature subset. The KNN, MLP and
# Gaussian NB candidates are not part of it.
#
# Two keyword arguments are left out for portability across library versions:
#   * LightGBM         silent='warn'   -> omitted (deprecated, later removed from the wrapper)
#   * GradientBoosting loss='deviance' -> omitted (it was the default loss; newer
#                                         scikit-learn no longer accepts this spelling)
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('lightgbm', LGBMClassifier(
            boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
            importance_type='split', learning_rate=0.1, max_depth=-1,
            min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
            n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
            random_state=8807, reg_alpha=0.0, reg_lambda=0.0,
            subsample=1.0, subsample_for_bin=200000, subsample_freq=0)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
    ],
    voting='hard')


# Fit on the `_back_sfs` training features and predict the matching test features.
voting_classifier_hard.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vch = voting_classifier_hard.predict(X_test_back_sfs)

In [7]:
evaluate(voting_classifier_hard, X_train_back_sfs, X_test_back_sfs, y_train_back_sfs, y_test_back_sfs)

Training Results: 
Confusion Matrix:
[[2316  468]
 [ 533 3583]]
Accuracy Score:
0.8549
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82      2784
           1       0.88      0.87      0.88      4116

    accuracy                           0.85      6900
   macro avg       0.85      0.85      0.85      6900
weighted avg       0.86      0.85      0.86      6900

Testing Results: 
Confusion Matrix:
[[1136  799]
 [ 712 1936]]
Accuracy Score:
0.6703
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.59      0.60      1935
           1       0.71      0.73      0.72      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583



In [39]:
# Hard-voting ensemble: each member casts one class vote per sample and the
# majority label is the prediction.
#
# Parameter spellings that newer scikit-learn releases removed are avoided so this
# cell runs on both old and new versions while building the same models:
#   * ExtraTrees        max_features='auto' -> 'sqrt' (what 'auto' meant for classifiers)
#   * GradientBoosting  loss='deviance'     -> default (same loss, later renamed 'log_loss')
#   * AdaBoost          base_estimator=None -> default (argument later renamed 'estimator')
# The LightGBM, KNN and MLP members that were commented out in this cell stay excluded.
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('nb', GaussianNB(priors=None, var_smoothing=8e-09)),
        ('extra', ExtraTreesClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight=None,
            criterion='gini', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=7419, verbose=0, warm_start=False)),
        ('gbc', GradientBoostingClassifier(
            ccp_alpha=0.0, criterion='friedman_mse', init=None,
            learning_rate=0.1, max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100,
            n_iter_no_change=None,
            random_state=8807, subsample=1.0, tol=0.0001,
            validation_fraction=0.1, verbose=0,
            warm_start=False)),
        ('rf', RandomForestClassifier(
            bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
            criterion='entropy', max_depth=10, max_features='sqrt',
            max_leaf_nodes=None, max_samples=None,
            min_impurity_decrease=0.0002,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=-1,
            oob_score=False, random_state=8807, verbose=0,
            warm_start=False)),
        # algorithm='SAMME' stays explicit: older scikit-learn releases default to
        # 'SAMME.R', so omitting it would change the fitted model there.
        # NOTE(review): recent releases deprecate this argument - drop it once only
        # those versions need to be supported.
        ('ada', AdaBoostClassifier(
            algorithm='SAMME', learning_rate=0.3,
            n_estimators=220, random_state=7419)),
    ],
    voting='hard')

# Fit on the training split and keep the predicted labels for the test split.
voting_classifier_hard.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vch = voting_classifier_hard.predict(X_test_back_sfs)

In [40]:
# Report confusion matrix, accuracy and classification report for the
# training split and the test split of the refitted five-member ensemble.
evaluate(
    voting_classifier_hard,
    X_train_back_sfs,
    X_test_back_sfs,
    y_train_back_sfs,
    y_test_back_sfs,
)

Training Results: 
Confusion Matrix:
[[1951  833]
 [ 709 3407]]
Accuracy Score:
0.7765
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.70      0.72      2784
           1       0.80      0.83      0.82      4116

    accuracy                           0.78      6900
   macro avg       0.77      0.76      0.77      6900
weighted avg       0.78      0.78      0.78      6900

Testing Results: 
Confusion Matrix:
[[1139  796]
 [ 703 1945]]
Accuracy Score:
0.6729
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.59      0.60      1935
           1       0.71      0.73      0.72      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583

