In [1]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [2]:
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [4]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins


In [5]:
def evaluate(model, X_train, X_test, y_train, y_test):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print("Training Results: \n===============================")
    clf_report = classification_report(y_train, y_train_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_train, y_train_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_train, y_train_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

    print("Testing Results: \n===============================")
    clf_report = classification_report(y_test, y_test_pred)
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"Accuracy Score:\n{accuracy_score(y_test, y_test_pred):.4f}")
    print(f"Classification Report:\n{clf_report}")

### Create a Voting Classifier with our models

#### Make use of basic models with GridSearchCV method

In [9]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train, y_train)
y_pred_vch = voting_classifier_hard.predict(X_val)

In [10]:
evaluate(voting_classifier_hard, X_train, X_val, y_train, y_val)

Training Results: 
Confusion Matrix:
[[1844  940]
 [ 601 3515]]
Accuracy Score:
0.7767
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.71      2784
           1       0.79      0.85      0.82      4116

    accuracy                           0.78      6900
   macro avg       0.77      0.76      0.76      6900
weighted avg       0.77      0.78      0.77      6900

Testing Results: 
Confusion Matrix:
[[ 576  435]
 [ 306 1103]]
Accuracy Score:
0.6938
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.57      0.61      1011
           1       0.72      0.78      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [11]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


In [12]:
# make predictions with the soft voting model
voting_classifier_soft.fit(X_train, y_train)
y_pred_vcs = voting_classifier_soft.predict(X_val)


In [13]:
evaluate(voting_classifier_soft, X_train, X_val, y_train, y_val)

Training Results: 
Confusion Matrix:
[[2047  737]
 [1290 2826]]
Accuracy Score:
0.7062
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.74      0.67      2784
           1       0.79      0.69      0.74      4116

    accuracy                           0.71      6900
   macro avg       0.70      0.71      0.70      6900
weighted avg       0.72      0.71      0.71      6900

Testing Results: 
Confusion Matrix:
[[693 318]
 [447 962]]
Accuracy Score:
0.6839
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.69      0.64      1011
           1       0.75      0.68      0.72      1409

    accuracy                           0.68      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.68      0.69      2420



#### Univariate Feature Selection Dataset

In [14]:
X_train_uni = train_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_train_uni = y_train

X_val_uni = valid_data[['diff_curr_win_pct','diff_curr_away_record','odds_home','odds_away','elo_diff']]
y_val_uni = y_val

In [15]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_uni, y_train_uni)
y_pred_vch = voting_classifier_hard.predict(X_val_uni)

In [16]:
evaluate(voting_classifier_hard, X_train_uni, X_val_uni, y_train_uni, y_val_uni)

Training Results: 
Confusion Matrix:
[[1621 1163]
 [ 872 3244]]
Accuracy Score:
0.7051
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      2784
           1       0.74      0.79      0.76      4116

    accuracy                           0.71      6900
   macro avg       0.69      0.69      0.69      6900
weighted avg       0.70      0.71      0.70      6900

Testing Results: 
Confusion Matrix:
[[ 600  411]
 [ 342 1067]]
Accuracy Score:
0.6888
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.61      1011
           1       0.72      0.76      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [24]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.3, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_uni, y_train_uni)
y_pred_vch = voting_classifier_soft.predict(X_val_uni)

In [25]:
evaluate(voting_classifier_soft, X_train_uni, X_val_uni, y_train_uni, y_val_uni)

Training Results: 
Confusion Matrix:
[[1616 1168]
 [ 874 3242]]
Accuracy Score:
0.7041
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      2784
           1       0.74      0.79      0.76      4116

    accuracy                           0.70      6900
   macro avg       0.69      0.68      0.69      6900
weighted avg       0.70      0.70      0.70      6900

Testing Results: 
Confusion Matrix:
[[ 583  428]
 [ 318 1091]]
Accuracy Score:
0.6917
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61      1011
           1       0.72      0.77      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



#### ExtraTreesClassifier Dataset

In [21]:
X_train_extra = train_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_train_extra = y_train

X_val_extra = valid_data[['odds_home', 'odds_away', 'home_elo', 'visitor_elo', 'elo_diff',
                    'eff_diff', 'eff_visitor', 'top_player_diff', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'ROAD_RECORD_home',
                    'diff_road_record_last_season', 'diff_win_pct_7_last_games', 'W_PCT_home',
                    'W_PCT_away', 'W_PCT_prev_away', 'diff_curr_away_record', 'HOME_RECORD_home', 'diff_curr_home_record',
                    'diff_curr_win_pct']]

y_val_extra = y_val

In [22]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_extra, y_train_extra)
y_pred_vch = voting_classifier_hard.predict(X_val_extra)

In [23]:
evaluate(voting_classifier_hard, X_train_extra, X_val_extra, y_train_extra, y_val_extra)

Training Results: 
Confusion Matrix:
[[1711 1073]
 [ 727 3389]]
Accuracy Score:
0.7391
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.61      0.66      2784
           1       0.76      0.82      0.79      4116

    accuracy                           0.74      6900
   macro avg       0.73      0.72      0.72      6900
weighted avg       0.74      0.74      0.74      6900

Testing Results: 
Confusion Matrix:
[[ 595  416]
 [ 317 1092]]
Accuracy Score:
0.6971
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62      1011
           1       0.72      0.78      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420



In [26]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True,random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=100, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='log2', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_extra, y_train_extra)
y_pred_vch = voting_classifier_soft.predict(X_val_extra)

In [27]:
evaluate(voting_classifier_soft, X_train_extra, X_val_extra, y_train_extra, y_val_extra)

Training Results: 
Confusion Matrix:
[[1715 1069]
 [ 875 3241]]
Accuracy Score:
0.7183
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      2784
           1       0.75      0.79      0.77      4116

    accuracy                           0.72      6900
   macro avg       0.71      0.70      0.70      6900
weighted avg       0.72      0.72      0.72      6900

Testing Results: 
Confusion Matrix:
[[ 611  400]
 [ 333 1076]]
Accuracy Score:
0.6971
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.60      0.63      1011
           1       0.73      0.76      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.69      2420
weighted avg       0.69      0.70      0.70      2420



#### RFECV Dataset

In [28]:
X_train_rcv = train_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_train_rcv = y_train

X_val_rcv = valid_data[['missing_player_diff','odds_home','odds_away','visitor_elo']]
y_val_rcv = y_val

In [29]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=5, min_samples_split=10, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_rcv, y_train_rcv)
y_pred_vch = voting_classifier_hard.predict(X_val_rcv)

In [30]:
evaluate(voting_classifier_hard, X_train_rcv, X_val_rcv, y_train_rcv, y_val_extra)

Training Results: 
Confusion Matrix:
[[1537 1247]
 [ 732 3384]]
Accuracy Score:
0.7132
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.55      0.61      2784
           1       0.73      0.82      0.77      4116

    accuracy                           0.71      6900
   macro avg       0.70      0.69      0.69      6900
weighted avg       0.71      0.71      0.71      6900

Testing Results: 
Confusion Matrix:
[[ 561  450]
 [ 290 1119]]
Accuracy Score:
0.6942
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.55      0.60      1011
           1       0.71      0.79      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [31]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1000, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=5, min_samples_split=10, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_rcv, y_train_rcv)
y_pred_vch = voting_classifier_soft.predict(X_val_rcv)

In [32]:
evaluate(voting_classifier_soft, X_train_rcv, X_val_rcv, y_train_rcv, y_val_rcv)

Training Results: 
Confusion Matrix:
[[1453 1331]
 [ 646 3470]]
Accuracy Score:
0.7135
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.52      0.60      2784
           1       0.72      0.84      0.78      4116

    accuracy                           0.71      6900
   macro avg       0.71      0.68      0.69      6900
weighted avg       0.71      0.71      0.70      6900

Testing Results: 
Confusion Matrix:
[[ 526  485]
 [ 262 1147]]
Accuracy Score:
0.6913
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.52      0.58      1011
           1       0.70      0.81      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.67      0.67      2420
weighted avg       0.69      0.69      0.68      2420



#### PCA

In [33]:
n_components = 30
pca = PCA(n_components=n_components).fit(X_train)
X_train_pca = pca.transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

In [34]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=10, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='sgd')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_pca, y_train)
y_pred_vch = voting_classifier_hard.predict(X_val_pca)

In [35]:
evaluate(voting_classifier_hard, X_train_pca, X_val_pca, y_train, y_val)

Training Results: 
Confusion Matrix:
[[1898  886]
 [ 654 3462]]
Accuracy Score:
0.7768
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      2784
           1       0.80      0.84      0.82      4116

    accuracy                           0.78      6900
   macro avg       0.77      0.76      0.76      6900
weighted avg       0.78      0.78      0.78      6900

Testing Results: 
Confusion Matrix:
[[ 576  435]
 [ 302 1107]]
Accuracy Score:
0.6955
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.57      0.61      1011
           1       0.72      0.79      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420



In [36]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=10, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='sgd')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_pca, y_train)
y_pred_vch = voting_classifier_soft.predict(X_val_pca)

In [37]:
evaluate(voting_classifier_soft, X_train_pca, X_val_pca, y_train, y_val)

Training Results: 
Confusion Matrix:
[[1788  996]
 [ 482 3634]]
Accuracy Score:
0.7858
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.64      0.71      2784
           1       0.78      0.88      0.83      4116

    accuracy                           0.79      6900
   macro avg       0.79      0.76      0.77      6900
weighted avg       0.79      0.79      0.78      6900

Testing Results: 
Confusion Matrix:
[[ 523  488]
 [ 276 1133]]
Accuracy Score:
0.6843
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.52      0.58      1011
           1       0.70      0.80      0.75      1409

    accuracy                           0.68      2420
   macro avg       0.68      0.66      0.66      2420
weighted avg       0.68      0.68      0.68      2420



#### Lasso Dataset

In [6]:
X_train_lasso = train_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_train_lasso = y_train

X_val_lasso = valid_data[['odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
              'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
              'home_elo']]
y_val_lasso = y_val

In [7]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_lasso, y_train_lasso)
y_pred_vch = voting_classifier_hard.predict(X_val_lasso)

In [8]:
evaluate(voting_classifier_hard, X_train_lasso, X_val_lasso, y_train_lasso, y_val_lasso)

Training Results: 
Confusion Matrix:
[[1733 1051]
 [ 766 3350]]
Accuracy Score:
0.7367
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.62      0.66      2784
           1       0.76      0.81      0.79      4116

    accuracy                           0.74      6900
   macro avg       0.73      0.72      0.72      6900
weighted avg       0.73      0.74      0.73      6900

Testing Results: 
Confusion Matrix:
[[ 600  411]
 [ 332 1077]]
Accuracy Score:
0.6930
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.62      1011
           1       0.72      0.76      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [10]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True,C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_lasso, y_train_lasso)
y_pred_vch = voting_classifier_soft.predict(X_val_lasso)

In [11]:
evaluate(voting_classifier_soft, X_train_lasso, X_val_lasso, y_train_lasso, y_val_lasso)

Training Results: 
Confusion Matrix:
[[1571 1213]
 [ 699 3417]]
Accuracy Score:
0.7229
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.56      0.62      2784
           1       0.74      0.83      0.78      4116

    accuracy                           0.72      6900
   macro avg       0.72      0.70      0.70      6900
weighted avg       0.72      0.72      0.72      6900

Testing Results: 
Confusion Matrix:
[[ 553  458]
 [ 288 1121]]
Accuracy Score:
0.6917
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.55      0.60      1011
           1       0.71      0.80      0.75      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.67      0.67      2420
weighted avg       0.69      0.69      0.69      2420



### Choose less models for the voting procedure, add weights to each model

In [23]:
# create a voting classifier with hard voting
second_voting_classifier_hard = VotingClassifier(
    estimators = [
            # ('svm', SVC(C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=72)),
                  # ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='auto', min_samples_leaf=4, min_samples_split=8, n_estimators=100)),
                  # ('gnb', GaussianNB())
                  ],
    voting='hard', weights=[2,1,1])


# make predictions with the hard voting model
second_voting_classifier_hard.fit(X_train_lasso, y_train_lasso)
y_pred_vch = second_voting_classifier_hard.predict(X_val_lasso)

In [24]:
evaluate(second_voting_classifier_hard, X_train_lasso, X_val_lasso, y_train_lasso, y_val_lasso)

Training Results: 
Confusion Matrix:
[[1752 1032]
 [ 778 3338]]
Accuracy Score:
0.7377
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66      2784
           1       0.76      0.81      0.79      4116

    accuracy                           0.74      6900
   macro avg       0.73      0.72      0.72      6900
weighted avg       0.74      0.74      0.74      6900

Testing Results: 
Confusion Matrix:
[[ 601  410]
 [ 319 1090]]
Accuracy Score:
0.6988
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.59      0.62      1011
           1       0.73      0.77      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.69      2420
weighted avg       0.70      0.70      0.70      2420



#### SFS Forward Dataset

In [11]:
X_train_for_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_train_for_sfs = y_train

X_val_for_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                    'W_PCT_away', 'W_PCT_prev_home', 'ROAD_RECORD_prev_home', 'W_PCT_prev_away',
                    'HOME_RECORD_prev_away', 'ROAD_RECORD_prev_away', 'WIN_PRCT_home_3g',
                    'FT_PCT_home_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'WIN_PRCT_home_7g',
                    'FT_PCT_away_7g', 'REB_away_7g', 'diff_avg_ast_home', 'diff_avg_ast_away',
                    'diff_avg_fg3_pct_home', 'diff_avg_fg_pct_away', 'diff_avg_reb_away',
                    'top_players', 'eff', 'eff_visitor', 'G_7days', 'back2back',
                    'HG_7days_VISITOR', 'AG_7days_VISITOR', 'G_7days_VISITOR',
                    'back2back_visitor', 'missing_players', 'missing_players_visitor',
                    'home_elo', 'elo_diff', 'missing_player_diff', 'eff_diff',
                    'Home_Last_5_Avg_FG3_PCT_home', 'Home_Last_5_Avg_FG3_PCT_away',
                    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_FT_PCT_away',
                    'diff_fg_pct_last_3_games', 'diff_fg3_pct_last_7_games',
                    'diff_ft_pct_last_3_games', 'diff_ast_last_3_games',
                    'diff_ast_last_7_games', 'diff_win_pct_prev_season',
                    'diff_home_record_last_season', 'diff_road_record_last_season',
                    'diff_curr_win_pct']]

y_val_for_sfs = y_val


In [12]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_for_sfs, y_train_for_sfs)
y_pred_vch = voting_classifier_hard.predict(X_val_for_sfs)

In [13]:
evaluate(voting_classifier_hard, X_train_for_sfs, X_val_for_sfs, y_train_for_sfs, y_val_for_sfs)

Training Results: 
Confusion Matrix:
[[1820  964]
 [ 810 3306]]
Accuracy Score:
0.7429
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.65      0.67      2784
           1       0.77      0.80      0.79      4116

    accuracy                           0.74      6900
   macro avg       0.73      0.73      0.73      6900
weighted avg       0.74      0.74      0.74      6900

Testing Results: 
Confusion Matrix:
[[ 598  413]
 [ 331 1078]]
Accuracy Score:
0.6926
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.59      0.62      1011
           1       0.72      0.77      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.68      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



In [14]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=200, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_for_sfs, y_train_for_sfs)
y_pred_vch = voting_classifier_soft.predict(X_val_for_sfs)

In [15]:
evaluate(voting_classifier_soft, X_train_for_sfs, X_val_for_sfs, y_train_for_sfs, y_val_for_sfs)

Training Results: 
Confusion Matrix:
[[1770 1014]
 [ 875 3241]]
Accuracy Score:
0.7262
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.64      0.65      2784
           1       0.76      0.79      0.77      4116

    accuracy                           0.73      6900
   macro avg       0.72      0.71      0.71      6900
weighted avg       0.72      0.73      0.73      6900

Testing Results: 
Confusion Matrix:
[[ 604  407]
 [ 332 1077]]
Accuracy Score:
0.6946
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.60      0.62      1011
           1       0.73      0.76      0.74      1409

    accuracy                           0.69      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.69      0.69      2420



#### SFS Backwards Dataset

In [16]:
X_train_back_sfs = train_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_train_back_sfs = y_train

X_val_back_sfs = valid_data[['num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
                     'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
                     'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
                     'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
                     'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
                     'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
                     'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
                     'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
                     'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
                     'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
                     'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
                     'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
                     'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
                     'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
                     'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
                     'diff_ast_last_7_games', 'diff_reb_last_3_games',
                     'diff_win_pct_3_last_games']]

y_val_back_sfs = y_val

In [17]:
# create a voting classifier with hard voting
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# make predictions with the hard voting model
voting_classifier_hard.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vch = voting_classifier_hard.predict(X_val_back_sfs)

In [18]:
evaluate(voting_classifier_hard, X_train_back_sfs, X_val_back_sfs, y_train_back_sfs, y_val_back_sfs)

Training Results: 
Confusion Matrix:
[[1752 1032]
 [ 800 3316]]
Accuracy Score:
0.7345
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66      2784
           1       0.76      0.81      0.78      4116

    accuracy                           0.73      6900
   macro avg       0.72      0.72      0.72      6900
weighted avg       0.73      0.73      0.73      6900

Testing Results: 
Confusion Matrix:
[[ 614  397]
 [ 332 1077]]
Accuracy Score:
0.6988
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      1011
           1       0.73      0.76      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.69      0.69      2420
weighted avg       0.70      0.70      0.70      2420



In [19]:
# create a voting classifier with soft voting
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
                  ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs')),
                  ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


# make predictions with the soft voting model
voting_classifier_soft.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vch = voting_classifier_soft.predict(X_val_back_sfs)

In [20]:
evaluate(voting_classifier_soft, X_train_back_sfs, X_val_back_sfs, y_train_back_sfs, y_val_back_sfs)

Training Results: 
Confusion Matrix:
[[1699 1085]
 [ 862 3254]]
Accuracy Score:
0.7178
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.61      0.64      2784
           1       0.75      0.79      0.77      4116

    accuracy                           0.72      6900
   macro avg       0.71      0.70      0.70      6900
weighted avg       0.72      0.72      0.72      6900

Testing Results: 
Confusion Matrix:
[[ 619  392]
 [ 333 1076]]
Accuracy Score:
0.7004
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.61      0.63      1011
           1       0.73      0.76      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.69      0.69      2420
weighted avg       0.70      0.70      0.70      2420

