In [2]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [3]:
warnings.filterwarnings('ignore')


In [5]:
# Read the CSV and keep only the rows that have no missing value in any column.
DATA_PATH = '../../FeatureEngineering/MetaData/data6_&_odds.csv'
df = pd.read_csv(DATA_PATH).dropna()

In [6]:
# Season-based split: 2007 <= season < 2016 is used for training,
# season >= 2016 is held out for testing.
TARGET = 'home_team_wins'
in_train_seasons = (df.season >= 2007) & (df.season < 2016)
in_test_seasons = df.season >= 2016

train_data = df.loc[in_train_seasons]
test_data = df.loc[in_test_seasons]

# Full frames minus the target only (every other column is kept).
X, y = train_data.drop(columns=[TARGET]), train_data[TARGET]
test_X, test_y = test_data.drop(columns=[TARGET]), test_data[TARGET]

# Model inputs: the columns listed here (target included) are removed.
NON_FEATURE_COLUMNS = [
    "game_date_est",
    "season",
    "game_id",
    "home_team",
    "visitor_team",
    "home_team_id",
    "visitor_team_id",
    "home_team_wins",
    "conference",
    "conference_visitor",
]
X_train, y_train = train_data.drop(columns=NON_FEATURE_COLUMNS), train_data[TARGET]
X_test, y_test = test_data.drop(columns=NON_FEATURE_COLUMNS), test_data[TARGET]


In [7]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print confusion matrix, accuracy and classification report for a fitted model.

    The model predicts on both splits; the metrics for the training split are
    printed first, followed by those for the test split. Nothing is returned.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices handed to ``model.predict``.
    y_train, y_test : true labels the predictions are compared against.
    """
    # Predict on the test split first, then on the training split.
    test_predictions = model.predict(X_test)
    train_predictions = model.predict(X_train)

    sections = (
        ("Training Results: \n===============================", y_train, train_predictions),
        ("Testing Results: \n===============================", y_test, test_predictions),
    )
    for banner, truth, predicted in sections:
        print(banner)
        report = classification_report(truth, predicted)
        print(f"Confusion Matrix:\n{confusion_matrix(truth, predicted)}")
        print(f"Accuracy Score:\n{accuracy_score(truth, predicted):.4f}")
        print(f"Classification Report:\n{report}")

### Create a Voting Classifier with our models

#### Lasso Dataset

In [13]:
# Columns used for the "Lasso" dataset; the same list is applied to both splits.
LASSO_FEATURES = [
    'elo_diff', 'odds_away', 'odds_home', 'eff_diff', 'eff_visitor',
    'missing_players', 'top_players', 'FT_PCT_home_7g', 'diff_avg_reb_away',
    'home_elo',
]

X_train_lasso = train_data[LASSO_FEATURES]
X_test_lasso = test_data[LASSO_FEATURES]

# Targets are the ones produced by the season split.
y_train_lasso, y_test_lasso = y_train, y_test

In [14]:
# Hard-voting ensemble (majority vote over the members' predicted labels)
# fitted on the Lasso feature subset.
# NOTE: six voters can split 3-3; scikit-learn resolves such ties by picking
# the class that comes first in sorted order (label 0 here).
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('svm', SVC(random_state=42, C=100, gamma=0.001, kernel='rbf')),
        # random_state is set on every stochastic member so a re-run reproduces the fit.
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3,
                                  n_estimators=100, random_state=42)),
        ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs',
                              random_state=42)),
        # 'sqrt' is what the 'auto' alias meant for classifiers; 'auto' was
        # deprecated in scikit-learn 1.1 and removed in 1.3.
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=4,
                                      min_samples_split=8, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
    ],
    voting='hard',
)

# Fit on the training seasons, then predict the held-out seasons.
voting_classifier_hard.fit(X_train_lasso, y_train_lasso)
y_pred_vch = voting_classifier_hard.predict(X_test_lasso)

In [15]:
# Report train/test metrics for the hard-voting ensemble on the Lasso features.
evaluate(
    model=voting_classifier_hard,
    X_train=X_train_lasso,
    X_test=X_test_lasso,
    y_train=y_train_lasso,
    y_test=y_test_lasso,
)

Training Results: 
Confusion Matrix:
[[1928  856]
 [ 770 3346]]
Accuracy Score:
0.7643
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.69      0.70      2784
           1       0.80      0.81      0.80      4116

    accuracy                           0.76      6900
   macro avg       0.76      0.75      0.75      6900
weighted avg       0.76      0.76      0.76      6900

Testing Results: 
Confusion Matrix:
[[1160  775]
 [ 695 1953]]
Accuracy Score:
0.6792
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.60      0.61      1935
           1       0.72      0.74      0.73      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.67      0.67      4583
weighted avg       0.68      0.68      0.68      4583



In [16]:
# Soft-voting ensemble (argmax of the averaged class probabilities of the
# members) fitted on the Lasso feature subset.
voting_classifier_soft = VotingClassifier(
    estimators=[
        # probability=True makes SVC expose predict_proba, which soft voting requires.
        ('svm', SVC(probability=True, random_state=42, C=100, gamma=0.001, kernel='rbf')),
        # random_state is set on every stochastic member so a re-run reproduces the fit.
        ('xgb', xgb.XGBClassifier(gamma=0.2, learning_rate=0.1, max_depth=3, min_child_weight=3,
                                  n_estimators=100, random_state=42)),
        ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs',
                              random_state=42)),
        # 'sqrt' is what the 'auto' alias meant for classifiers; 'auto' was
        # deprecated in scikit-learn 1.1 and removed in 1.3.
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=11, max_features='sqrt', min_samples_leaf=4,
                                      min_samples_split=8, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
    ],
    voting='soft',
)

# Fit on the training seasons, then predict the held-out seasons.
voting_classifier_soft.fit(X_train_lasso, y_train_lasso)
y_pred_vcs = voting_classifier_soft.predict(X_test_lasso)
# Legacy name: this cell has always stored its predictions in `y_pred_vch`
# (overwriting the hard-voting ones); the binding is kept so existing code still works.
y_pred_vch = y_pred_vcs

In [17]:
# Report train/test metrics for the soft-voting ensemble on the Lasso features.
evaluate(
    model=voting_classifier_soft,
    X_train=X_train_lasso,
    X_test=X_test_lasso,
    y_train=y_train_lasso,
    y_test=y_test_lasso,
)

Training Results: 
Confusion Matrix:
[[1772 1012]
 [ 574 3542]]
Accuracy Score:
0.7701
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.64      0.69      2784
           1       0.78      0.86      0.82      4116

    accuracy                           0.77      6900
   macro avg       0.77      0.75      0.75      6900
weighted avg       0.77      0.77      0.77      6900

Testing Results: 
Confusion Matrix:
[[1030  905]
 [ 577 2071]]
Accuracy Score:
0.6766
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.53      0.58      1935
           1       0.70      0.78      0.74      2648

    accuracy                           0.68      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.68      0.67      4583



#### SFS Backwards

In [18]:
# Columns used for the "SFS Backwards" dataset; the same list is applied to
# both the training and the test frame.
BACK_SFS_FEATURES = [
    'num_possible_outcomes', 'odds_home', 'odds_away', 'HOME_RECORD_home',
    'W_PCT_away', 'W_PCT_prev_away', 'HOME_RECORD_prev_away',
    'ROAD_RECORD_prev_away', 'FT_PCT_home_3g', 'FG3_PCT_home_3g', 'PTS_away_3g',
    'FG_PCT_away_3g', 'FT_PCT_away_3g', 'FG3_PCT_away_3g', 'PTS_home_7g',
    'FG_PCT_home_7g', 'AST_home_7g', 'AST_away_7g', 'REB_away_7g',
    'diff_avg_pts_away', 'diff_avg_ast_home', 'diff_avg_ast_away',
    'diff_avg_fg3_pct_home', 'top_players', 'top_players_visitor', 'eff_visitor',
    'G_7days', 'back2back', 'HG_7days_VISITOR', 'AG_7days_VISITOR',
    'G_7days_VISITOR', 'back2back_visitor', 'home_elo', 'elo_diff',
    'missing_player_diff', 'eff_diff', 'Home_Last_5_Avg_AST_home',
    'Home_Last_5_Avg_REB_home', 'Home_Last_5_Avg_REB_away',
    'Home_Last_5_Avg_FG3_PCT_away', 'Away_Last_5_Avg_PTS_home',
    'Away_Last_5_Avg_FG3_PCT_home', 'Away_Last_5_Avg_AST_home',
    'Away_Last_5_Avg_FT_PCT_away', 'diff_fg3_pct_last_3_games',
    'diff_fg3_pct_last_7_games', 'diff_ft_pct_last_3_games',
    'diff_ast_last_7_games', 'diff_reb_last_3_games',
    'diff_win_pct_3_last_games',
]

X_train_back_sfs = train_data[BACK_SFS_FEATURES]
X_test_back_sfs = test_data[BACK_SFS_FEATURES]

# Targets are the ones produced by the season split.
y_train_back_sfs, y_test_back_sfs = y_train, y_test

In [23]:
# Hard-voting ensemble (majority vote over the members' predicted labels)
# fitted on the backward-SFS feature subset.
# NOTE: six voters can split 3-3; scikit-learn resolves such ties by picking
# the class that comes first in sorted order (label 0 here).
voting_classifier_hard = VotingClassifier(
    estimators=[
        ('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        # random_state is set on every stochastic member so a re-run reproduces the fit.
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1,
                                  n_estimators=100, random_state=42)),
        ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs',
                              random_state=42)),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5,
                                      min_samples_split=12, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
    ],
    voting='hard',
)

# Fit on the training seasons, then predict the held-out seasons.
voting_classifier_hard.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vch = voting_classifier_hard.predict(X_test_back_sfs)

In [24]:
# Report train/test metrics for the hard-voting ensemble on the backward-SFS features.
evaluate(
    model=voting_classifier_hard,
    X_train=X_train_back_sfs,
    X_test=X_test_back_sfs,
    y_train=y_train_back_sfs,
    y_test=y_test_back_sfs,
)

Training Results: 
Confusion Matrix:
[[1762 1022]
 [ 809 3307]]
Accuracy Score:
0.7346
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66      2784
           1       0.76      0.80      0.78      4116

    accuracy                           0.73      6900
   macro avg       0.72      0.72      0.72      6900
weighted avg       0.73      0.73      0.73      6900

Testing Results: 
Confusion Matrix:
[[1084  851]
 [ 683 1965]]
Accuracy Score:
0.6653
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.56      0.59      1935
           1       0.70      0.74      0.72      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.66      0.67      0.66      4583



In [21]:
# Soft-voting ensemble (argmax of the averaged class probabilities of the
# members) fitted on the backward-SFS feature subset.
voting_classifier_soft = VotingClassifier(
    estimators=[
        # probability=True makes SVC expose predict_proba, which soft voting requires.
        ('svm', SVC(probability=True, random_state=42, C=1, gamma=0.0001, kernel='rbf')),
        # random_state is set on every stochastic member so a re-run reproduces the fit.
        ('xgb', xgb.XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=1,
                                  n_estimators=100, random_state=42)),
        ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=11)),
        ('mlp', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs',
                              random_state=42)),
        ('rf', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=5,
                                      min_samples_split=12, n_estimators=100, random_state=42)),
        ('gnb', GaussianNB()),
    ],
    voting='soft',
)

# Fit on the training seasons, then predict the held-out seasons.
voting_classifier_soft.fit(X_train_back_sfs, y_train_back_sfs)
y_pred_vcs = voting_classifier_soft.predict(X_test_back_sfs)
# Legacy name: this cell has always stored its predictions in `y_pred_vch`
# (the name the hard-voting cells also use); the binding is kept so existing code still works.
y_pred_vch = y_pred_vcs

In [22]:
# Report train/test metrics for the soft-voting ensemble on the backward-SFS features.
evaluate(
    model=voting_classifier_soft,
    X_train=X_train_back_sfs,
    X_test=X_test_back_sfs,
    y_train=y_train_back_sfs,
    y_test=y_test_back_sfs,
)

Training Results: 
Confusion Matrix:
[[1713 1071]
 [ 867 3249]]
Accuracy Score:
0.7191
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      2784
           1       0.75      0.79      0.77      4116

    accuracy                           0.72      6900
   macro avg       0.71      0.70      0.70      6900
weighted avg       0.72      0.72      0.72      6900

Testing Results: 
Confusion Matrix:
[[1073  862]
 [ 674 1974]]
Accuracy Score:
0.6648
Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.55      0.58      1935
           1       0.70      0.75      0.72      2648

    accuracy                           0.66      4583
   macro avg       0.66      0.65      0.65      4583
weighted avg       0.66      0.66      0.66      4583

