In [15]:
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import evaluate

import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

import warnings

In [16]:
# Install a global 'ignore' filter with no category/message/module restriction,
# so every warning raised after this cell runs is silenced -- including any
# diagnostics emitted by the libraries used below.
warnings.filterwarnings('ignore')


In [17]:
# Read the feature/odds table and keep only rows without any missing value.
# Chaining .dropna() yields the same frame as the in-place variant while
# keeping the cell a single expression.
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv').dropna()

In [18]:
# Name of the label column.
_TARGET = 'home_team_wins'

# Columns removed before the frames are handed to the estimators: the label
# itself plus what look like date / id / team / conference fields
# (NOTE(review): judged from the column names only -- confirm against the CSV).
_NON_FEATURE_COLS = [
    "game_date_est", "season", "game_id", "home_team", "visitor_team",
    "home_team_id", "visitor_team_id", "home_team_wins", "conference",
    "conference_visitor",
]

# Season-based split. Rows with season < 2007 end up in none of the subsets.
_season = df.season
train_data = df.loc[(_season >= 2007) & (_season <= 2013)]   # 2007..2013 inclusive
valid_data = df.loc[(_season > 2013) & (_season < 2016)]     # after 2013, before 2016
test_data = df.loc[_season >= 2016]                          # 2016 onwards
full_train_data = pd.concat([train_data, valid_data], axis=0)

# Frames that drop only the label and keep every other column.
X = train_data.drop(columns=[_TARGET])
y = train_data[_TARGET]
valid_X = valid_data.drop(columns=[_TARGET])
valid_y = valid_data[_TARGET]
test_X = test_data.drop(columns=[_TARGET])
test_y = test_data[_TARGET]

# Split our data: model-ready matrices with all non-feature columns removed.
X_train = train_data.drop(columns=_NON_FEATURE_COLS)
y_train = train_data[_TARGET]
X_val = valid_data.drop(columns=_NON_FEATURE_COLS)
y_val = valid_data[_TARGET]
X_test = test_data.drop(columns=_NON_FEATURE_COLS)
y_test = test_data[_TARGET]



In [25]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """Print confusion matrix, accuracy and classification report for two splits.

    Note: defining this function rebinds the name ``evaluate``, which the
    notebook's import cell also binds via ``import evaluate``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and true labels reported under
        "Training Results".
    X_test, y_test : features and true labels reported under
        "Testing Results". The label is fixed, so it is printed for whatever
        split is passed here (the notebook passes the validation split).

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Same prediction order as before: held-out split first, then training.
    held_out_pred = model.predict(X_test)
    fitted_pred = model.predict(X_train)

    sections = (
        ("Training Results: \n===============================", y_train, fitted_pred),
        ("Testing Results: \n===============================", y_test, held_out_pred),
    )
    for banner, y_true, y_hat in sections:
        print(banner)
        # The report is built as a dict and wrapped in a DataFrame so it prints as a table.
        report_df = pd.DataFrame(classification_report(y_true, y_hat, output_dict=True))
        print(f"Confusion Matrix:\n{confusion_matrix(y_true, y_hat)}")
        print(f"Accuracy Score:\n{accuracy_score(y_true, y_hat):.4f}")
        print(f"Classification Report:\n{report_df}")

### Create a Voting Classifier with our models

#### Make use of basic models with GridSearchCV method

In [23]:
# Create a voting classifier with hard voting: each base estimator casts one
# vote (its predicted label) and the most frequent label wins.
# NOTE(review): with six voters a 3-3 tie is possible; per the scikit-learn
# docs ties are resolved by ascending class-label order -- confirm this is
# acceptable.
#
# Every estimator that draws random numbers (SVC, XGBoost, MLP weight
# initialisation, random-forest bootstrapping) is given the same fixed seed so
# that a fresh "Restart & Run All" reproduces the reported scores. Previously
# only SVC was seeded.
voting_classifier_hard = VotingClassifier(
    estimators = [('svm', SVC(random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(random_state=42, gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(random_state=42, alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(random_state=42, bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='hard')


# Fit on the training split and predict the validation split.
voting_classifier_hard.fit(X_train, y_train)
y_pred_vch = voting_classifier_hard.predict(X_val)




In [26]:
# Report on the training split and on the validation split; the latter is
# passed through the function's X_test / y_test parameters.
evaluate(
    model=voting_classifier_hard,
    X_train=X_train,
    X_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

Training Results: 
Confusion Matrix:
[[1844  940]
 [ 596 3520]]
Accuracy Score:
0.7774
Classification Report:
                     0            1  accuracy    macro avg  weighted avg
precision     0.755738     0.789238  0.777391     0.772488      0.775721
recall        0.662356     0.855199  0.777391     0.758778      0.777391
f1-score      0.705972     0.820896  0.777391     0.763434      0.774527
support    2784.000000  4116.000000  0.777391  6900.000000   6900.000000
Testing Results: 
Confusion Matrix:
[[ 574  437]
 [ 309 1100]]
Accuracy Score:
0.6917
Classification Report:
                     0            1  accuracy    macro avg  weighted avg
precision     0.650057     0.715680  0.691736     0.682868      0.688265
recall        0.567755     0.780696  0.691736     0.674225      0.691736
f1-score      0.606125     0.746775  0.691736     0.676450      0.688016
support    1011.000000  1409.000000  0.691736  2420.000000   2420.000000


In [30]:
# Create a voting classifier with soft voting: the predicted class
# probabilities of the base estimators are averaged and the class with the
# highest mean probability wins. SVC is built with probability=True because
# soft voting needs predict_proba from every estimator.
#
# Every estimator that draws random numbers (SVC, XGBoost, MLP weight
# initialisation, random-forest bootstrapping) is given the same fixed seed so
# that a fresh "Restart & Run All" reproduces the reported scores. Previously
# only SVC was seeded.
voting_classifier_soft = VotingClassifier(
    estimators = [('svm', SVC(probability=True, random_state=42, C=1, gamma=0.0001, kernel='rbf')),
                  ('xgb', xgb.XGBClassifier(random_state=42, gamma=0.0, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100)),
                  ('knn', KNeighborsClassifier(leaf_size=1, n_neighbors=13)),
                  ('mlp', MLPClassifier(random_state=42, alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=150, solver='lbfgs')),
                  ('rf', RandomForestClassifier(random_state=42, bootstrap=True, max_depth=10, max_features='log2', min_samples_leaf=5, min_samples_split=12, n_estimators=100)),
                  ('gnb', GaussianNB())],
    voting='soft')


In [31]:
# Fit the soft-voting ensemble on the training split, then predict the
# validation split; fit() returns the fitted estimator, so the calls chain.
y_pred_vcs = voting_classifier_soft.fit(X_train, y_train).predict(X_val)


In [32]:
# Report on the training split and on the validation split; the latter is
# passed through the function's X_test / y_test parameters.
evaluate(
    model=voting_classifier_soft,
    X_train=X_train,
    X_test=X_val,
    y_train=y_train,
    y_test=y_val,
)

Training Results: 
Confusion Matrix:
[[1805  979]
 [ 820 3296]]
Accuracy Score:
0.7393
Classification Report:
                     0            1  accuracy    macro avg  weighted avg
precision     0.687619     0.770994  0.739275     0.729307      0.737354
recall        0.648348     0.800777  0.739275     0.724563      0.739275
f1-score      0.667406     0.785604  0.739275     0.726505      0.737914
support    2784.000000  4116.000000  0.739275  6900.000000   6900.000000
Testing Results: 
Confusion Matrix:
[[ 612  399]
 [ 334 1075]]
Accuracy Score:
0.6971
Classification Report:
                     0            1  accuracy    macro avg  weighted avg
precision     0.646934     0.729308  0.697107     0.688121      0.694895
recall        0.605341     0.762952  0.697107     0.684147      0.697107
f1-score      0.625447     0.745751  0.697107     0.685599      0.695492
support    1011.000000  1409.000000  0.697107  2420.000000   2420.000000
