In [120]:
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.feature_selection import SelectKBest, SelectFromModel, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.linear_model import LogisticRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.neural_network import MLPClassifier


import warnings


In [17]:
warnings.filterwarnings('ignore')


In [18]:
df = pd.read_csv('../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [19]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



Use Pipeline models combining scaling, normalization, feature selection and Classification methods

### SVM Pipeline Model

In [47]:
pipe_model = Pipeline([
    ('selector', SelectKBest(score_func=mutual_info_classif, k=5)),
    ('normalizer', MinMaxScaler()),
    # ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', SVC(C=1, gamma=1, kernel='rbf'))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 SelectKBest(k=5,
                             score_func=<function mutual_info_classif at 0x00000217F36CD1F0>)),
                ('normalizer', MinMaxScaler()), ('pca', PCA()),
                ('classifier', SVC(C=1, gamma=1))])

In [48]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.70      0.48      0.57      1011
    home win       0.69      0.86      0.77      1409

    accuracy                           0.70      2420
   macro avg       0.70      0.67      0.67      2420
weighted avg       0.70      0.70      0.68      2420

test score 0.6975206611570248


### MLP Pipeline Model

In [79]:
pipe_model = Pipeline([
    ('normalizer', MinMaxScaler()),
    ('selector', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs'))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('normalizer', MinMaxScaler()), ('selector', SelectKBest(k=5)),
                ('classifier',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5),
                               max_iter=250, solver='lbfgs'))])

In [80]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.68      0.54      0.60      1011
    home win       0.71      0.81      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.70      0.70      0.69      2420

test score 0.7004132231404959


### Random Forest Pipeline Model

In [232]:
pipe_model = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier())),
    ('scaler', StandardScaler()),
    # ('normalizer', MinMaxScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=4, min_samples_split=12, n_estimators=100))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 SelectFromModel(estimator=RandomForestClassifier())),
                ('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=8, max_features='sqrt',
                                        min_samples_leaf=4,
                                        min_samples_split=12))])

In [233]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.67      0.57      0.62      1011
    home win       0.72      0.80      0.76      1409

    accuracy                           0.71      2420
   macro avg       0.70      0.69      0.69      2420
weighted avg       0.70      0.71      0.70      2420

test score 0.7057851239669422


### XGBoost Pipeline Model

In [190]:
pipe_model = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LassoCV())),
    # ('normalizer', MinMaxScaler()),
    # ('pca', PCA()),
    ('classifier', xgb.XGBClassifier(gamma=0.1, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('selector', SelectFromModel(estimator=LassoCV())),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0.1, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=3, max_leaves=0, min_child_weight=3,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                           

In [191]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.66      0.57      0.61      1011
    home win       0.72      0.79      0.75      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.68      2420
weighted avg       0.69      0.70      0.69      2420

test score 0.697107438016529


In [182]:
pipe_model = Pipeline([
    ('selector', SelectFromModel(LogisticRegression(solver='lbfgs', C=0.1, penalty='l2'))),
    ('scaler', StandardScaler()),
    # ('normalizer', MinMaxScaler()),
    ('classifier', xgb.XGBClassifier(gamma=0.01, learning_rate=0.1, max_depth=3, min_child_weight=3, n_estimators=100))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 SelectFromModel(estimator=LogisticRegression(C=0.1))),
                ('scaler', StandardScaler()),
                ('classifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0.01, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.1, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=3, max_leaves=0, min_child_weight=3,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_t

In [183]:
# Predict
preds = pipe_model.predict(X_val)
test_score = pipe_model.score(X_val, y_val)

target_names=['home loss', 'home win']

print(classification_report(y_val, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.67      0.57      0.61      1011
    home win       0.72      0.80      0.76      1409

    accuracy                           0.70      2420
   macro avg       0.69      0.68      0.69      2420
weighted avg       0.70      0.70      0.70      2420

test score 0.7012396694214876
