In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
import time

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.feature_selection import SelectKBest, SelectFromModel, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.neural_network import MLPClassifier


import warnings


In [3]:
warnings.filterwarnings('ignore')


In [4]:
df = pd.read_csv('../../FeatureEngineering/MetaData/data6_&_odds.csv')
df.dropna(inplace=True)

In [5]:
train_data = df.loc[(df.season <= 2013) & (df.season >= 2007)]
valid_data = df.loc[(df.season > 2013) & (df.season < 2016)]
test_data = df.loc[df.season >= 2016]
full_train_data = pd.concat([train_data, valid_data], axis=0)

X, y = train_data.drop(columns=['home_team_wins']), train_data.home_team_wins
valid_X, valid_y = valid_data.drop(columns=['home_team_wins']), valid_data.home_team_wins
test_X, test_y = test_data.drop(columns=['home_team_wins']), test_data.home_team_wins

# Split our data
X_train, y_train = train_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), train_data.home_team_wins
X_val, y_val = valid_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), valid_data.home_team_wins
X_test, y_test = test_data.drop(columns=["game_date_est","season","game_id","home_team","visitor_team","home_team_id","visitor_team_id","home_team_wins","conference","conference_visitor"]), test_data.home_team_wins



In [6]:
pipe_model = Pipeline([
    ('selector', SelectFromModel(RandomForestClassifier())),
    ('scaler', StandardScaler()),
    # ('normalizer', MinMaxScaler()),
    ('classifier', RandomForestClassifier(bootstrap=True, max_depth=8, max_features='sqrt', min_samples_leaf=4, min_samples_split=12, n_estimators=100))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('selector',
                 SelectFromModel(estimator=RandomForestClassifier())),
                ('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=8, max_features='sqrt',
                                        min_samples_leaf=4,
                                        min_samples_split=12))])

In [7]:
# Predict
preds = pipe_model.predict(X_test)
test_score = pipe_model.score(X_test, y_test)

target_names=['home loss', 'home win']

print(classification_report(y_test, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.63      0.55      0.59      1935
    home win       0.70      0.77      0.73      2648

    accuracy                           0.67      4583
   macro avg       0.67      0.66      0.66      4583
weighted avg       0.67      0.67      0.67      4583

test score 0.6746672485271656


In [8]:
pipe_model = Pipeline([
    ('normalizer', MinMaxScaler()),
    ('selector', SelectKBest(score_func=f_classif, k=5)),
    ('classifier', MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5), max_iter=250, solver='lbfgs'))
])

pipe_model.fit(X_train, y_train)

Pipeline(steps=[('normalizer', MinMaxScaler()), ('selector', SelectKBest(k=5)),
                ('classifier',
                 MLPClassifier(alpha=1e-05, hidden_layer_sizes=(20, 10, 5),
                               max_iter=250, solver='lbfgs'))])

In [9]:
# Predict
preds = pipe_model.predict(X_test)
test_score = pipe_model.score(X_test, y_test)

target_names=['home loss', 'home win']

print(classification_report(y_test, preds, target_names=target_names))

print("test score", test_score)


              precision    recall  f1-score   support

   home loss       0.62      0.56      0.59      1935
    home win       0.70      0.75      0.72      2648

    accuracy                           0.67      4583
   macro avg       0.66      0.65      0.66      4583
weighted avg       0.67      0.67      0.67      4583

test score 0.6685577132882391
