In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import joblib

differences 

In [2]:
model_df = pd.read_csv('./data_frames/game_stats_differences_2020.csv')

In [3]:
model_df.head()

Unnamed: 0,goal_diff,won,game_ids,goals,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,ice_tilt
0,3,yes,2020020001,3,-7,2,0,33.4,1.0,0.0,16.0,-2,0,8,-1058.0
1,4,yes,2020020002,4,10,5,-2,16.7,1.0,1.0,1.8,0,0,2,89.0
2,1,yes,2020020003,1,2,-5,-2,-16.7,0.0,1.0,8.0,-1,-4,-18,1328.0
3,-2,no,2020020004,-2,-4,3,-4,0.0,0.0,2.0,3.6,-1,4,-1,695.0
4,-3,no,2020020005,-3,-5,-8,-4,25.0,1.0,2.0,8.8,3,5,1,405.0


In [4]:
X = model_df.drop(columns=['won', 'goal_diff','goals','game_ids', ])
y = model_df['won']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [5]:
sc = StandardScaler()

Xs_train = sc.fit_transform(X_train)
Xs_test = sc.transform(X_test)

In [6]:
logreg = LogisticRegression(random_state=42, max_iter=1000)

In [7]:
log_params = {
    'C': np.linspace(0.1,4,100)
             }

In [8]:
log_reg = RandomizedSearchCV(logreg,log_params,n_iter=100,n_jobs=-1,verbose=1)

In [9]:
log_reg.fit(Xs_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(estimator=LogisticRegression(max_iter=1000, random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'C': array([0.1       , 0.13939394, 0.17878788, 0.21818182, 0.25757576,
       0.2969697 , 0.33636364, 0.37575758, 0.41515152, 0.45454545,
       0.49393939, 0.53333333, 0.57272727, 0.61212121, 0.65151515,
       0.69090909, 0.73030303, 0.76969697, 0.80909091, 0.84848485,
       0.8878...
       2.66060606, 2.7       , 2.73939394, 2.77878788, 2.81818182,
       2.85757576, 2.8969697 , 2.93636364, 2.97575758, 3.01515152,
       3.05454545, 3.09393939, 3.13333333, 3.17272727, 3.21212121,
       3.25151515, 3.29090909, 3.33030303, 3.36969697, 3.40909091,
       3.44848485, 3.48787879, 3.52727273, 3.56666667, 3.60606061,
       3.64545455, 3.68484848, 3.72424242, 3.76363636, 3.8030303 ,
       3.84242424, 3.88181818, 3.92121212, 3.96060606, 4.        ])},
                   verbose=1)

In [10]:
log_reg.best_params_

{'C': 0.17878787878787877}

In [11]:
log_reg.score(Xs_test,y_test), log_reg.score(Xs_train,y_train)

(0.7419354838709677, 0.6912442396313364)

In [12]:
joblib.dump(log_reg,'./models/log_reg.pkl')

['./models/log_reg.pkl']

random forest

In [13]:
rfc = RandomForestClassifier()

In [14]:
rfc_params = {
    'n_estimators': list(range(2,51,1)),
    'max_depth': list(range(3,51,1)),
    'min_samples_split': list(range(2,51,1)),
    'min_samples_leaf': list(range(2,51,1)),
    'max_features': ['auto','sqrt','log2']
}

In [15]:
rfc_search = RandomizedSearchCV(rfc,rfc_params,n_iter=1000,n_jobs=-2,verbose=1,random_state=42)

In [16]:
rfc_search.fit(X_train,y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


RandomizedSearchCV(estimator=RandomForestClassifier(), n_iter=1000, n_jobs=-2,
                   param_distributions={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10,
                                                      11, 12, 13, 14, 15, 16,
                                                      17, 18, 19, 20, 21, 22,
                                                      23, 24, 25, 26, 27, 28,
                                                      29, 30, 31, 32, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10, 11, 12,
                                                             13, 14, 15, 16, 17,
                                                             18, 19, 20, 21, 22,
                                                             23, 24, 25, 

In [17]:
rfc_search.score(X_train,y_train), rfc_search.score(X_test,y_test)

(0.7035330261136713, 0.7142857142857143)

In [18]:
rfc_search.best_estimator_.feature_importances_

array([0.04444509, 0.12279573, 0.01589059, 0.31355386, 0.23780591,
       0.01008572, 0.0581746 , 0.0486683 , 0.04723833, 0.0695592 ,
       0.03178267])

In [19]:
rfc_search.best_params_

{'n_estimators': 35,
 'min_samples_split': 12,
 'min_samples_leaf': 17,
 'max_features': 'log2',
 'max_depth': 3}

In [20]:
joblib.dump(rfc_search,'./models/random_forest_classifier.pkl')

['./models/random_forest_classifier.pkl']

just box scores

In [21]:
box_scores = pd.read_csv('./data_frames/2020_end_game_boxscores.csv')

In [22]:
box_scores.tail(380)

Unnamed: 0,team,home_status,goals,shots,blocked_shots,penalty_minutes,power_play_percentage,power_play_goals,power_play_chances,faceoff_percent,takeaways,giveaways,hits,goal_diff,won
1356,Florida Panthers,away,2,38,11,4,0.0,0.0,3.0,66.0,3,0,26,-1,no
1357,Tampa Bay Lightning,home,3,19,7,6,50.0,1.0,2.0,34.0,7,1,35,1,yes
1358,Philadelphia Flyers,away,1,27,13,6,0.0,0.0,3.0,51.5,5,6,25,0,no
1359,Pittsburgh Penguins,home,1,32,6,6,0.0,0.0,3.0,48.5,9,9,16,0,no
1360,Chicago Blackhawks,away,1,25,6,2,0.0,0.0,3.0,38.1,5,4,9,-3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,Vegas Golden Knights,home,4,29,17,10,25.0,1.0,4.0,42.9,9,2,25,3,yes
1732,Colorado Avalanche,away,3,30,13,4,33.3,1.0,3.0,42.2,5,9,30,1,yes
1733,Los Angeles Kings,home,2,18,12,6,0.0,0.0,2.0,57.8,1,7,23,-1,no
1734,Arizona Coyotes,away,5,32,8,6,50.0,1.0,2.0,51.6,6,9,13,1,yes


In [23]:
X = box_scores.drop(columns=['won', 'team', 'home_status', 'goal_diff','goals'])
y = box_scores['won']

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [24]:
# Step 2: Instantiate our model.
logreg = LogisticRegression(max_iter=5000)

# Step 3: Fit our model.
logreg.fit(X_train,y_train)

LogisticRegression(max_iter=5000)

In [25]:
logreg.score(X_test,y_test), logreg.score(X_train,y_train)

(0.6336405529953917, 0.6482334869431644)

In [26]:
joblib.dump(logreg, './models/box_score_log_reg_2020')

['./models/box_score_log_reg_2020']