In [3]:
import numpy as np
import pandas as pd

# Pre-processing and setup functions
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Algorithm
from sklearn.ensemble import GradientBoostingClassifier

# Report and model validation
from sklearn.metrics import classification_report
# Model persistence
from joblib import dump, load

# Raise pandas' display limits so frames with up to 500 rows / 500 columns
# are rendered in full rather than elided.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Load the dataset from 'main.csv'; the path is relative, so the file is
# resolved against the kernel's current working directory.
main = pd.read_csv('main.csv')

In [3]:
# Discard the columns that are not carried forward into the feature matrix.
#
# The axis is named explicitly with `columns=`: passing it positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onwards.
#
# Note: this cell rebinds `main`, so running it a second time without
# reloading the CSV raises a KeyError (the columns are already gone).
main = main.drop(
    columns=[
        'Unnamed: 0',
        'country',
        'date_of_birth',
        'ep_id',
        'full_name',
        'hometown',
        'youth_team',
        'team_14',
        'team_15',
        'team_16',
        'team_17',
        'league_14',
        'league_15',
        'league_16',
        'league_17',
        'league_25',
    ]
)

In [4]:
# Summarise the remaining columns: dtype and non-null count per column.
main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4094 entries, 0 to 4093
Data columns (total 26 columns):
height             4093 non-null float64
position           4094 non-null object
shoots             4094 non-null object
weight             4093 non-null float64
games_played_14    4094 non-null float64
goals_14           4094 non-null float64
assists_14         4094 non-null float64
penalty_min_14     4094 non-null float64
plus_minus_14      4094 non-null float64
games_played_15    4094 non-null float64
goals_15           4094 non-null float64
assists_15         4094 non-null float64
penalty_min_15     4094 non-null float64
plus_minus_15      4094 non-null float64
games_played_16    4094 non-null float64
goals_16           4094 non-null float64
assists_16         4094 non-null float64
penalty_min_16     4094 non-null float64
plus_minus_16      4094 non-null float64
games_played_17    4094 non-null float64
goals_17           4094 non-null float64
assists_17         4094 non-null f

In [5]:
# Preview 10 randomly chosen rows. No random_state is passed, so a different
# set of rows is shown on every run.
main.sample(10)

Unnamed: 0,height,position,shoots,weight,games_played_14,goals_14,assists_14,penalty_min_14,plus_minus_14,games_played_15,goals_15,assists_15,penalty_min_15,plus_minus_15,games_played_16,goals_16,assists_16,penalty_min_16,plus_minus_16,games_played_17,goals_17,assists_17,penalty_min_17,plus_minus_17,response,birth_region
1985,188.0,D,R,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,3.0,8.0,0.0,36.0,4.0,11.0,16.0,0.0,Not Active,Canada
2765,178.0,Mixed,R,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0,49.0,17.0,61.0,0.0,50.0,28.0,30.0,63.0,0.0,ECHL,Canada
568,193.0,D,L,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.0,12.0,20.0,116.0,0.0,136.0,4.0,18.0,108.0,-5.0,Lower,Canada
1694,185.0,Winger,L,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,13.0,15.0,20.0,0.0,57.0,27.0,21.0,18.0,8.0,NHL,USA
2734,185.0,Forward,L,93.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,5.0,4.0,14.0,0.0,35.0,20.0,28.0,60.0,0.0,Not Active,Canada
2869,191.0,Forward,L,92.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64.0,15.0,15.0,6.0,0.0,84.0,54.0,55.0,128.0,0.0,ECHL,USA
1901,178.0,RW,R,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,10.0,14.0,4.0,0.0,34.0,10.0,10.0,29.0,0.0,Not Active,Canada
991,191.0,RW,R,2.0,0.0,0.0,0.0,0.0,0.0,30.0,20.0,28.0,114.0,0.0,88.0,38.0,52.0,12.0,0.0,90.0,22.0,12.0,54.0,0.0,AHL,Canada
3395,180.0,LW,L,77.0,21.0,9.0,13.0,12.0,0.0,20.0,42.0,33.0,72.0,0.0,38.0,9.0,5.0,44.0,0.0,40.0,11.0,15.0,42.0,0.0,Collegiate,Canada
3874,186.0,LW,L,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,40.0,16.0,24.0,78.0,0.0,Lower,Canada


In [8]:
# Separate the target column from the predictors.
y = main['response']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
features = main.drop(columns=['response'])

In [14]:
# One-hot encode the predictors with pandas' default column selection
# (object/categorical columns are expanded; other columns are left as-is).
# Note: `features` is rebound to the encoded frame.
features = pd.get_dummies(features)

In [16]:
# Rescale every encoded column to the [0, 1] range (MinMaxScaler default).
# NOTE(review): the scaler is fitted on all rows, and the train/test split
# happens afterwards, so test-set minima/maxima inform the scaling -- confirm
# this is acceptable for the evaluation.
scaler = MinMaxScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

  return self.partial_fit(X, y)


In [25]:
# Preview the scaled matrix. The scaler returns a bare array, so the column
# labels are restored from `features`; only the first rows are shown rather
# than the whole frame.
pd.DataFrame(scaled_features, columns=features.columns).head()

In [17]:
# Hold out a test partition (default split size). The split is stratified on
# the target so class proportions are kept, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features,
    y,
    stratify=y,
    random_state=42
)

In [18]:
# Base estimator for the search. The seed is fixed (same value as the
# train/test split) because gradient boosting permutes the features at every
# split: without a random_state, tied splits can resolve differently from run
# to run, so the fitted models and CV scores are not reproducible.
gbm_search = GradientBoostingClassifier(random_state=42)

# Hyper-parameter grid: 3 learning rates x 5 ensemble sizes x 5 tree depths
# = 75 candidate configurations.
gbm_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [25, 50, 100, 200, 400],
    'max_depth': [1, 2, 3, 4, 5]
}

# Exhaustive 5-fold cross-validated search scored on accuracy.
# n_jobs=-1 uses all available cores; verbose=3 prints per-fit progress.
grid = GridSearchCV(
    estimator=gbm_search,
    param_grid=gbm_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
    n_jobs=-1
)

In [19]:
# Run the cross-validated search on the training partition only; the test
# partition is not touched here. This is the long-running cell of the
# notebook (the recorded run took about 6 minutes on 8 workers).
grid.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:  5.9min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 100, 200, 400], 'max_depth': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [20]:
# Hyper-parameter combination selected by the search.
# NOTE(review): in the recorded run the chosen max_depth and n_estimators are
# the smallest values in the grid, so the optimum may lie outside the searched
# range -- consider extending the grid.
print(grid.best_params_)

{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 25}


In [21]:
# Cross-validated score (accuracy, per the `scoring` setting) of the selected
# combination. This is measured on the training partition's folds, not on the
# held-out test set.
print(grid.best_score_)

0.655700325732899


In [4]:
# NOTE(review): the held-out evaluation below is commented out, so the
# notebook never scores the model on X_test / y_test. The reason is not
# recorded here -- TODO confirm and re-enable, or remove this cell.
# predictions = grid.predict(X_test)
# print(classification_report(y_test, predictions))