In [1]:
import numpy as np
import pandas as pd

# Pre-processing and setup functions
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Algorithm
from sklearn.ensemble import GradientBoostingClassifier

# Report and model validation
from sklearn.metrics import classification_report
# Model persistence
from joblib import dump, load

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
main = pd.read_csv('main.csv')

In [3]:
main = main.drop(
    ['Unnamed: 0', 
     'country', 
     'date_of_birth', 
     'ep_id', 
     'full_name', 
     'hometown',
     'youth_team',
     'team_14',
     'team_15',
     'team_16',
     'team_17',
     'league_14',
     'league_15',
     'league_16',
     'league_17',
     'league_25'
    ], 1)

In [4]:
main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4094 entries, 0 to 4093
Data columns (total 26 columns):
height             4093 non-null float64
position           4094 non-null object
shoots             4094 non-null object
weight             4093 non-null float64
games_played_14    4094 non-null float64
goals_14           4094 non-null float64
assists_14         4094 non-null float64
penalty_min_14     4094 non-null float64
plus_minus_14      4094 non-null float64
games_played_15    4094 non-null float64
goals_15           4094 non-null float64
assists_15         4094 non-null float64
penalty_min_15     4094 non-null float64
plus_minus_15      4094 non-null float64
games_played_16    4094 non-null float64
goals_16           4094 non-null float64
assists_16         4094 non-null float64
penalty_min_16     4094 non-null float64
plus_minus_16      4094 non-null float64
games_played_17    4094 non-null float64
goals_17           4094 non-null float64
assists_17         4094 non-null f

In [5]:
main.sample(10)

Unnamed: 0,height,position,shoots,weight,games_played_14,goals_14,assists_14,penalty_min_14,plus_minus_14,games_played_15,goals_15,assists_15,penalty_min_15,plus_minus_15,games_played_16,goals_16,assists_16,penalty_min_16,plus_minus_16,games_played_17,goals_17,assists_17,penalty_min_17,plus_minus_17,response,birth_region
2941,184.0,D,R,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,3.0,16.0,24.0,0.0,40.0,5.0,20.0,8.0,-0.0,Not Active,Canada
4051,175.0,Mixed,L,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,ECHL,USA
3358,183.0,RW,R,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Not Active,Western Europe
3179,180.0,D,L,89.0,0.0,0.0,0.0,0.0,0.0,19.0,2.0,7.0,12.0,0.0,53.0,1.0,14.0,24.0,0.0,67.0,3.0,25.0,45.0,35.0,Not Active,Canada
3961,170.0,RW,R,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,26.0,41.0,21.0,0.0,59.0,26.0,37.0,30.0,0.0,Not Active,Canada
1028,175.0,C,L,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.0,18.0,22.0,4.0,0.0,80.0,50.0,64.0,72.0,0.0,Not Active,Canada
354,181.0,D,L,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,108.0,2.0,12.0,44.0,-1.0,Not Active,Canada
3560,191.0,LW,L,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,9.0,21.0,54.0,0.0,Not Active,Canada
1252,188.0,RW,R,91.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69.0,52.0,29.0,74.0,0.0,60.0,7.0,8.0,53.0,-1.0,Not Active,Canada
2330,191.0,D,L,95.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,36.0,6.0,6.0,44.0,0.0,65.0,3.0,12.0,43.0,0.0,NHL,Canada


In [6]:
y = main['response']
features = main.drop(['response'], 1)

In [7]:
features = pd.get_dummies(features)

In [8]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

  return self.partial_fit(X, y)


In [9]:
pd.DataFrame(scaled_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
0,0.666667,0.030303,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.000000,0.000000,0.000000,0.000000,0.384615,0.000000,0.000,0.000000,0.000000,0.341463,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.476190,0.868687,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.000000,0.000000,0.000000,0.000000,0.384615,0.013514,0.000,0.000000,0.000000,0.341463,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.595238,0.767677,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.340206,0.088983,0.199219,0.197628,0.384615,0.594595,0.135,0.338028,0.197674,0.341463,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.404762,0.909091,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.340206,0.025424,0.058594,0.071146,0.384615,0.189189,0.000,0.021127,0.139535,0.325203,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.595238,0.898990,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.309278,0.139831,0.187500,0.150198,0.384615,0.743243,0.240,0.549296,1.000000,0.341463,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.595238,0.919192,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.381443,0.050847,0.117188,0.260870,0.384615,0.554054,0.075,0.528169,0.534884,0.341463,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,0.404762,0.888889,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.000000,0.000000,0.000000,0.000000,0.384615,0.000000,0.000,0.000000,0.000000,0.341463,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.642857,0.939394,0.000000,0.000000,0.000000,0.000000,0.171053,0.723214,0.088889,0.069444,0.0750,0.149606,0.391753,0.114407,0.199219,0.229249,0.384615,0.729730,0.090,0.401408,0.273256,0.463415,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.476190,0.929293,0.000000,0.000000,0.000000,0.000000,0.171053,0.000000,0.000000,0.000000,0.0000,0.149606,0.000000,0.000000,0.000000,0.000000,0.384615,0.000000,0.000,0.000000,0.000000,0.341463,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,0.404762,0.868687,0.000000,0.000000,0.000000,0.000000,0.171053,0.053571,0.000000,0.000000,0.0150,0.149606,0.319588,0.203390,0.269531,0.086957,0.384615,0.554054,0.015,0.063380,0.034884,0.268293,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y, random_state=42, stratify=y)

In [11]:
gbm_search = GradientBoostingClassifier()

gbm_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [25, 50, 100, 200, 400],
    'max_depth': [1, 2, 3, 4, 5]
}

grid = GridSearchCV(
    gbm_search,
    gbm_grid,
    scoring='accuracy',
    cv=5,
    verbose=3,
    n_jobs=-1
)

In [12]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:  5.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [25, 50, 100, 200, 400], 'max_depth': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [13]:
print(grid.best_params_)

{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 25}


In [14]:
print(grid.best_score_)

0.655700325732899


In [15]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').