# Gradient Boosting

In [6]:
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Loads in data frmae as df
df = pd.read_csv('/Users/tomjones/Documents/determining shot project/modelling.csv')

In [3]:
y = df.pop('shot_outcome') # sets target variable as shot_outcome and removes from data frame
X = df # sets remaining data frame as predictor variables

In [8]:
# creates test train split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

In [9]:
# standardises the predictor variables using standard scaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [7]:
# sets up base model for grid search
gbc = GradientBoostingClassifier()

In [17]:
# sets up parameters for grid search cv
params = {"loss":["deviance"],
              "learning_rate": [0.5, 0.1, 0.2],
              "min_samples_split": np.linspace(0.1, 0.5, 5),
              "min_samples_leaf": np.linspace(0.1, 0.5, 5),
              "max_depth":[3,5,8],
              "max_features":["log2","sqrt"],
              "criterion": ["friedman_mse",  "mae"],
              "subsample":[ 0.4, 0.6, 0.8, 1.0],
              "n_estimators":[10, 100, 500]}

In [18]:
# add base model to grid search parameters 
gbc_gs = GridSearchCV(estimator = gbc, param_grid = params, cv = 5, verbose=3, n_jobs = -1)
# Fit the random search model
gbc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 10800 candidates, totalling 54000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed: 30.9min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed: 37.6min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed: 43.2min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'criterion': ['friedman_mse', 'mae'],
                         'learning_rate': [0.5, 0.1, 0.2], 'loss': ['deviance'],
                         'max_depth': [3, 5, 8],
                         'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': array([0.1, 0.2, 0.3, 0.4, 0.5]),
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5]),
                         'n_estimators': [10, 100, 500],
                         'subsample': [0.4, 0.6, 0.8, 1.0]},
             verbose=3)

In [19]:
print('Best Parameters:')
print(gbc_gs.best_params_)
print('Best estimator mean cross validated training score:')
print(gbc_gs.best_score_)
print('Best estimator score on the full training set:')
print(gbc_gs.score(X_train, y_train))
print('Best estimator score on the test set:')
print(gbc_gs .score(X_test, y_test))

Best Parameters:
{'criterion': 'friedman_mse', 'learning_rate': 0.5, 'loss': 'deviance', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 0.30000000000000004, 'n_estimators': 500, 'subsample': 1.0}
Best estimator mean cross validated training score:
0.9489828468674343
Best estimator score on the full training set:
0.9559119356280733
Best estimator score on the test set:
0.9465921787709497


In [28]:
# saves model 
import joblib
joblib.dump( model, 'gradient_boost')

['gradient_boost']