# Random Forest Classifier 

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# laods in data frame as df
df = pd.read_csv('/Users/tomjones/Documents/determining shot project/modelling.csv')

In [4]:
y = df.pop('shot_outcome') # sets target variable as y and removes it from df
X = df # sets remaining df as predictor variables x 

In [6]:
# creates test train split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                        stratify=y, test_size=0.3, random_state=1)

In [7]:
# standardises predictor vriables 
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [8]:
# sets up parameters for grid search
params = {'bootstrap': [True],
          'max_depth': [10, 20, None],
          'max_features': ['auto'],
          'criterion' : ['gini', 'entropy'],
          'class_weight' : ['balanced', None],
          'random_state' : [1],
          'min_samples_split': [2, 5, 10]}

In [9]:
# First create the base model to tune
rfc = RandomForestClassifier()
# Adds base model to hyper parameters for grid search cv
rfc_gs = GridSearchCV(estimator = rfc, param_grid = params, cv = 5, verbose=3, n_jobs = -1)
# Fit the random search model
rfc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  7.1min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, None], 'max_features': ['auto'],
                         'min_samples_split': [2, 5, 10], 'random_state': [1]},
             verbose=3)

In [10]:
print('Best Parameters:')
print(rfc_gs.best_params_)
print('Best estimator mean cross validated training score:')
print(rfc_gs.best_score_)
print('Best estimator score on the full training set:')
print(rfc_gs.score(X_train, y_train))
print('Best estimator score on the test set:')
print(rfc_gs.score(X_test, y_test))

Best Parameters:
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'min_samples_split': 5, 'random_state': 1}
Best estimator mean cross validated training score:
0.9339035998520133
Best estimator score on the full training set:
0.9839708793664985
Best estimator score on the test set:
0.9284862932061978


In [11]:
# parameters for tuned grid search 
params_2 = {'bootstrap': [True],
          'max_depth': [15, 20, 25, 30],
          'max_features': ['auto'],
          'criterion' : ['entropy'],
          'class_weight' : ['balanced'],
          'random_state' : [1],
          'min_samples_split': [4, 5, 6,7]}

In [12]:
# Adds base model to tuned parameters for grid search cv
rf_gs2 = GridSearchCV(estimator = rf, param_grid = params_2, cv = 5, verbose=3, n_jobs = -2)
# Fit the random search model
rf_gs2.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-2)]: Done  80 out of  80 | elapsed:  3.5min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-2,
             param_grid={'bootstrap': [True], 'class_weight': ['balanced'],
                         'criterion': ['entropy'],
                         'max_depth': [15, 20, 25, 30],
                         'max_features': ['auto'],
                         'min_samples_split': [4, 5, 6, 7],
                         'random_state': [1]},
             verbose=3)

In [13]:
print('Best Parameters:')
print(rf_gs2.best_params_)
print('Best estimator mean cross validated training score:')
print(rf_gs2.best_score_)
print('Best estimator score on the full training set:')
print(rf_gs2.score(X_train, y_train))
print('Best estimator score on the test set:')
print(rf_gs2.score(X_test, y_test))

Best Parameters:
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'min_samples_split': 6, 'random_state': 1}
Best estimator mean cross validated training score:
0.9349254947741714
Best estimator score on the full training set:
0.9818634650999425
Best estimator score on the test set:
0.9331048867699643


In [26]:
# creates data frame of the feature importances 
df_feature = pd.DataFrame(rf_gs2.best_estimator_.feature_importances_)

In [27]:
# creates data frame of the 10 most improtant feature variables with their name instead of index number 
df_feature_importances = pd.DataFrame(zip(X_train.columns,rf_gs2.best_estimator_.feature_importances_)).sort_values(by = 1).tail(10)

In [28]:
# renames columns in data frame
df_feature_importances.columns = ['Feature','Importance']

In [29]:
df_feature_importances

Unnamed: 0,Feature,Importance
3402,area_of_goal_TL,0.012303
3404,area_of_goal_TR,0.012411
3400,area_of_goal_BR,0.013371
151,defenders_infront_of_goal,0.016086
149,angle_of_shot,0.026322
150,distance_from_goal,0.029581
6,duration,0.040603
3401,area_of_goal_OFF_TARGET,0.071596
53,pass_shot_assist_2nd,0.088226
99,pass_shot_assist_3rd,0.094714


In [25]:
import joblib
joblib.dump( rf_gs2, 'random_forest')

['random_forest']