# KNN

In [3]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [4]:
# laods in data frame as df
df = pd.read_csv('/Users/tomjones/Documents/determining shot project/modelling.csv')

In [5]:
y = df.pop('shot_outcome') # sets target variable as y and removes it from df
X = df # sets remaining df as predictor variables x 

In [6]:
# creates test train split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                        stratify=y, test_size=0.3, random_state=1)

In [7]:
# standardises predictor vriables 
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [24]:
# sets up parameters for grid search
params = {'n_neighbors': [3, 5, 7, 9, 11],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'leaf_size' : [10,20, 30 ,40, 50],
          'p' : [1,2]}

In [25]:
# First create the base model to tune
knn = KNeighborsClassifier()
# Random search of parameters
knn_gs = GridSearchCV(estimator = knn, param_grid = params, cv = 5, verbose=3, n_jobs = -1)
# Fit the random search model
knn_gs.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 102.9min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 219.7min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 397.1min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 624.4min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 705.2min finished


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'leaf_size': [10, 20, 30, 40, 50],
                         'n_neighbors': [3, 5, 7, 9, 11], 'p': [1, 2]},
             verbose=3)

In [10]:
print('Best Parameters:')
print(knn_gs.best_params_)
print('Best estimator mean cross validated training score:')
print(knn_gs.best_score_)
print('Best estimator score on the full training set:')
print(knn_gs.score(X_train, y_train))
print('Best estimator score on the test set:')
print(knn_gs.score(X_test, y_test))

Best Parameters:
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'min_samples_split': 5, 'random_state': 1}
Best estimator mean cross validated training score:
0.9339035998520133
Best estimator score on the full training set:
0.9839708793664985
Best estimator score on the test set:
0.9284862932061978


In [29]:
import joblib
joblib.dump( knn_gs, 'knn')

['knn']