# KNN

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

## Read dataset

In [2]:
# Read the pre-processed training and test splits from the working directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_training.csv', 'processed_test.csv')
)
# Preview the first rows of the training frame as the cell output.
data_train.head()

Unnamed: 0,attendance,is_federal_holiday,venue,on_grass,temperature,wind_speed,team1_name,team1_pre_win,team1_pre_loss,team1_pre_win_pct,...,weather_Rain,weather_Sunny,season_type,season,home_team_avg_att_last_year,start_hour,start_hour_label_afternoon,start_hour_label_evening,start_hour_label_night,start_hour_label_noon
0,35055,0,Wrigley Field,1,-2.786672,-0.09309,STL,-1.62645,-1.654514,0.0,...,0,1,regular,2015,0.345307,19,0,1,0,0
1,49043,0,Chase Field,1,0.705205,-0.09309,SFG,-1.62645,-1.654514,0.0,...,0,1,regular,2015,-0.573314,19,0,1,0,0
2,43633,0,Great American Ball Park,1,-0.52167,1.490839,PIT,-1.62645,-1.654514,0.0,...,0,0,regular,2015,0.066649,16,1,0,0,0
3,45030,0,Comerica Park,1,-2.031671,-0.489072,MIN,-1.62645,-1.654514,0.0,...,0,0,regular,2015,0.766348,13,0,0,0,1
4,43753,0,Minute Maid Park,1,0.516455,1.292848,CLE,-1.62645,-1.654514,0.0,...,0,0,regular,2015,-1.084575,18,0,1,0,0


In [3]:
# Discard the `venue` and `start_hour` columns from both splits before encoding.
unused_columns = ['venue', 'start_hour']
data_train = data_train.drop(columns=unused_columns)
data_test = data_test.drop(columns=unused_columns)

In [4]:
# One-hot encode `team1_name` in each split (each split is encoded on its own).
data_train = data_train.pipe(pd.get_dummies, columns=['team1_name'])
data_test = data_test.pipe(pd.get_dummies, columns=['team1_name'])

In [5]:
# One-hot encode `team2_name` in each split (each split is encoded on its own).
data_train = data_train.pipe(pd.get_dummies, columns=['team2_name'])
data_test = data_test.pipe(pd.get_dummies, columns=['team2_name'])

In [6]:
# One-hot encode `season_type` in each split (each split is encoded on its own).
data_train = data_train.pipe(pd.get_dummies, columns=['season_type'])
data_test = data_test.pipe(pd.get_dummies, columns=['season_type'])

In [7]:
# Give the test frame exactly the training frame's columns, in the same order.
# The two splits were one-hot encoded independently, so a category that occurs only
# in the training split has no column in the test frame; plain
# `data_test[train_cols]` raises KeyError in that case. `reindex` instead creates
# such columns filled with 0 (the "category absent" value of a dummy column) and
# drops test-only columns that the model is never trained on.
train_cols = data_train.columns.tolist()
data_test = data_test.reindex(columns=train_cols, fill_value=0)

In [8]:
# Sanity check: columns present in one split but not the other (both should be empty).
train_column_set = set(data_train.columns)
test_column_set = set(data_test.columns)
print(train_column_set.difference(test_column_set))
print(test_column_set.difference(train_column_set))

set()
set()


In [9]:
from sklearn.utils import shuffle

# Shuffle the training rows with a fixed seed. Without `random_state` the row order
# differs on every run, which changes the cv=5 folds built from it later and can
# change the selected hyperparameters, so results would not be reproducible.
RANDOM_STATE = 42
data_train = shuffle(data_train, random_state=RANDOM_STATE)

In [10]:
# `pop` removes the target column from each frame in place and returns it,
# so only feature columns remain in `data_train` / `data_test` afterwards.
y_train = data_train.pop('attendance').to_numpy()
y_test = data_test.pop('attendance').to_numpy()

# Feature matrices as plain NumPy arrays.
x_train, x_test = data_train.to_numpy(), data_test.to_numpy()

print(x_train.shape)

(12270, 273)


## Hyperparameter grid search (5-fold cross-validation)

In [11]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Metrics recorded for every candidate. The scorer names are negated errors,
# so larger (closer to zero) is better.
scoring = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error',
]

# Hyperparameter grid: 7 * 2 * 2 * 4 = 112 candidates, each evaluated on 5 folds.
# NOTE(review): `algorithm` presumably only selects the neighbour-search structure
# and not the predictions -- confirm, and consider dropping it to cut the search time.
parameters = {
    'n_neighbors': [3, 5, 10, 20, 30, 50, 100],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

model = KNeighborsRegressor(n_jobs=-1)
# The candidate with the best RMSE score is the one refit on the full training set.
clf = GridSearchCV(
    model,
    parameters,
    scoring=scoring,
    refit='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
clf.fit(x_train, y_train)

In [17]:
# Report the winning estimator and its hyperparameters from the grid search.
for label, value in (
    ("Best estimators:", clf.best_estimator_),
    ("Best parameters:", clf.best_params_),
):
    print(label, value)

Best estimators: KNeighborsRegressor(n_jobs=-1, p=1, weights='distance')
Best parameters: {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}


In [18]:
# Tabulate the per-candidate cross-validation results (one row per parameter combination).
df_res = pd.DataFrame(clf.cv_results_)
df_res.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_n_neighbors,param_p,param_weights,params,split0_test_neg_mean_squared_error,...,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_test_neg_mean_absolute_percentage_error,split1_test_neg_mean_absolute_percentage_error,split2_test_neg_mean_absolute_percentage_error,split3_test_neg_mean_absolute_percentage_error,split4_test_neg_mean_absolute_percentage_error,mean_test_neg_mean_absolute_percentage_error,std_test_neg_mean_absolute_percentage_error,rank_test_neg_mean_absolute_percentage_error
0,0.036198,0.002786,8.969412,0.511707,auto,3,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 1...",-31983380.0,...,82.828566,9,-0.163238,-0.163793,-0.156657,-0.157887,-0.169582,-0.162232,0.004635,9
1,0.041615,0.010172,9.21455,0.899181,auto,3,1,distance,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 1...",-30277570.0,...,83.594107,1,-0.157106,-0.156979,-0.15045,-0.152231,-0.162908,-0.155935,0.004356,1
2,0.038991,0.002593,0.737424,0.098235,auto,3,2,uniform,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 2...",-33878580.0,...,91.303532,21,-0.171562,-0.169354,-0.162304,-0.163784,-0.177062,-0.168813,0.005358,17
3,0.047268,0.005237,0.848741,0.073533,auto,3,2,distance,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 2...",-32758330.0,...,87.971539,15,-0.167665,-0.165623,-0.159062,-0.160823,-0.173394,-0.165314,0.005102,15
4,0.048969,0.012149,9.654542,0.862578,auto,5,1,uniform,"{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1...",-31802180.0,...,71.549185,29,-0.169857,-0.171104,-0.165079,-0.164769,-0.178426,-0.169847,0.004974,25


## Train with best params

In [20]:
# Build the final model from the grid-search winner instead of hand-copied values,
# so this cell cannot drift out of sync with `clf` when the data, the shuffle or
# the parameter grid changes.
knn = KNeighborsRegressor(n_jobs=-1, **clf.best_params_)
knn = knn.fit(x_train, y_train)

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Error metrics on the training set itself.
# NOTE: with weights='distance' every training point is its own zero-distance
# neighbour, so these values are expected to be ~0 and say nothing about
# generalisation -- rely on the cross-validation and test-set numbers instead.
train_predict = knn.predict(x_train)
train_mse = mean_squared_error(y_train, train_predict)
print("Train MSE:", train_mse)
# RMSE is computed as sqrt(MSE): the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print("Train RMSE:", np.sqrt(train_mse))
print("Train MAE:", mean_absolute_error(y_train, train_predict))
print("Train MAPE:", mean_absolute_percentage_error(y_train, train_predict))

Train MSE: 0.0
Train RMSE: 0.0
Train MAE: 0.0
Train MAPE: 0.0


## Testing set

In [22]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Error metrics on the held-out test set.
y_pred = knn.predict(x_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)
# RMSE is computed as sqrt(MSE): the `squared=False` flag is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print("Test RMSE:", np.sqrt(test_mse))
print("Test MAE:", mean_absolute_error(y_test, y_pred))
print("Test MAPE:", mean_absolute_percentage_error(y_test, y_pred))

Test MSE: 60502758.128228575
Test RMSE: 7778.351890228969
Test MAE: 6060.722916084423
Test MAPE: 0.2829741209038142


## Save model

In [23]:
import os
import pickle
from datetime import datetime

# One timestamp shared by both artifacts so the model and its CV table can be paired.
time = datetime.today().strftime('%Y%m%d_%H%M%S')

# Create the output directory up front; neither open() nor to_csv() creates
# missing folders, so a fresh checkout would otherwise fail with FileNotFoundError.
os.makedirs('./model', exist_ok=True)

filename = f'./model/knn_model_{time}.sav'
# The context manager flushes and closes the file even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(knn, model_file)

cv_filename = f'./model/knn_model_{time}_cv.csv'
df_res.to_csv(cv_filename, index=False)

# To reload later (only unpickle files you trust -- pickle can execute arbitrary code):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)