In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [2]:
#read the slimmed-down NBA table (redundant features were already removed
#in NBA_predictive_model.ipynb) and preview its first five rows
df_NBA_slim = pd.read_csv(filepath_or_buffer='NBA_numeric_slim.csv')
df_NBA_slim.head(n=5)

Unnamed: 0,season,age,w,sos,o_rtg,d_rtg,pace,f_tr,x3p_ar,ts_percent,...,away_orb_percent,away_opp_e_fg_percent,away_opp_tov_percent,away_opp_drb_percent,away_opp_ft_fga,away_attend,away_attend_g,home_score,away_score,spread
0,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,27.2,0.481,14.2,70.1,0.274,656081.0,16002.0,120,117,3
1,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,29.5,0.497,14.7,70.9,0.269,591701.0,14432.0,90,93,-3
2,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.454,14.4,72.6,0.262,828384.0,20204.0,105,114,-9
3,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,28.4,0.482,15.1,71.7,0.247,723949.0,17657.0,110,107,3
4,2005,26.0,13.0,0.09,100.6,111.1,91.4,0.299,0.146,0.504,...,31.2,0.461,13.5,73.0,0.207,905116.0,22076.0,68,95,-27


In [3]:
#create training, validation and test sets (80% / 10% / 10%), each stratified
#by season so every season is represented proportionally in all three splits
train_set_r, test_val_set_r = train_test_split(df_NBA_slim, test_size=0.2, 
                                       stratify=df_NBA_slim['season'], random_state=42)

#split test_val_set into test and validation sets (half each)
test_set_r, val_set_r = train_test_split(test_val_set_r, test_size=0.5, 
                                       stratify=test_val_set_r['season'], random_state=42)

#sanity check: the three splits together must account for every row
assert len(train_set_r) + len(test_set_r) + len(val_set_r) == len(df_NBA_slim)

#separate into data (X) and targets (y).
#Columns are selected by NAME rather than by position: positional slices such as
#iloc[:, 1:-3] silently keep 'season' as a feature if the CSV carries a
#saved-index column ('Unnamed: 0') in front of it. 'season', 'home_score' and
#'away_score' are removed from the features; 'spread' is the target.
target_col = 'spread'
non_feature_cols = [col for col in ('Unnamed: 0', 'season', 'home_score', 'away_score', target_col)
                    if col in df_NBA_slim.columns]

X_train_r = train_set_r.drop(columns=non_feature_cols)
y_train_r = train_set_r[target_col]

X_test_r = test_set_r.drop(columns=non_feature_cols)
y_test_r = test_set_r[target_col]

X_val_r = val_set_r.drop(columns=non_feature_cols)
y_val_r = val_set_r[target_col]

In [4]:
#z-score normalize the data
#the scaler learns its per-feature statistics from the training split only;
#the same fitted scaler is then applied to all three splits so that no
#information from the test/validation rows enters the scaling
ss = StandardScaler()
ss.fit(X_train_r)

X_train_r_ss, X_test_r_ss, X_val_r_ss = (
    ss.transform(split) for split in (X_train_r, X_test_r, X_val_r)
)

In [6]:
%%time

#begin grid searches to find best hyperparameters

rfr = RandomForestRegressor()

param_rfr = [{'criterion':['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
             'max_depth': [5, 7, 10]}]

grid_search_rfr = GridSearchCV(rfr, param_rfr, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs=-1)
grid_search_rfr.fit(X_train_r_ss, y_train_r)
print(grid_search_rfr.best_params_)
print(grid_search_rfr.best_score_)

"""{'criterion': 'squared_error', 'max_depth': 7}
-12.286935979875325
CPU times: total: 5.69 s
Wall time: 17min 2s"""

#poisson not allowed with negative scoring

15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\johann\AppData\Local\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\johann\AppData\Local\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 359, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -12.3389092  -12.30404525 -12.28770527 -12.34574496          nan
          nan          nan]


{'criterion': 'squared_error', 'max_depth': 7}
-12.286935979875325
CPU times: total: 5.69 s
Wall time: 17min 2s


In [7]:
%%time

#another grid search to find best hyperparameters

rfr = RandomForestRegressor()

param_rfr = [{'criterion':['squared_error', 'absolute_error', 'friedman_mse'],
             'max_depth': [7], 'min_samples_leaf': [1,2,4], 'min_samples_split': [2,4,8]}]

grid_search_rfr = GridSearchCV(rfr, param_rfr, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs=-1)
grid_search_rfr.fit(X_train_r_ss, y_train_r)
print(grid_search_rfr.best_params_)
print(grid_search_rfr.best_score_)

"""{'criterion': 'squared_error', 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
-12.279770349117157
CPU times: total: 5.98 s
Wall time: 51min"""

{'criterion': 'squared_error', 'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
-12.279770349117157
CPU times: total: 5.98 s
Wall time: 51min


In [9]:
%%time

#another grid search to find best hyperparameters

rfr = RandomForestRegressor(max_depth =7, criterion='squared_error')

param_rfr = [{'min_samples_leaf': [2,4,8,16]}]

grid_search_rfr = GridSearchCV(rfr, param_rfr, cv=5, scoring = 'neg_root_mean_squared_error', n_jobs=-1)
grid_search_rfr.fit(X_train_r_ss, y_train_r)
print(grid_search_rfr.best_params_)
print(grid_search_rfr.best_score_)

"""{'min_samples_leaf': 8}
-12.280358330682516
CPU times: total: 4.86 s
Wall time: 29.4 """

{'min_samples_leaf': 8}
-12.280358330682516
CPU times: total: 4.86 s
Wall time: 29.4 s
