In [56]:
# Give yourself access to common
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Setup autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
# Gather training and testing data
from common import *
from sklearn.model_selection import train_test_split

nba = get_cleaned_external_data()
# nba = get_cleaned_baseline_data()
nba = add_log_y_values(nba)
X_train, X_test, y_train, y_test = split_data(nba)
print(X_train.shape)

(5754, 31)


In [96]:
# Fit the model
from sklearn.neural_network import MLPRegressor
import numpy as np

# MLPClassifier only classifies data as integers or strings, therefore, our problem is one of regression for the neural network
# Consequently I should use the mlp regressor
params = {
    'activation': 'tanh',
    'alpha': 0.001,
    'hidden_layer_sizes': (100, 100),
    'learning_rate_init': 0.001,
}

mlp = MLPRegressor(solver='adam', max_iter=10000, **params)
mlp.fit(X_train,y_train)


In [97]:
# Evaluate the model
import numpy as np
from sklearn.metrics import mean_squared_error
predict_test = mlp.predict(X_test)
test_set_rsquared = mlp.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
print("MSE:", mean_squared_error(y_test, predict_test))
print('RMSE: ', test_set_rmse) # Fairly good at data prediction

R_squared value:  0.09021872975208434
MSE: 1.1682098625948054
RMSE:  1.080837574566505


In [60]:
# Tune the Hyerparameters
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# Define the hyperparameters you want to tune
params = {
    'hidden_layer_sizes': [(10,10,10), (20,20,20), (30,30,30), (20, 20), (10, 20)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

# Use GridSearchCV to search over the hyperparameter grid
grid_search = GridSearchCV(mlp, param_grid=params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)
print("Best MSE:", grid_search.best_score_)

# Evaluate the model on the validation set using the best hyperparameters
mlp = grid_search.best_estimator_

Best Hyperparameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (30, 30, 30), 'learning_rate_init': 0.01}
Best MSE: -1.1731360744922879


In [61]:
best_score = 0
best_mse = 0
best_rmse = 0

params = grid_search.best_params_
for i in range(40):
    estimator = MLPRegressor(solver='adam', max_iter=10000, **params)
    estimator.fit(X_train, y_train)
    predict_test = estimator.predict(X_test)
    test_set_rsquared = estimator.score(X_test, y_test)
    test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
    if test_set_rsquared > best_score:
        best_score = test_set_rsquared
        best_mse = mean_squared_error(y_test, predict_test)
        best_rmse = test_set_rmse
        
        
print("R2:", best_score)
print("MSE:", best_mse)
print("RMSE:", best_rmse)
    # print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
    # print("MSE:", mean_squared_error(y_test, predict_test))
    # print('RMSE: ', test_set_rmse) 
    

R2: 0.06277157937313427
MSE: 1.480380017977984
RMSE: 1.2167086824618225


In [62]:
# Evaluate the hyper parameter tuned model
import numpy as np
from sklearn.metrics import mean_squared_error
predict_test = mlp.predict(X_test)
test_set_rsquared = mlp.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
print("MSE:", mean_squared_error(y_test, predict_test))
print('RMSE: ', test_set_rmse) # Fairly good at data prediction


R_squared value:  -0.0353849923040046
MSE: 1.6354212268722577
RMSE:  1.2788358873883146


In [63]:
# Save Model
import pickle
from common import NN_FILENAME
pickle.dump(mlp, open('saved_models/' + NN_FILENAME, 'wb'))