In [168]:
# Setup autoreload
%load_ext autoreload
%autoreload 2

# Give yourself access to common
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [169]:
# Gather training and testing data and process it to be configured for time intervals
from common import *
from sklearn.model_selection import train_test_split
import pandas as pd

nba = get_next_year_external_data()

FUT_SAL_CLASS = 'future_salary_class'
max = nba[NEXT_Y_SAL].max()
min = nba[NEXT_Y_SAL].min()
nba[FUT_SAL_CLASS] = nba[NEXT_Y_SAL].apply(lambda x: get_salary_class(x,max,min))
feats = get_extern_features()
feats.append(FUT_SAL_CLASS)

X = pd.DataFrame(columns=feats)
y = pd.DataFrame()

for i, row in nba.iterrows() :
    next_val = nba.loc[((nba[P_NAME] == row[P_NAME]) & (nba[SZN_START_Y] == (row[SZN_START_Y]+1)))]  
    if not next_val.empty :
        nba.loc[i,'y'] = next_val.index[0]
    else :
        nba.loc[i,'y'] = np.nan

X = nba.copy(deep=True)
X = X.dropna()
y = nba.loc[X['y']]

In [158]:
# Normalize the data
print(X.columns)
print(y.columns)
X = X[feats]
y = y[feats]

# Scale the stuff down
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
y_scaled = scaler.transform(y)
y_scaled = y_scaled[:,:(len(get_base_features()))]

# Sanity check the scaler
print((X['FG']).iloc[0])
print(X_scaled[0, X.columns.get_loc('FG')])
inv = scaler.inverse_transform(X_scaled)
print(inv[0, X.columns.get_loc('FG')])


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, random_state=42)

Index(['playerName', 'seasonStartYear', 'salary', 'inflationAdjSalary',
       'startYear', 'height', 'weight', 'MP', 'PTS', 'Age', 'games',
       'games_started', 'PER', 'FTr', 'AST', 'STL', 'TRB', 'FT', '3P', 'FG',
       'all_star_total', 'all_star_enc', 'all_nba_enc', 'all_nba_total',
       'draft_pick', 'champion', 'conference_champ', 'mvp', 'mvp_rank',
       'mvp_total', 'player_week_enc', 'player_week_total', 'dpoy',
       'dpoy_rank', 'dpoy_total', 'next_year_salary', 'inflationAdjSalary_log',
       'future_salary_class', 'y'],
      dtype='object')
Index(['playerName', 'seasonStartYear', 'salary', 'inflationAdjSalary',
       'startYear', 'height', 'weight', 'MP', 'PTS', 'Age', 'games',
       'games_started', 'PER', 'FTr', 'AST', 'STL', 'TRB', 'FT', '3P', 'FG',
       'all_star_total', 'all_star_enc', 'all_nba_enc', 'all_nba_total',
       'draft_pick', 'champion', 'conference_champ', 'mvp', 'mvp_rank',
       'mvp_total', 'player_week_enc', 'player_week_total', 'dpoy',


In [159]:
# Fit the model
from sklearn.neural_network import MLPRegressor
import numpy as np

# MLPClassifier only classifies data as integers or strings, therefore, our problem is one of regression for the neural network
# Consequently I should use the mlp regressor
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), solver='adam', max_iter=1000)
mlp.fit(X_train,y_train)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_squared_error
predict_test = mlp.predict(X_test)
test_set_rsquared = mlp.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
print('RMSE: ', test_set_rmse) # Fairly good at data prediction


R_squared value:  0.5430047460879769
RMSE:  0.6990395328852275


In [160]:
# Denormalize the data
predict_test = mlp.predict((X_scaled[0]).reshape(1,-1))
print(X_train.shape[1])
print(predict_test)
tmp = np.zeros((1,X_train.shape[1]))
tmp[:,:predict_test.shape[1]] = predict_test
print(tmp)
undone = scaler.inverse_transform(tmp)
print(undone.flatten().tolist()[:13])
print(y[get_base_features()].iloc[0])

31
[[ 0.15406592  0.44811503  2.14461059 -0.07218649  0.27128318  1.20844438
   0.90593634 -0.12750142 -0.27509674  1.02846222  0.85351602 -0.87813941
   0.42424165]]
[[ 0.15406592  0.44811503  2.14461059 -0.07218649  0.27128318  1.20844438
   0.90593634 -0.12750142 -0.27509674  1.02846222  0.85351602 -0.87813941
   0.42424165  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.        ]]
[1804.8091457537826, 916.9339999194603, 34.47396770479206, 61.9616744628393, 45.259926037131166, 20.130007368850883, 0.4696569039802204, 134.28851197940955, 43.66831592523483, 505.7490213502364, 235.67794614490566, -3.300345707238563, 336.73609208506787]
MP               848.000
PTS              540.000
Age               35.000
games             26.000
games_started     26.000
PER               23.300
FTr                0.462
AST               28.000
STL             

In [175]:
# Compute the confidence interval of each of the features
from common import *

# Get the unscaled predictions
predict_full = mlp.predict(X_scaled)
tmp = np.zeros(X_scaled.shape)
tmp[:,:predict_full.shape[1]] = predict_full
predicted_unscaled_vals = scaler.inverse_transform(tmp)
predicted_unscaled_vals = predicted_unscaled_vals[:,:predict_full.shape[1]]

# compute the RMSE for each feature in the output vector
actual_unscaled_vals = y[get_base_features()].values
rmse = np.sqrt(np.square(predicted_unscaled_vals - actual_unscaled_vals))

# compute the percentilse of the RMSE to get the confidence interval of each predicted feature
ci_up = np.percentile(rmse, 95, axis=0) # 95th percentile
ci_low = np.percentile(rmse, 5, axis=0) # 5th percentile
print("Off by at most this much:", ci_up.flatten().tolist())
print(np.max(actual_unscaled_vals, axis=0))
print(np.min(actual_unscaled_vals, axis=0)) 
# print(ci_low.flatten().tolist()) 

# TODO: validate these findings with the very basic network to see if it predicts that these stats will land us in our desired class
np.set_printoptions(suppress=True, precision=4)
print(confidence_interval_numpy(predicted_unscaled_vals, actual_unscaled_vals)) # requires the assumption of gaussianity, but seems to be working?


print(bootstrap_confidence_interval(predicted_unscaled_vals, actual_unscaled_vals)) # should be able to work without the assumption of gaussianity
# computing for rmse, so says that the true root mean squared error has a 95% of landing in these intervals
# residuals are likely far smaller because you have the negatives balancing things out so it is going to be less accurate for computing the confidence interval

residuals = predicted_unscaled_vals - actual_unscaled_vals
ci = np.quantile(residuals, 1 - 0.05, axis=0)
print("HELLO")
print(ci)

# I want to say that my prediction is between these 2 bounds with 95% confidence
# The two functions I provided seem to find the rmse of each parameter with 95% confidence
# the first and 4th method I try do not scale the mean of the data and therefore, I don't think they necessarily apply to giving me the confidence interval I desire





Off by at most this much: [1172.3564560815996, 569.4829357942618, 1.3896894759170844, 36.59471429489035, 44.37898972929316, 5.597520084498786, 0.2042226668532916, 158.93510736691613, 51.968732395224585, 270.11117542236786, 136.78868259442984, 65.96950758862283, 215.53969882340346]
[3485.  2832.    40.    85.    83.    45.3    6.   925.   225.  1226.
  756.   402.   978. ]
[  1.    0.   19.    1.    0.  -40.7   0.    0.    0.    0.    0.    0.
   0. ]
4772
[[476.9914 497.5006]
 [212.5013 222.6236]
 [  0.5104   0.5384]
 [ 13.7526  14.3679]
 [ 18.3791  19.1324]
 [  2.1031   2.2212]
 [  0.0725   0.0787]
 [ 52.6434  55.7149]
 [ 19.2409  20.1735]
 [ 94.9067  99.88  ]
 [ 47.6874  50.2142]
 [ 20.4182  21.6369]
 [ 81.2126  85.0352]]
{'column_0': (476.6990517898919, 497.41251769766905), 'column_1': (212.7659882709957, 222.63260566861223), 'column_2': (0.5099046510887357, 0.5387244200121231), 'column_3': (13.731411075318098, 14.397275836512582), 'column_4': (18.358303281091086, 19.12155557095763)

In [162]:
from sklearn.ensemble import RandomForestRegressor 
random_forest_model = RandomForestRegressor(n_estimators = 60, random_state = 0)

random_forest_model.fit(X_train, y_train) 

y_pred = random_forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("MSE: ",mse)
print("RMSE: ",rmse)


MSE:  0.5062924385242248
RMSE:  0.7115422956678153
