In [2]:
# Setup autoreload
%load_ext autoreload
%autoreload 2

# Give yourself access to common
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# Gather training and testing data and process it to be configured for time intervals
from common import *
from sklearn.model_selection import train_test_split
import pandas as pd

nba = get_next_year_external_data()

FUT_SAL_CLASS = 'future_salary_class'
max = nba[NEXT_Y_SAL].max()
min = nba[NEXT_Y_SAL].min()
nba[FUT_SAL_CLASS] = nba[NEXT_Y_SAL].apply(lambda x: get_salary_class(x,max,min))
feats = get_extern_features()
feats.append(FUT_SAL_CLASS)

X = pd.DataFrame(columns=feats)
y = pd.DataFrame()

for i, row in nba.iterrows() :
    next_val = nba.loc[((nba[P_NAME] == row[P_NAME]) & (nba[SZN_START_Y] == (row[SZN_START_Y]+1)))]  
    if not next_val.empty :
        nba.loc[i,'y'] = next_val.index[0]
    else :
        nba.loc[i,'y'] = np.nan

X = nba.copy(deep=True)
X = X.dropna()
y = nba.loc[X['y']]

In [11]:
# Normalize the data
# print(X.columns)
# print(y.columns)
X = X[feats]
y = y[feats]
print(X.columns)
print(y.columns)
# Scale the stuff down
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
y_scaled = scaler.transform(y)
y_scaled = y_scaled[:,:(len(get_base_features()))]

# Sanity check the scaler
print((X['FG']).iloc[0])
print(X_scaled[0, X.columns.get_loc('FG')])
inv = scaler.inverse_transform(X_scaled)
print(inv[0, X.columns.get_loc('FG')])


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, random_state=42)

Index(['MP', 'PTS', 'Age', 'games', 'games_started', 'PER', 'FTr', 'AST',
       'STL', 'TRB', 'FT', '3P', 'FG', 'height', 'weight', 'all_star_total',
       'all_star_enc', 'all_nba_enc', 'all_nba_total', 'draft_pick',
       'champion', 'conference_champ', 'mvp', 'mvp_rank', 'mvp_total',
       'player_week_enc', 'player_week_total', 'dpoy', 'dpoy_rank',
       'dpoy_total', 'future_salary_class'],
      dtype='object')
Index(['MP', 'PTS', 'Age', 'games', 'games_started', 'PER', 'FTr', 'AST',
       'STL', 'TRB', 'FT', '3P', 'FG', 'height', 'weight', 'all_star_total',
       'all_star_enc', 'all_nba_enc', 'all_nba_total', 'draft_pick',
       'champion', 'conference_champ', 'mvp', 'mvp_rank', 'mvp_total',
       'player_week_enc', 'player_week_total', 'dpoy', 'dpoy_rank',
       'dpoy_total', 'future_salary_class'],
      dtype='object')
655.0
2.228594445658938
655.0


In [10]:
# Fit the model
from sklearn.neural_network import MLPRegressor
import numpy as np

# MLPClassifier only classifies data as integers or strings, therefore, our problem is one of regression for the neural network
# Consequently I should use the mlp regressor
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), solver='adam', max_iter=1000)
mlp.fit(X_train,y_train)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_squared_error
predict_test = mlp.predict(X_test)
test_set_rsquared = mlp.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
print('RMSE: ', test_set_rmse) # Fairly good at data prediction
y_test

R_squared value:  0.5384459712465142
RMSE:  0.7034877538815848


array([[ 0.8784,  0.5664,  1.503 , ...,  0.9078, -0.7322,  0.5788],
       [ 0.8065,  0.8959,  1.2436, ...,  0.9593, -0.2724,  0.9473],
       [ 0.3332,  0.2244,  0.7249, ...,  0.2809, -0.7722,  0.3237],
       ...,
       [ 0.0577, -0.0595, -0.3124, ..., -0.3546,  0.6072, -0.0505],
       [ 0.6994,  0.8628, -0.5718, ...,  0.384 ,  0.6272,  0.9643],
       [ 0.4734,  0.2058,  0.7249, ...,  0.3754,  0.1074,  0.1423]])

In [6]:
# Denormalize the data
predict_test = mlp.predict((X_scaled[0]).reshape(1,-1))
print(X_train.shape[1])
print(predict_test)
tmp = np.zeros((1,X_train.shape[1]))
tmp[:,:predict_test.shape[1]] = predict_test
print(tmp)
undone = scaler.inverse_transform(tmp)
print(undone.flatten().tolist()[:13])
print(y[get_base_features()].iloc[0])

31
[[-0.09648197  0.20043841  1.39347494 -0.1630938   0.14945895  0.90856153
   1.87347198 -0.91617976 -0.72312919  1.29120111  0.83793324 -1.47905973
   0.21048881]]
[[-0.09648197  0.20043841  1.39347494 -0.1630938   0.14945895  0.90856153
   1.87347198 -0.91617976 -0.72312919  1.29120111  0.83793324 -1.47905973
   0.21048881  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.        ]]
[1592.0039279936177, 797.4355141107494, 31.57772408588736, 60.22206248863617, 41.55932615025075, 18.75443648550864, 0.6288436335958985, 17.1602377040725, 27.327419003784147, 559.3262723279156, 233.8634095665488, -33.360441622082206, 299.0329253147766]
MP               848.000
PTS              540.000
Age               35.000
games             26.000
games_started     26.000
PER               23.300
FTr                0.462
AST               28.000
STL               1

In [7]:
# Compute the confidence interval of each of the features
from common import *

# Get the unscaled predictions
predict_full = mlp.predict(X_scaled)
tmp = np.zeros(X_scaled.shape)
tmp[:,:predict_full.shape[1]] = predict_full
predicted_unscaled_vals = scaler.inverse_transform(tmp)
predicted_unscaled_vals = predicted_unscaled_vals[:,:predict_full.shape[1]]

# compute the RMSE for each feature in the output vector
actual_unscaled_vals = y[get_base_features()].values
rmse = np.sqrt(np.square(predicted_unscaled_vals - actual_unscaled_vals))

# compute the percentilse of the RMSE to get the confidence interval of each predicted feature
ci_up = np.percentile(rmse, 95, axis=0) # 95th percentile
ci_low = np.percentile(rmse, 5, axis=0) # 5th percentile
print("Off by at most this much:", ci_up.flatten().tolist())
print(np.max(actual_unscaled_vals, axis=0))
print(np.min(actual_unscaled_vals, axis=0)) 
# print(ci_low.flatten().tolist()) 

# TODO: validate these findings with the very basic network to see if it predicts that these stats will land us in our desired class
np.set_printoptions(suppress=True, precision=4)
print(confidence_interval_numpy(predicted_unscaled_vals, actual_unscaled_vals)) # requires the assumption of gaussianity, but seems to be working?


print(bootstrap_confidence_interval(predicted_unscaled_vals, actual_unscaled_vals)) # should be able to work without the assumption of gaussianity
# computing for rmse, so says that the true root mean squared error has a 95% of landing in these intervals
# residuals are likely far smaller because you have the negatives balancing things out so it is going to be less accurate for computing the confidence interval

residuals = predicted_unscaled_vals - actual_unscaled_vals
ci = np.quantile(residuals, 1 - 0.05, axis=0)
print("HELLO")
print(ci)

# I want to say that my prediction is between these 2 bounds with 95% confidence
# The two functions I provided seem to find the rmse of each parameter with 95% confidence
# the first and 4th method I try do not scale the mean of the data and therefore, I don't think they necessarily apply to giving me the confidence interval I desire



# DO A CASE STUDY on an individual
# ANALYZE THIS MODEL ON BASELINE AND EXTERNAL




Off by at most this much: [1173.7277934161475, 572.6871336437432, 1.1032783140089961, 36.630759509685305, 44.36668716823427, 5.623357765732388, 0.19639209141556396, 166.26071943551509, 51.64756898086105, 269.47194499486983, 139.16873661189015, 64.96848336941906, 213.5963551831732]
[3485.  2832.    40.    85.    83.    45.3    6.   925.   225.  1226.
  756.   402.   978. ]
[  1.    0.   19.    1.    0.  -40.7   0.    0.    0.    0.    0.    0.
   0. ]
[[477.2444 497.728 ]
 [212.2477 222.4122]
 [  0.3876   0.4157]
 [ 13.7705  14.3844]
 [ 18.194   18.9543]
 [  2.1477   2.2671]
 [  0.0721   0.0783]
 [ 54.4207  57.5753]
 [ 18.7665  19.6952]
 [ 96.427  101.3828]
 [ 48.2369  50.8067]
 [ 21.1657  22.3882]
 [ 80.6696  84.4823]]
{'column_0': (477.3099324203418, 497.83566762939677), 'column_1': (212.34921493108195, 222.45566885636774), 'column_2': (0.38794279021124956, 0.4158607239573993), 'column_3': (13.7620954530206, 14.39440929745578), 'column_4': (18.218059582242542, 18.994197698621157), 'co

In [8]:
from sklearn.ensemble import RandomForestRegressor 
random_forest_model = RandomForestRegressor(n_estimators = 60, random_state = 0)

random_forest_model.fit(X_train, y_train) 

y_pred = random_forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("MSE: ",mse)
print("RMSE: ",rmse)


MSE:  0.5062924385242248
RMSE:  0.7115422956678153
