In [1]:
# Setup autoreload
%load_ext autoreload
%autoreload 2

# Give yourself access to common
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [94]:
# Gather training and testing data and process it to be configured for time intervals
from common import *
from sklearn.model_selection import train_test_split
import pandas as pd

nba = get_next_year_external_data()

FUT_SAL_CLASS = 'future_salary_class'
max = nba[NEXT_Y_SAL].max()
min = nba[NEXT_Y_SAL].min()
nba[FUT_SAL_CLASS] = nba[NEXT_Y_SAL].apply(lambda x: get_salary_class(x,max,min))
feats = get_extern_features()
feats.append(FUT_SAL_CLASS)

X = pd.DataFrame(columns=feats)
y = pd.DataFrame()

for i, row in nba.iterrows() :
    next_val = nba.loc[((nba[P_NAME] == row[P_NAME]) & (nba[SZN_START_Y] == (row[SZN_START_Y]+1)))]  
    if not next_val.empty :
        nba.loc[i,'y'] = next_val.index[0]
    else :
        nba.loc[i,'y'] = np.nan

X = nba.copy(deep=True)
X = X.dropna()
y = nba.loc[X['y']]

In [122]:
# Normalize the data
print(X.columns)
print(y.columns)
X = X[feats]
y = y[feats]

# Scale the stuff down
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
y_scaled = scaler.transform(y)
y_scaled = y_scaled[:,:(len(get_base_features()))]

# Sanity check the scaler
print((X['FG']).iloc[0])
print(X_scaled[0, X.columns.get_loc('FG')])
inv = scaler.inverse_transform(X_scaled)
print(inv[0, X.columns.get_loc('FG')])


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, random_state=42)

Index(['MP', 'PTS', 'Age', 'games', 'games_started', 'PER', 'FTr', 'AST',
       'STL', 'TRB', 'FT', '3P', 'FG', 'height', 'weight', 'all_star_total',
       'all_star_enc', 'all_nba_enc', 'all_nba_total', 'draft_pick',
       'champion', 'conference_champ', 'mvp', 'mvp_rank', 'mvp_total',
       'player_week_enc', 'player_week_total', 'dpoy', 'dpoy_rank',
       'dpoy_total', 'future_salary_class'],
      dtype='object')
Index(['MP', 'PTS', 'Age', 'games', 'games_started', 'PER', 'FTr', 'AST',
       'STL', 'TRB', 'FT', '3P', 'FG', 'height', 'weight', 'all_star_total',
       'all_star_enc', 'all_nba_enc', 'all_nba_total', 'draft_pick',
       'champion', 'conference_champ', 'mvp', 'mvp_rank', 'mvp_total',
       'player_week_enc', 'player_week_total', 'dpoy', 'dpoy_rank',
       'dpoy_total', 'future_salary_class'],
      dtype='object')
655.0
2.228594445658938
655.0


In [123]:
# Fit the model
from sklearn.neural_network import MLPRegressor
import numpy as np

# MLPClassifier only classifies data as integers or strings, therefore, our problem is one of regression for the neural network
# Consequently I should use the mlp regressor
mlp = MLPRegressor(hidden_layer_sizes=(10,10,10), solver='adam', max_iter=1000)
mlp.fit(X_train,y_train)

# Evaluate the model
import numpy as np
from sklearn.metrics import mean_squared_error
predict_test = mlp.predict(X_test)
test_set_rsquared = mlp.score(X_test, y_test)
test_set_rmse = np.sqrt(mean_squared_error(predict_test, y_test))
print('R_squared value: ', test_set_rsquared) # Variances are not nicely correlated
print('RMSE: ', test_set_rmse) # Fairly good at data prediction


R_squared value:  0.544563211106863
RMSE:  0.6983154608035047


In [128]:
# Denormalize the data
predict_test = mlp.predict((X_scaled[0]).reshape(1,-1))
print(X_train.shape[1])
print(predict_test)
tmp = np.zeros((1,X_train.shape[1]))
tmp[:,:predict_test.shape[1]] = predict_test
print(tmp)
undone = scaler.inverse_transform(tmp)
print(undone.flatten().tolist()[:13])
print(y[get_base_features()].iloc[0])

31
[[ 0.09979132  0.24345098  2.00683168 -0.04667311  0.43485831  0.69535791
   1.31129013 -0.29371348 -0.28086932  1.14292482  0.55134321 -0.83101524
   0.25278441]]
[[ 0.09979132  0.24345098  2.00683168 -0.04667311  0.43485831  0.69535791
   1.31129013 -0.29371348 -0.28086932  1.14292482  0.55134321 -0.83101524
   0.25278441  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.        ]]
[1758.7105049997422, 818.1881235394914, 33.94271707986599, 62.44990121115974, 50.22877392637578, 17.776465646143425, 0.53634896346997, 109.60401111655896, 43.457775226849044, 529.090034392435, 200.4914205403015, -0.9430328320497381, 306.4933077602975]
MP               848.000
PTS              540.000
Age               35.000
games             26.000
games_started     26.000
PER               23.300
FTr                0.462
AST               28.000
STL               1

In [137]:
# Compute the confidence interval of each of the features

# Get the unscaled predictions
predict_full = mlp.predict(X_scaled)
tmp = np.zeros(X_scaled.shape)
tmp[:,:predict_full.shape[1]] = predict_full
predicted_unscaled_vals = scaler.inverse_transform(tmp)
predicted_unscaled_vals = predicted_unscaled_vals[:,:predict_full.shape[1]]

# compute the RMSE for each feature in the output vector
actual_unscaled_vals = y[get_base_features()].values
rmse = np.sqrt(np.square(predicted_unscaled_vals - actual_unscaled_vals))

# compute the percentilse of the RMSE to get the confidence interval of each predicted feature
ci_up = np.percentile(rmse, 95, axis=0) # 95th percentile
ci_low = np.percentile(rmse, 5, axis=0) # 5th percentile
print("Off by at most this much:", ci_up.flatten().tolist())
print(np.max(actual_unscaled_vals, axis=0))
print(np.min(actual_unscaled_vals, axis=0)) 
# print(ci_low.flatten().tolist()) 

# TODO: validate these findings with the very basic network to see if it predicts that these stats will land us in our desired class

Off by at most this much: [1167.2935271353363, 575.2583820435141, 0.9230330865829134, 36.4534064944859, 44.51769700447245, 5.647797395913314, 0.19857049557170614, 169.67496835078165, 52.199777292083795, 271.0213703291426, 137.529373735622, 65.06267539248756, 217.14847440028575]
[3485.  2832.    40.    85.    83.    45.3    6.   925.   225.  1226.
  756.   402.   978. ]
[  1.    0.   19.    1.    0.  -40.7   0.    0.    0.    0.    0.    0.
   0. ]


In [130]:
from sklearn.ensemble import RandomForestRegressor 
random_forest_model = RandomForestRegressor(n_estimators = 60, random_state = 0)

random_forest_model.fit(X_train, y_train) 

y_pred = random_forest_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("MSE: ",mse)
print("RMSE: ",rmse)


MSE:  0.5062924385242248
RMSE:  0.7115422956678153
