In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('Final.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Player,season,NBA_Salary,Class,Pos,School,Conf,G,MP,...,PPG,APG,TRB/G,STL/G,TOV/G,BLK/G,3P%,FG%,eFG%,sal_as_%
0,0,A.J. Hammons,2016,650000,4,C,Purdue,Big Ten,33,813.0,...,14.969697,1.121212,8.181818,0.272727,1.969697,2.545455,0.545455,0.592262,0.60119,0.006904
1,1,A.J. Price,2009,62552,4,G,Connecticut,Big East,35,1112.0,...,14.714286,4.685714,3.485714,0.685714,2.685714,0.0,0.401961,0.407674,0.505995,0.001084
2,2,Aaron Brooks,2007,972720,4,G,Oregon,Pac-10,35,1289.0,...,17.742857,4.257143,4.257143,1.371429,2.542857,0.171429,0.40404,0.460043,0.546436,0.017486
3,3,Aaron Gray,2007,427163,4,C,Pittsburgh,Big East,36,1016.0,...,13.861111,1.694444,9.472222,0.416667,1.527778,1.722222,0.0,0.565097,0.565097,0.007679
4,4,Acie Law,2007,203057,4,G,Texas A&M,Big 12,34,1153.0,...,18.058824,4.970588,3.323529,1.147059,2.588235,0.029412,0.457831,0.5,0.543981,0.00365


In [4]:
data_1 = data.rename(columns={'sal_as_%':'sal'})

In [5]:
#Using sklearn to get a baseline model on predicting salary. Since the dependent variable is continuous, we will use regression predictions.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()

In [6]:
#Dropping all irrelevant columns
X = data.drop(['sal_as_%','Player','School','Conf','Unnamed: 0', 'Pos'], axis = 1)

In [7]:
X.head()

Unnamed: 0,season,NBA_Salary,Class,G,MP,FG,FGA,2P,2PA,3P,...,PTS,PPG,APG,TRB/G,STL/G,TOV/G,BLK/G,3P%,FG%,eFG%
0,2016,650000,4,33,813.0,199,336,193,325,6,...,494,14.969697,1.121212,8.181818,0.272727,1.969697,2.545455,0.545455,0.592262,0.60119
1,2009,62552,4,35,1112.0,170,417,88,213,82,...,515,14.714286,4.685714,3.485714,0.685714,2.685714,0.0,0.401961,0.407674,0.505995
2,2007,972720,4,35,1289.0,213,463,133,265,80,...,621,17.742857,4.257143,4.257143,1.371429,2.542857,0.171429,0.40404,0.460043,0.546436
3,2007,427163,4,36,1016.0,204,361,204,361,0,...,499,13.861111,1.694444,9.472222,0.416667,1.527778,1.722222,0.0,0.565097,0.565097
4,2007,203057,4,34,1153.0,216,432,178,349,38,...,614,18.058824,4.970588,3.323529,1.147059,2.588235,0.029412,0.457831,0.5,0.543981


In [8]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from statsmodels.formula.api import ols

In [9]:
#Using Ridge regression to find the most relevant features
ridge = Ridge()
ridge_cv = RidgeCV()

In [10]:
Y = data_1.sal
X_cols = X.to_numpy()

In [11]:
from itertools import combinations
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

In [12]:
#Creating a train-test split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,Y, test_size = .3, random_state = 0)

In [13]:
#Using RFE for feature selection
selector = RFE(estimator = ridge, step = 1)
selector = selector.fit(Xtrain,Ytrain)
selector.ranking_
selected_features = pd.DataFrame({'Features': X.columns, 'Ranking': selector.ranking_})

In [14]:
#Sorting the features by most important
selected_features.sort_values('Ranking')

Unnamed: 0,Features,Ranking
15,TRB,1
28,3P%,1
27,BLK/G,1
26,TOV/G,1
25,STL/G,1
24,TRB/G,1
23,APG,1
22,PPG,1
19,TOV,1
18,BLK,1


In [15]:
#sorting by best features selected by RFE
features = selected_features.query('Ranking == 1')
features

Unnamed: 0,Features,Ranking
2,Class,1
3,G,1
15,TRB,1
17,STL,1
18,BLK,1
19,TOV,1
22,PPG,1
23,APG,1
24,TRB/G,1
25,STL/G,1


In [16]:
list(features['Features'])

['Class',
 'G',
 'TRB',
 'STL',
 'BLK',
 'TOV',
 'PPG',
 'APG',
 'TRB/G',
 'STL/G',
 'TOV/G',
 'BLK/G',
 '3P%',
 'FG%',
 'eFG%']

In [17]:
#Training model on rfe features 
rfe_features_ = Xtrain.loc[:,list(features['Features'])]
ridge_rfe = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10]).fit(rfe_features_, Ytrain)

In [18]:
#Testing model on rfe features
ridge_rfe_t = Xtest.loc[:,list(features['Features'])]
ridge_rfe.score(ridge_rfe_t, Ytest)

0.11854767061061877

In [19]:
#Alternative solution to for feature selection - however it is inefficient
#Finding the optimal combination of features for the highest ridge score
#First creating all combinations

comb_1 = combinations(range(2,21),1)
comb_2 = combinations(range(2,21),2)
comb_3 = combinations(range(2,21),3)
comb_4 = combinations(range(2,21),4)
comb_5 = combinations(range(2,21),5)
comb_6 = combinations(range(2,21),6)
comb_7 = combinations(range(2,21),7)
comb_8 = combinations(range(2,21),8)

list_1 = list(comb_1)
list_2 = list(comb_2)
list_3 = list(comb_3)
list_4 = list(comb_4)
list_5 = list(comb_5)
list_6 = list(comb_6)
list_7 = list(comb_7)
list_8 = list(comb_8)

V = pd.DataFrame(X_cols)

score_list_1 = []
score_list_2 = []
score_list_3 = []
score_list_4 = []
score_list_5 = []
score_list_6 = []
score_list_7 = []
score_list_8 = []

In [20]:
#Creating a list of combinations, fitting the model to those combinations and getting sorting by the highest score
for i in range(0,len(list_1)):
    Z1 = V.iloc[:,list(list_1[i])]
    ridge.fit(Z1,Y)
    score_list_1.append([i,ridge.score(Z1,Y)])
score_df_1 = pd.DataFrame(data = score_list_1, columns = ['index', 'score'])
score_df_1.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
0,0,0.224639


In [21]:
for i in range(0,len(list_2)):
    Z2 = V.iloc[:,list(list_2[i])]
    ridge.fit(Z2,Y)
    score_list_2.append([i,ridge.score(Z2,Y)])
score_df_2 = pd.DataFrame(data = score_list_2, columns = ['index', 'score'])
score_df_2.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
2,2,0.254006


In [22]:
for i in range(0,len(list_3)):
    Z3 = V.iloc[:,list(list_3[i])]
    ridge.fit(Z3,Y)
    score_list_3.append([i,ridge.score(Z3,Y)])
score_df_3 = pd.DataFrame(data = score_list_3, columns = ['index', 'score'])
score_df_3.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
134,134,0.283658


In [23]:
for i in range(0,len(list_4)):
    Z4 = V.iloc[:,list(list_4[i])]
    ridge.fit(Z4,Y)
    score_list_4.append([i,ridge.score(Z4,Y)])
score_df_4 = pd.DataFrame(data = score_list_4, columns = ['index', 'score'])
score_df_4.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
266,266,0.300107


In [24]:
for i in range(0,len(list_5)):
    Z5 = V.iloc[:,list(list_5[i])]
    ridge.fit(Z5,Y)
    score_list_5.append([i,ridge.score(Z5,Y)])
score_df_5 = pd.DataFrame(data = score_list_5, columns = ['index', 'score'])
score_df_5.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
1291,1291,0.309012


In [25]:
for i in range(0,len(list_6)):
    Z6 = V.iloc[:,list(list_6[i])]
    ridge.fit(Z6,Y)
    score_list_6.append([i,ridge.score(Z6,Y)])
score_df_6 = pd.DataFrame(data = score_list_6, columns = ['index', 'score'])
score_df_6.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
5008,5008,0.3244


In [26]:
for i in range(0,len(list_7)):
    Z7 = V.iloc[:,list(list_7[i])]
    ridge.fit(Z7,Y)
    score_list_7.append([i,ridge.score(Z7,Y)])
score_df_7 = pd.DataFrame(data = score_list_7, columns = ['index', 'score'])
score_df_7.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
11165,11165,0.335031


In [27]:
for i in range(0,len(list_8)):
    Z8 = V.iloc[:,list(list_8[i])]
    ridge.fit(Z8,Y)
    score_list_8.append([i,ridge.score(Z8,Y)])
    
score_df_8 = pd.DataFrame(data = score_list_8, columns = ['index', 'score'])
score_df_8.sort_values('score', ascending = False).head(1)

Unnamed: 0,index,score
21828,21828,0.339388


In [28]:
#This is the combination of features that gives the highest score
list_8[21828]

(2, 5, 6, 9, 12, 14, 17, 18)

In [29]:
list(X.iloc[:,[2,5,6,9,12,14,17,18]].head(0))

['Class', 'FG', 'FGA', '3P', 'FTA', 'DRB', 'STL', 'BLK']

In [30]:
features_ = list(X.iloc[:,[2,5,6,9,12,14,17,18]].head(0))
features_

['Class', 'FG', 'FGA', '3P', 'FTA', 'DRB', 'STL', 'BLK']

From the regression analysis we see that the combination of the following features give us the highest r-squared value: Class, FG, FGA, 3P, FTA, DRB, STL, BLKs. 

In [31]:
#Tuning parameters for ridge regression using the updated features.
cols = Xtrain.loc[:,features_]
reg = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 10).fit(cols, Ytrain)



In [32]:
cols_2 = Xtest.loc[:,features_]
ridge_score = reg.score(cols_2,Ytest)
ridge_score

0.1698199782771943

We see that even with the same test set that the features selected from recursive feature elimination that our previous model has a higher score. 

In [33]:
#conducting Least squared regression using the optimal features
Data_1 = data_1.rename(columns = {'3P' : 'three'})
linear_reg = LinearRegression().fit(cols,Ytrain)
linear_reg.score(cols_2, Ytest)

0.1621778647379798

In [34]:
from sklearn.linear_model import LassoCV

In [35]:
lasso = LassoCV(cv = 10).fit(cols,Ytrain)
lasso.score(cols_2,Ytest)

0.16681272752129683

In [36]:
from sklearn.linear_model import ElasticNetCV
encv = ElasticNetCV(cv = 10).fit(cols,Ytrain)
encv.score(cols_2,Ytest)

0.16685717244949216

In [37]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
knr = KNeighborsRegressor(n_neighbors = 27)
knr_ = knr.fit(cols,Ytrain)
knr_.score(cols_2,Ytest)

0.022040159812338866

In [39]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth = 2, random_state = 0)
rf_ = rf.fit(cols, Ytrain)
rf_.score(cols_2,Ytest)



0.07605871818416332

In [40]:
model_scores = {'model': ['linear regression','ridge', 'lasso', 'elastic net','KNeighborsRegressor', 'randomforest'], 'scores': [linear_reg.score(cols_2, Ytest),ridge_score, lasso.score(cols_2,Ytest), encv.score(cols_2,Ytest),knr_.score(cols_2,Ytest),rf_.score(cols_2,Ytest)]}

In [41]:
pd.DataFrame(data = model_scores)

Unnamed: 0,model,scores
0,linear regression,0.162178
1,ridge,0.16982
2,lasso,0.166813
3,elastic net,0.166857
4,KNeighborsRegressor,0.02204
5,randomforest,0.076059


From our test, we see that ridge regression yields the highest accuracy score on the test data.

In [42]:
#Lastly we need to check if we can improve the score by separating the dataset by position
#Creating test-train-split with updated position column
pos = {'G':1, 'C': 0, 'F':0}

data_ = data.replace({'Pos' : pos}).drop(['sal_as_%','Player','School','Conf','Unnamed: 0'], axis = 1)
sal_ = data.replace({'Pos': pos})[['sal_as_%','Pos']]

xtrain_,xtest_,ytrain_,ytest_ = train_test_split(data_,sal_, test_size = 0.3, random_state = 0)

In [43]:
#Querying data by position (stats and salary)
guards = xtrain_.query('Pos == 1') #Guards training set x
guards_sal = ytrain_.query('Pos == 1') #Guards training set y
guards_test = xtest_.query('Pos == 1') #Guards test set x
guards_test_sal = ytest_.query('Pos == 1') #Guards test set y

bigs = xtrain_.query('Pos == 0')
bigs_sal = ytrain_.query('Pos == 0')
bigs_test = xtest_.query('Pos == 0')
bigs_test_sal = ytest_.query('Pos == 0')

In [44]:
features_

['Class', 'FG', 'FGA', '3P', 'FTA', 'DRB', 'STL', 'BLK']

In [45]:
#Using the features found previously for ridge regression for guards

guards_ridge = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10],cv = 5).fit(guards.loc[:,features_], guards_sal)
reg_1 = guards_ridge.predict(guards_test.loc[:,features_])
guards_ridge.score(guards_test.loc[:,features_],guards_test_sal)





0.11441501065033544

In [46]:
#Using the features found previously for ridge regression for bigs
bigs_ridge = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 5).fit(bigs.loc[:,features_], bigs_sal)
reg_2 = bigs_ridge.predict(bigs_test.loc[:,features_])
bigs_ridge.score(bigs_test.loc[:,features_],bigs_test_sal)





0.14796997435958692

In [47]:
predicted = np.concatenate([reg_1,reg_2]) #Concatenating the two predicted sets into one column

In [48]:
predicted_index = np.concatenate([guards_test.index, bigs_test.index]) #Creating an index of the test sets to merge

In [49]:
pred = pd.DataFrame(index = predicted_index, data = predicted, columns = ['predicted_sal', 'Pos']) #Merging index with the predicted sets
pos_pred = pd.merge(pred, ytest_, left_index = True, right_index = True)
pos_pred.head()

Unnamed: 0,predicted_sal,Pos_x,sal_as_%,Pos_y
76,0.00879,1.0,0.007679,1
334,0.007763,1.0,0.007767,1
354,0.024082,1.0,0.005525,1
90,0.003856,1.0,0.003312,1
10,0.031662,1.0,0.034805,1


In [50]:
from sklearn.metrics import r2_score
r2_score(pos_pred['sal_as_%'], pos_pred['predicted_sal']) #Finding r2 value of predicted salaries vs actual

0.16749230126169512

After separating the data by position, our model yields a prediction score (.167) that is lower than our previous model (.169) - thus our previous model is the best predictor for salary.

In [51]:
#Creating an ensemble model that separates by position for only the stats that correlate to higher salaries by position
#We already have the train-test-split queried by position so we will just build the models after those steps

#Creating a model that only uses the features that are different from non-guards
guards_ridge_2 = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10],cv = 5).fit(guards.loc[:,['BLK','STL', 'FTA', 'DRB']], guards_sal)
ridge_reg_1 = guards_ridge_2.predict(guards.loc[:,['BLK','STL','FTA','DRB']]) #Ridge regression predictions of guards training set with differing features

ridge_reg_1_test = guards_ridge_2.predict(guards_test.loc[:,['BLK','STL','FTA','DRB']])



In [52]:
#Using the features 
bigs_ridge_2 = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 5).fit(bigs.loc[:,['BLK','STL', 'FTA', 'DRB']], bigs_sal)
ridge_reg_2 = bigs_ridge_2.predict(bigs.loc[:,['BLK','STL', 'FTA', 'DRB']]) #Ridge regression of bigs training set with differing features
ridge_reg_2_test = bigs_ridge_2.predict(bigs_test.loc[:,['BLK','STL', 'FTA', 'DRB']])



In [53]:
#Creating an index that reflects the training sets
predicted_index2 = np.concatenate([guards.index, bigs.index]) #Index of both sets

predicted_2 = np.concatenate([ridge_reg_1, ridge_reg_2]) #concatenating the predicted values of the training sets
pred2 = pd.DataFrame(index = predicted_index2, data = predicted_2, columns = ['predicted_sal', 'Pos'])
pred2.head()

Unnamed: 0,predicted_sal,Pos
238,0.014654,1.0
306,0.023997,1.0
401,0.031853,1.0
234,0.02461,1.0
190,0.025521,1.0


In [54]:
#Creating an index that reflects the test sets
predicted_index_test = np.concatenate([guards_test.index, bigs_test.index])

#concatenating predicted values of test sets
predicted_2_test = np.concatenate([ridge_reg_1_test, ridge_reg_2_test])
pred2_test = pd.DataFrame(index = predicted_index_test, data = predicted_2_test, columns = ['predicted_sal', 'Pos'])
pred2_test.head()

Unnamed: 0,predicted_sal,Pos
76,0.021126,1.0
334,0.014614,1.0
354,0.010753,1.0
90,0.011799,1.0
10,0.024418,1.0


In [55]:
pos_pred = pd.merge(pred2, xtrain_, left_index = True, right_index = True)
pos_pred_2 = pd.merge(pos_pred, ytrain_, left_index = True, right_index = True)
pos_pred_2.head() #Merged dataframe of the training sets

Unnamed: 0,predicted_sal,Pos_x,season,NBA_Salary,Class,Pos_y,G,MP,FG,FGA,...,APG,TRB/G,STL/G,TOV/G,BLK/G,3P%,FG%,eFG%,sal_as_%,Pos
238,0.014654,1.0,2012,473604,4,1,35,1175.0,171,328,...,1.6,4.228571,1.285714,1.685714,0.285714,0.458824,0.521341,0.640244,0.008159,1
306,0.023997,1.0,2014,507336,3,1,38,1257.0,205,474,...,2.815789,4.052632,1.131579,1.684211,0.657895,0.366864,0.432489,0.49789,0.008045,1
401,0.031853,1.0,2009,3290000,1,1,37,1072.0,230,505,...,3.864865,5.351351,2.081081,3.594595,0.783784,0.274194,0.455446,0.489109,0.057019,1
234,0.02461,1.0,2013,2653080,2,1,32,1086.0,186,430,...,1.8125,7.0625,2.03125,2.0,0.53125,0.373333,0.432558,0.530233,0.045213,1
190,0.025521,1.0,2015,1572360,4,1,38,1408.0,204,427,...,6.657895,3.026316,1.657895,2.157895,0.5,0.316456,0.477752,0.5363,0.022462,1


In [56]:
pos_pred_test = pd.merge(pred2_test, xtest_, left_index = True, right_index = True)
pos_pred_test_2 = pd.merge(pos_pred_test, ytest_, left_index = True, right_index = True)
pos_pred_test_2.head() #Merged dataframe of the test sets

Unnamed: 0,predicted_sal,Pos_x,season,NBA_Salary,Class,Pos_y,G,MP,FG,FGA,...,APG,TRB/G,STL/G,TOV/G,BLK/G,3P%,FG%,eFG%,sal_as_%,Pos
76,0.021126,1.0,2007,427163,4,1,34,1083.0,184,386,...,3.5,4.441176,2.029412,2.176471,0.411765,0.343137,0.476684,0.522021,0.007679,1
334,0.014614,1.0,2006,412718,4,1,34,1127.0,201,446,...,1.411765,5.823529,1.470588,1.617647,0.294118,0.410628,0.450673,0.545964,0.007767,1
354,0.010753,1.0,2013,324203,1,1,32,987.0,202,456,...,0.84375,5.21875,0.71875,1.59375,0.125,0.377358,0.442982,0.486842,0.005525,1
90,0.011799,1.0,2012,192228,4,1,34,1120.0,210,470,...,2.735294,3.5,0.941176,2.352941,0.117647,0.385,0.446809,0.528723,0.003312,1
10,0.024418,1.0,2011,2020200,2,1,38,1193.0,251,535,...,2.947368,6.5,1.078947,2.605263,0.315789,0.291667,0.469159,0.495327,0.034805,1


In [57]:
#After finding the predicted salary from the ensemble model - we use the output (predicted salary)
#as an input with another model for those features that are not significantly different by position

total = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 5).fit(pos_pred_2.loc[:,['predicted_sal','Class','FG','FGA','3P']], pos_pred_2['sal_as_%'])
total.score(pos_pred_test_2.loc[:,['predicted_sal','Class','FG','FGA','3P']], pos_pred_test_2['sal_as_%'])



0.15768303976093123

We see that our model performs significantly worse than our previous two. 

In [58]:
sanity_check = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 5).fit(Xtrain.loc[:,'Class':'eFG%'], Ytrain)
sanity_check_score = sanity_check.score(Xtest.loc[:,'Class':'eFG%'], Ytest)



In [59]:
sanity_check_score

0.14572817027623997

In [60]:
#Creating a new list of features so the old features are excluded
excluded_features = []

for i in list(pos_pred.columns):
    if i not in features_:
        excluded_features.append(i)

In [61]:
excluded_features

['predicted_sal',
 'Pos_x',
 'season',
 'NBA_Salary',
 'Pos_y',
 'G',
 'MP',
 '2P',
 '2PA',
 '3PA',
 'FT',
 'ORB',
 'TRB',
 'AST',
 'TOV',
 'PF',
 'PTS',
 'PPG',
 'APG',
 'TRB/G',
 'STL/G',
 'TOV/G',
 'BLK/G',
 '3P%',
 'FG%',
 'eFG%']

In [62]:
excluded_features.remove('Pos_x')
excluded_features.remove('Pos_y')
excluded_features.remove('NBA_Salary')
excluded_features.remove('season')

In [63]:
Ridge_allCV = RidgeCV(alphas = [1e-3, 1e-2, 1e-1, 1, 10], cv = 5).fit(pos_pred_2.loc[:,excluded_features], pos_pred_2['sal_as_%'])
Ridge_allCV.score(pos_pred_test_2.loc[:,excluded_features], pos_pred_test_2['sal_as_%'])



0.06973678358980662