## Prediction model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
salary_df = pd.read_csv("dataset_for_prediction.csv")
#salary_df[(salary_df["Year"]==2000) & (salary_df["Tm"]=="SEA")]

In [3]:
teams = salary_df[["Tm", "Year", "Salaries"]]
teams = teams.groupby(["Tm", "Year"]).sum()
teams.rename(columns={'Salaries':'Teamcap'}, inplace=True)
dataset = pd.merge(left=salary_df, right=teams, left_on=["Tm", "Year"], right_on=["Tm", "Year"])
dataset['Wage%'] = round(dataset['Salaries'] / dataset['Teamcap'], 2)

#### Creating dummy variables ####

In [4]:

data_dum = pd.get_dummies(columns=["Pos", "Tm", "Teamcap"], data=dataset)
prediction_data = data_dum.drop(columns = ["Player"])
print(prediction_data)

       Unnamed: 0  Year   Age     G      MP   PER    TS%   3PAr    FTr  ORB%  \
0               0  2000  24.0  82.0  3070.0  20.6  0.570  0.288  0.282   3.2   
1               1  2000  23.0  27.0   361.0   4.3  0.310  0.147  0.042   1.6   
2              18  2000  30.0  81.0  2899.0  21.1  0.551  0.077  0.380   2.8   
3              61  2000  32.0  80.0  2129.0  12.7  0.550  0.004  0.563  12.8   
4             108  2000  27.0  81.0  2909.0  17.8  0.534  0.162  0.194   4.3   
5             124  2000  22.0  44.0   447.0  15.8  0.517  0.033  0.557  13.1   
6             133  2000  31.0  68.0  1488.0  15.6  0.539  0.000  0.303  14.0   
7               2  2000  29.0  82.0  2593.0  17.4  0.524  0.223  0.257   2.3   
8              78  2000  25.0  61.0   879.0   7.2  0.451  0.478  0.235   4.1   
9              98  2000  22.0  73.0  2583.0  19.8  0.550  0.255  0.409   3.5   
10            101  2000  24.0  79.0  1797.0  13.8  0.527  0.002  0.260  11.0   
11            126  2000  23.0  82.0  300

#### Spliting data for testing and using StandardScaler ####

In [5]:
from sklearn.model_selection import train_test_split
X = prediction_data.copy().drop(columns=["Wage%", "Salaries"])
y = prediction_data["Wage%"]

X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardized_data = scaler.fit_transform(X_train)
standardized_test = scaler.fit_transform(X_test)

In [7]:
def getPlayerData(name, year=9999):    
    testPlayer = data_dum[data_dum["Player"] == name]
    if year!=9999:
        testPlayer = testPlayer[testPlayer["Year"] == year]
    testPlayer = testPlayer.drop(columns=["Player", "Salaries", "Wage%"])
    return testPlayer

def getPlayerDataWSalary(name, year=9999):    
    testPlayer = data_dum[data_dum["Player"] == name]
    if year!=9999:
        testPlayer = testPlayer[testPlayer["Year"] == year]
    return testPlayer
#rayAllen_pred = rf.predict(rayAllen)
#print(rayAllen_pred)

### Predicting using Random Forest ###

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix

#TODO
rf = RandomForestRegressor(n_estimators=100, max_depth=4,random_state=0).fit(X_train, y_train)

In [9]:
print(rf.predict(getPlayerData("Kobe Bryant", 2005)))
print(getPlayerDataWSalary("Kobe Bryant", 2005)["Wage%"].iloc[0])

[0.2061345]
0.21


### Predicting using Linear Regression ###

In [10]:
from sklearn.linear_model import LinearRegression
#"Linear regression does not respect the bounds of 0. It's linear, always and everywhere.
#It may not be appropriate for values that need to be close to 0 but are strictly positive."
#https://stats.stackexchange.com/questions/145383/getting-negative-predicted-values-after-linear-regression

lr = LinearRegression()
lr.fit(standardized_data, y_train)
print(lr.coef_)

[-2.04677585e-02  4.35982177e+10  3.03210968e-03 -7.54425617e-03
 -1.43272736e-02  3.54151619e-03 -1.86413944e-04 -1.75464525e-03
  6.84296955e-04 -6.73389435e-03 -7.83598423e-03  1.20776892e-02
  2.50518322e-04  1.41803920e-03  1.50370598e-03  1.45649910e-03
 -2.70456076e-03 -1.50839090e-02 -1.58346891e-02  4.32934761e-02
 -7.95102119e-03  1.15736723e-02 -6.75797462e-04 -5.68854809e-03
  7.82537460e-03 -2.50454794e+10 -1.16696911e+11  4.06742096e-04
  1.26898358e+10  3.94279737e+10 -2.09808350e-05  3.36178941e+10
  9.37978435e+10  1.30558014e-03 -1.68657303e-03  4.21857638e+09
  1.95386410e-02 -1.32381916e-04  4.17788829e+09  9.80221620e+09
 -1.35122636e+10 -8.43054056e-03 -6.05964661e-03  1.76715851e-03
  1.35762691e-02 -3.49736214e-03 -1.91440204e+10 -4.82811928e-02
  2.00764026e+10  2.03407571e+10  1.98408854e+10  1.91106085e+10
  2.04129964e+10 -7.99482300e+10 -1.42801544e+10  6.74352645e+09
  2.10557995e+10 -1.26448582e+11 -1.41624306e+10 -5.72883165e+10
  3.14703721e+10  1.02126

In [11]:
print(lr.predict(getPlayerData("Kobe Bryant", 2005)))
print(getPlayerDataWSalary("Kobe Bryant", 2005)["Wage%"].iloc[0])

[3.72358209e+12]
0.21


###  Predicting using RidgeRegression  ###

In [12]:
solvers = ['svd', 'cholesky', 'lsqr', 'sparse_cg']
#best_model = clf
abs_error = 100
for solver in solvers: 
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_absolute_error
    clf = Ridge(alpha=1.0, solver=solver)
    clf.fit(X, y)
    print(solver)
    error = mean_absolute_error(y_test, clf.predict(standardized_test))
    print(error)
    if(error < abs_error):
        abs_error = error
        best_model = clf



svd
0.33597110223213933
cholesky
0.33597110222247856
lsqr
0.10374638457165947
sparse_cg
0.11997893082735962


In [13]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
clf = Ridge(alpha=1.0)
clf.fit(X, y)
clf.predict(getPlayerData("Kobe Bryant", 2005))

mean_absolute_error(y_test, clf.predict(standardized_test))

0.33597110222247856

### Predicting using Lasso ###

In [14]:
from sklearn.linear_model import Lasso
standardized_data
lasso = Lasso()
lasso.fit(X_train,y_train)
train_score=lasso.score(X_train,y_train)
test_score=lasso.score(X_test,y_test)
coeff_used = np.sum(lasso.coef_!=0)
print ("training score:", train_score )
print ("test score: ", test_score)
print ("number of features used: ", coeff_used)

error = mean_absolute_error(y_test, lasso.predict(X_test))
print("Lasso error: ", error)

lasso001 = Lasso(alpha=0.01, max_iter=10e5)
lasso001.fit(X_train,y_train)
train_score001=lasso001.score(X_train,y_train)
test_score001=lasso001.score(X_test,y_test)
coeff_used001 = np.sum(lasso001.coef_!=0)
print ("\n" + "training score for alpha=0.01:", train_score001 )
print ("test score for alpha =0.01: ", test_score001)
print ("number of features used: for alpha =0.01:", coeff_used001)
error = mean_absolute_error(y_test, lasso001.predict(X_test))
print("Lasso001 error: ", error)


lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(standardized_data,y_train)
train_score00001=lasso00001.score(X_train,y_train)
test_score00001=lasso00001.score(X_test,y_test)
coeff_used00001 = np.sum(lasso00001.coef_!=0)
print ("\n" + "training score for alpha=0.0001:", train_score00001)
print ("test score for alpha =0.0001: ", test_score00001)
print ("number of features used: for alpha =0.0001:", coeff_used00001)
error = mean_absolute_error(y_test, lasso00001.predict(X_test))
print("Lasso00001 error: ", error)



training score: 0.6305092013763102
test score:  0.5793043415120044
number of features used:  3
Lasso error:  0.026115197704178243

training score for alpha=0.01: 0.6726995200759209
test score for alpha =0.01:  0.6227283546622191
number of features used: for alpha =0.01: 13
Lasso001 error:  0.024931069897492168

training score for alpha=0.0001: -54424.56883446725
test score for alpha =0.0001:  -49843.12192908813
number of features used: for alpha =0.0001: 546
Lasso00001 error:  13.853363690893412


### Predicting Key Players Salaries ###
#### Predicting new contract salary by year ####
Chosen players:
* Anthony Davis
* Damian Lillard
* Giannis Antetokounmpo

In [18]:
players = ["Anthony Davis", "Damian Lillard", "Giannis Antetokounmpo", "Chris Paul"]
algorithms = [rf, lr, clf, lasso, lasso001]

for algorithm in algorithms:
    print(algorithm)
    for player in players:
        salaryNow = getPlayerDataWSalary(player, 2019)["Wage%"].iloc[0]
        predictedSalary = round(algorithm.predict(getPlayerData(player, 2019))[0],2)
        print("Player: " + str(player)+"; "+ "Current Wage%: "+ str(salaryNow) +"; "+"Predicted next season Wage%: "+ str(predictedSalary))
    print("\n")



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)
Player: Anthony Davis; Current Wage%: 0.22; Predicted next season Wage%: 0.21
Player: Damian Lillard; Current Wage%: 0.22; Predicted next season Wage%: 0.21
Player: Giannis Antetokounmpo; Current Wage%: 0.17; Predicted next season Wage%: 0.21
Player: Chris Paul; Current Wage%: 0.19; Predicted next season Wage%: 0.21


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Player: Anthony Davis; Current Wage%: 0.22; Predicted next season Wage%: 30439115367739.04
Player: Damian Lillard; Current Wage%: 0.22; Predic