In [130]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [131]:
#import data and view data frame
data = pd.read_csv(
    "../Resources/freeagent_2024_salary.csv")

data = pd.DataFrame(data)

data.columns


Index(['PLAYER_ID', 'SEASON_ID', 'LEAGUE_ID', 'TEAM_ID', 'TEAM_ABBREVIATION',
       'PLAYER_AGE', 'GP', 'GS', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A',
       'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'Player_Name', 'Salary'],
      dtype='object')

In [132]:
# Drop unnecessary columns
data = data.drop(columns=['Player_Name','TEAM_ABBREVIATION','LEAGUE_ID',])

#change data type for categorical variables
data[['PLAYER_ID','TEAM_ID','SEASON_ID']] = data[['PLAYER_ID','TEAM_ID','SEASON_ID']].astype(str)

data

Unnamed: 0,PLAYER_ID,SEASON_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,...,OREB,DREB,REB,AST,STL,BLK,TOV,PF,PTS,Salary
0,1627783,2016-17,1610612761,23,55,38,859,103,205,0.502,...,64,121,185,17,26,45,33,109,229,1196040.0
1,1627783,2017-18,1610612761,24,81,5,1679,253,498,0.508,...,79,285,364,159,62,42,67,166,589,1312611.0
2,1627783,2018-19,1610612761,25,80,79,2548,519,945,0.549,...,124,425,549,248,73,52,154,241,1354,1544951.0
3,1627783,2019-20,1610612761,26,60,60,2110,500,1104,0.453,...,64,375,439,207,61,53,148,170,1371,2204849.0
4,1627783,2020-21,1610612761,27,56,56,2006,437,961,0.455,...,95,310,405,250,64,37,130,174,1196,30559200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,202066,2018-19,0,33,75,55,2041,208,493,0.422,...,28,188,216,106,76,30,70,204,586,4469063.0
762,202066,2019-20,1610612751,34,62,35,1730,222,587,0.378,...,29,189,218,156,49,29,62,108,641,4767000.0
763,202066,2020-21,1610612741,35,56,25,1528,158,381,0.415,...,29,131,160,124,43,30,58,126,423,4910000.0
764,202066,2021-22,1610612740,36,59,16,1098,111,295,0.376,...,29,114,143,74,43,23,42,81,309,5155500.0


In [110]:
#encode categorical data
data = pd.get_dummies(data)

data

Unnamed: 0,PLAYER_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,...,SEASON_ID_2013-14,SEASON_ID_2014-15,SEASON_ID_2015-16,SEASON_ID_2016-17,SEASON_ID_2017-18,SEASON_ID_2018-19,SEASON_ID_2019-20,SEASON_ID_2020-21,SEASON_ID_2021-22,SEASON_ID_2022-23
0,1627783,1610612761,23,55,38,859,103,205,0.502,1,...,0,0,0,1,0,0,0,0,0,0
1,1627783,1610612761,24,81,5,1679,253,498,0.508,29,...,0,0,0,0,1,0,0,0,0,0
2,1627783,1610612761,25,80,79,2548,519,945,0.549,79,...,0,0,0,0,0,1,0,0,0,0
3,1627783,1610612761,26,60,60,2110,500,1104,0.453,131,...,0,0,0,0,0,0,1,0,0,0
4,1627783,1610612761,27,56,56,2006,437,961,0.455,73,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,202066,0,33,75,55,2041,208,493,0.422,90,...,0,0,0,0,0,1,0,0,0,0
762,202066,1610612751,34,62,35,1730,222,587,0.378,127,...,0,0,0,0,0,0,1,0,0,0
763,202066,1610612741,35,56,25,1528,158,381,0.415,67,...,0,0,0,0,0,0,0,1,0,0
764,202066,1610612740,36,59,16,1098,111,295,0.376,59,...,0,0,0,0,0,0,0,0,1,0


In [111]:
#isolate features
x = data.drop(columns='Salary')

x

Unnamed: 0,PLAYER_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,...,SEASON_ID_2013-14,SEASON_ID_2014-15,SEASON_ID_2015-16,SEASON_ID_2016-17,SEASON_ID_2017-18,SEASON_ID_2018-19,SEASON_ID_2019-20,SEASON_ID_2020-21,SEASON_ID_2021-22,SEASON_ID_2022-23
0,1627783,1610612761,23,55,38,859,103,205,0.502,1,...,0,0,0,1,0,0,0,0,0,0
1,1627783,1610612761,24,81,5,1679,253,498,0.508,29,...,0,0,0,0,1,0,0,0,0,0
2,1627783,1610612761,25,80,79,2548,519,945,0.549,79,...,0,0,0,0,0,1,0,0,0,0
3,1627783,1610612761,26,60,60,2110,500,1104,0.453,131,...,0,0,0,0,0,0,1,0,0,0
4,1627783,1610612761,27,56,56,2006,437,961,0.455,73,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,202066,0,33,75,55,2041,208,493,0.422,90,...,0,0,0,0,0,1,0,0,0,0
762,202066,1610612751,34,62,35,1730,222,587,0.378,127,...,0,0,0,0,0,0,1,0,0,0
763,202066,1610612741,35,56,25,1528,158,381,0.415,67,...,0,0,0,0,0,0,0,1,0,0
764,202066,1610612740,36,59,16,1098,111,295,0.376,59,...,0,0,0,0,0,0,0,0,1,0


In [112]:
# Create scaler instance
scaler = StandardScaler()

#change features to float
x = x.astype(float)

#fit standardscaler
data_scaler_x = scaler.fit(x)

# scale the data
scaled_x = data_scaler_x.transform(x)

In [113]:
#Isolate target values
y = data['Salary'].values.astype(float)


# split preprocessed data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, random_state=8)

In [114]:
# create linear regression model
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


X_test

array([[-0.69510889,  0.33429949,  2.0102407 , ..., -0.38971746,
        -0.38526067,  2.67374946],
       [-0.69677988,  0.33429953, -0.34586256, ..., -0.38971746,
        -0.38526067, -0.37400662],
       [-0.69874708,  0.33429954,  1.22487294, ..., -0.38971746,
        -0.38526067, -0.37400662],
       ...,
       [-0.69752188,  0.33429955,  0.43950519, ..., -0.38971746,
        -0.38526067, -0.37400662],
       [ 1.4360266 ,  0.33429952, -0.86944106, ...,  2.5659615 ,
        -0.38526067, -0.37400662],
       [-0.69680532,  0.33429949,  2.79560845, ..., -0.38971746,
        -0.38526067,  2.67374946]])

In [115]:
# examine the error rate of our models predictions
error = y_test - reg.predict(X_test)
error 

array([ -4162015.24787496,    196085.85073842,  10283445.61933511,
        -3593391.63772166,  -5433770.03384699,  -3772165.57464176,
        -4816612.54311307,   4777851.46260261,  -7857016.7555028 ,
        -8425421.66078928,   -210141.58698586,   1444883.44199349,
       -10047599.72988811,  -5477803.23427044,    616100.55814924,
        -1305388.47783845,  -2751644.80569173,  -5389547.38417881,
         7247308.21933471,  -2556828.49279995,    665351.06809627,
         2651428.80896488,  -8851441.97949778,    630666.24317031,
        -2782830.829037  ,  -1830874.14361798,  -1356160.58974669,
        -1519347.68437644,  -8949149.84738448,  -1447016.48794121,
        -2657004.72194803,  -4692008.11173583,  -1448609.43476857,
        -4220168.86237548,   5373037.19195861,  -3926532.00893236,
        -1576016.64250948,  -8383902.71948547,   4737750.90298286,
        -2469837.29334063,   1978241.85128764,    988396.20565674,
        -3570592.26727797,   2607834.38060481,  -7999516.44415

In [102]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # setting squared=False gives RMSE directly
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 4380629.091902487
Mean Squared Error: 31171121319552.7
Root Mean Squared Error: 5583110.362472937
R-squared: 0.47480930998450666


### These results indicate that the model is still off by  ~3.5 million 

In [116]:
# Adjust the data scaling 
# Create scaler instance
scaler2 = MinMaxScaler()

#fit MinMaxScaler
data_scaler2x = scaler.fit(x)

#scale the data
scaled_data2x = data_scaler2x.transform(x)

data

Unnamed: 0,PLAYER_ID,TEAM_ID,PLAYER_AGE,GP,GS,MIN,FGM,FGA,FG_PCT,FG3M,...,SEASON_ID_2013-14,SEASON_ID_2014-15,SEASON_ID_2015-16,SEASON_ID_2016-17,SEASON_ID_2017-18,SEASON_ID_2018-19,SEASON_ID_2019-20,SEASON_ID_2020-21,SEASON_ID_2021-22,SEASON_ID_2022-23
0,1627783,1610612761,23,55,38,859,103,205,0.502,1,...,0,0,0,1,0,0,0,0,0,0
1,1627783,1610612761,24,81,5,1679,253,498,0.508,29,...,0,0,0,0,1,0,0,0,0,0
2,1627783,1610612761,25,80,79,2548,519,945,0.549,79,...,0,0,0,0,0,1,0,0,0,0
3,1627783,1610612761,26,60,60,2110,500,1104,0.453,131,...,0,0,0,0,0,0,1,0,0,0
4,1627783,1610612761,27,56,56,2006,437,961,0.455,73,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
761,202066,0,33,75,55,2041,208,493,0.422,90,...,0,0,0,0,0,1,0,0,0,0
762,202066,1610612751,34,62,35,1730,222,587,0.378,127,...,0,0,0,0,0,0,1,0,0,0
763,202066,1610612741,35,56,25,1528,158,381,0.415,67,...,0,0,0,0,0,0,0,1,0,0
764,202066,1610612740,36,59,16,1098,111,295,0.376,59,...,0,0,0,0,0,0,0,0,1,0


In [23]:
# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_data2x, y, random_state=22)


In [29]:
# coptimize linear regression model
# change learning rate to 0.01 and use l2 regularization
reg = SGDRegressor(alpha=0.01, penalty= None)
reg.fit(X_train, y_train)


reg

In [30]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # setting squared=False gives RMSE directly
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 4364186.067865628
Mean Squared Error: 36952693054182.46
Root Mean Squared Error: 6078872.679550252
R-squared: 0.5713087890470852
