In [325]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from joblib import dump, load

In [326]:
#import data and view data frame
data = pd.read_csv(
    "../Resources/player_info_reduced.csv")

data = pd.DataFrame(data)



In [327]:
# Save Target columns 
target = data[['2526_salary_predicted', '2627_salary_predicted', '1920_salary_adjusted','2021_salary_adjusted', 
        '2122_salary_adjusted', '2223_salary_adjusted','2324_salary', '2425_salary_predicted','1819_salary_adjusted']]

# Ensure data types are appropriate
target = target.astype(float)

# isolate features
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                         '1920_salary_adjusted','2021_salary_adjusted', '2122_salary_adjusted', '1819_salary_adjusted', '2223_salary_adjusted',
                         '2324_salary', '2425_salary_predicted'])


# Ensure data types are appropriate
features = pd.get_dummies(features).astype(float)

target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [328]:
# Create scaler instance
scaler = StandardScaler()

# # separate preprocessed data into features and target arrays
# y = data['1819_salary_adjusted']np.reshape(-1,1)
# x = data.drop(columns='1819_salary_adjusted')

# y = np.reshapeX[:,1].reshape(-1,1)

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
scaled_y.shape

(199, 9)

In [329]:
#Isolate target values
y1 = scaled_y[:,8]


# split preprocessed data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y1, random_state=8)

In [330]:
# create linear regression model
reg = SGDRegressor(alpha=0.001, penalty=None)
reg.fit(X_train, y_train)


X_test

array([[-0.18699978, -0.25159379, -0.2902111 , ..., -0.52343375,
        -0.61869619,  0.06219757],
       [-0.18699978, -0.25159379, -0.2902111 , ...,  2.36742205,
         1.82998954, -0.71716385],
       [-0.18699978, -0.25159379, -0.2902111 , ..., -0.68532168,
        -1.10219465, -0.65111628],
       ...,
       [-0.18699978, -0.25159379, -0.2902111 , ...,  1.37296766,
         0.03636623, -0.41334499],
       [-0.18699978, -0.25159379, -0.2902111 , ...,  0.97981127,
         0.41068762,  0.35280691],
       [-0.18699978, -0.25159379, -0.2902111 , ...,  2.11302674,
         1.09694349,  0.71386701]])

In [331]:
# examine the error rate of our models predictions
error = y_test - reg.predict(X_test)
error

array([ 0.28477366,  0.26966085, -0.22662136, -0.09775528, -0.08903692,
       -0.52887644, -0.0746366 , -0.21284437,  0.10195222, -0.46983624,
        0.9244948 , -0.50776216, -0.20407808, -0.13437141,  0.25535018,
        1.22777289,  0.57381544, -0.00178651,  1.21441158, -0.19357952,
        0.14353114,  0.0842046 , -0.17543493,  0.30641584, -0.03245713,
       -0.1442302 , -0.06420444, -0.25926206,  0.32809834,  0.27984715,
        0.4131723 ,  0.63949959,  0.12636916, -0.45588315, -0.07037939,
        0.22961493,  0.55279522,  0.17067714,  0.61190955,  0.0869343 ,
       -0.35168314, -1.61795304,  0.16500918,  0.17098465,  0.03067192,
       -0.50681357,  0.03022381,  0.16645066,  0.31892432, -0.03044853])

In [332]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # setting squared=False gives RMSE directly
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


Mean Absolute Error: 0.32314999783610093
Mean Squared Error: 0.21134209262853987
Root Mean Squared Error: 0.4597195804276123
R-squared: 0.7678760503877717


## These results indicate that the model can predict with great accuracy, player salaries for the 2018-2019 season. Prediction  R^2 is high at 0.76

## Will export this as our model

## now to repeat this with minmaxscaler

In [333]:
# Adjust the data scaling 
# Create scaler instance
scaler2 = MinMaxScaler()


#fit MinMaxScaler to features and then scale data
data_scaler_2x = scaler2.fit(features)
scaled_2x = data_scaler_2x.transform(features)

#fit MinMaxScaler to targets and then scale data
data_scaler_2y = scaler2.fit(target)
scaled_2y = data_scaler_2y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [334]:
#Isolate new target values
y1 = scaled_2y[:,8]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_data2x, y1, random_state=22)



In [335]:
# create linear regression model
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [336]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.06909897581059922
Mean Squared Error: 0.008650577233438954
Root Mean Squared Error: 0.008650577233438954
R-squared: 0.8706774843886059


### MinMaxScaler is slightly benficial in reducing error. begin testing on further years

In [337]:
# add 2018-2019 data to feature data
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                         '1920_salary_adjusted','2021_salary_adjusted', '2122_salary_adjusted', '2223_salary_adjusted',
                         '2324_salary', '2425_salary_predicted'])

In [338]:
# # Adjust the data scaling 
# # Create scaler instance
# scaler2 = MinMaxScaler()


# #fit MinMaxScaler to features and then scale data
# data_scaler_2x = scaler2.fit(features)
# scaled_2x = data_scaler_2x.transform(features)

# #fit MinMaxScaler to targets and then scale data
# data_scaler_2y = scaler2.fit(target)
# scaled_2y = data_scaler_2y.transform(target)
# target.columns

In [339]:
# Create scaler instance
scaler = StandardScaler()

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
scaled_y.shape

(199, 9)

In [340]:
#Isolate new target values
y2 = scaled_y[:,2]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y2, random_state=71)



In [341]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [342]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.4582518329420074
Mean Squared Error: 0.3699813177336172
Root Mean Squared Error: 0.3699813177336172
R-squared: 0.650802783980037


## Model loses accuracy (r^2 0.84 -> 0.64) but error remains low for the 2019-2020 season

In [343]:
# add 2019-2020 data to feature data
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                         '2021_salary_adjusted', '2122_salary_adjusted', '2223_salary_adjusted',
                         '2324_salary', '2425_salary_predicted'])

# Create scaler instance
scaler = StandardScaler()

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [344]:
#Isolate new target values (2020-2021 season)
y2 = scaled_y[:,3]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y2, random_state=9)


In [345]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [346]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.3044522051210098
Mean Squared Error: 0.20002422955061672
Root Mean Squared Error: 0.20002422955061672
R-squared: 0.8014121457463522


## R^2 jumps back up tp 0.8 for the 2020-2021 season


In [347]:
# add 2020-2021 data to feature data
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                          '2122_salary_adjusted', '2223_salary_adjusted',
                         '2324_salary', '2425_salary_predicted'])

# Create scaler instance
scaler = StandardScaler()

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [348]:
#Isolate new target values (2021-2022 season)
y2 = scaled_y[:,4]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y2, random_state=33)


In [349]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [350]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")



Mean Absolute Error: 0.21932937905018035
Mean Squared Error: 0.07455405643158329
Root Mean Squared Error: 0.07455405643158329
R-squared: 0.9297679453689236


# r^ now at 0.93 for 2021-2022 season

# lets try to predict the current season now

In [367]:
# add all previous salary data to feature data
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                         '2324_salary', '2425_salary_predicted'])



# Create scaler instance
scaler = StandardScaler()

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [354]:
#Isolate new target values (current salary)
y2 = scaled_y[:,6]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y2, random_state=4)


In [355]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [356]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

dump(reg, 'sgd_model_NBA_Salary.joblib')

Mean Absolute Error: 0.47273548918423275
Mean Squared Error: 0.5129116989441238
Root Mean Squared Error: 0.5129116989441238
R-squared: 0.5923797236104504


['sgd_model_NBA_Salary.joblib']

## Now to test our model on a group of players due for new contracts this season


### 2024 predictions!

In [357]:
#import test data
pred_test_data= pd.read_csv(
    "salary_prediction_2024.csv")


pred_df = pd.DataFrame(pred_test_data)
pred_df.columns

Index(['Unnamed: 0', 'player_name', '0304_salary_adjusted',
       '0405_salary_adjusted', '0506_salary_adjusted', '0607_salary_adjusted',
       '0708_salary_adjusted', '0809_salary_adjusted', '0910_salary_adjusted',
       '1011_salary_adjusted', '1112_salary_adjusted', '1213_salary_adjusted',
       '1314_salary_adjusted', '1415_salary_adjusted', '1516_salary_adjusted',
       '1617_salary_adjusted', '1718_salary_adjusted', '1819_salary_adjusted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', 'CONTRACT_START', 'CONTRACT_END', 'AGE', 'GP',
       'W', 'L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM',
       'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF',
       '+/-'],
      dtype='object')

In [369]:
#Prep test data

# isolate features
features2 = pred_df.drop(columns=['Unnamed: 0', 'player_name'])


# Ensure data types are appropriate
features2 = pd.get_dummies(features).astype(float)

# features2.columns
target.columns


Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

### in trying to use separate data to predict future salaries I am having trouble unscaling the predicted results

In [370]:
#Load model
loaded_model = load('sgd_model_NBA_Salary.joblib')

# use model on ptest data
predictions = loaded_model.predict(features2)

# re-establish scaler
data_scaler_yy = scaler.fit(target['2324_salary'])

# Reshape the predictions to 2D
predictions_2D = predictions.reshape(-1, 1)

predictions_2D

#inverse transform predictions (remove scaling)
unscaled_predictions = data_scaler_yy.inverse_transform(predictions_2D)

# Flatten the unscaled predictions if you prefer them in a 1D format
unscaled_predictions = unscaled_predictions.ravel()

# Save unscaled predictions as a DataFrame
df_unscaled_predictions = pd.DataFrame({'Unscaled Predictions': unscaled_predictions})
df_unscaled_predictions



ValueError: Expected 2D array, got 1D array instead:
array=[47607350. 47607350.        0.        0.        0.        0.        0.
        0.        0.        0.        0.        0.        0.        0.
        0.        0. 30800000.        0.        0.        0.  6479000.
  6479000.        0.        0.        0.        0. 29682540. 29682540.
        0.        0.        0. 47649433. 47649433. 47649433. 47649433.
 10000000. 10000000. 24360000.  8000000.  8000000.        0.        0.
        0.        0.        0.  3196448.  3835738.  3835738.  3835738.
  3835738.        0.        0.  6802950.  6802950.  6802950.  3196448.
  3196448. 25000000. 25000000.        0.        0.  3196448.  3196448.
  3196448.  5734280. 11710818. 11710818.        0.        0.  3196448.
  3196448.        0. 35640000. 35640000. 51915615. 28600000.        0.
        0. 36861707.        0.        0.        0.        0.        0.
        0.        0.  3196448.  3196448.        0.        0.  3196448.
  3196448.        0.        0.        0.        0.        0. 31500000.
 31500000. 45640084.        0.        0.        0.  1300000.        0.
        0.        0.        0.        0.  3196448. 37037037.        0.
        0.  6146342.        0.        0.        0. 43219440. 10489600.
        0. 45640084. 18518519. 39270150.        0.  5000000.  3196448.
 45183960.        0.        0.        0. 40600080. 15435000. 17000000.
        0.        0.        0.        0.        0.        0. 22321429.
        0.  3196448.        0.        0.        0. 29320988.        0.
        0.        0.  3196448.  3196448.  3196448.        0. 14704938.
        0.  5000000.        0. 10489600. 11692308.        0.  3500000.
  4000000.  4000000. 20000000. 28226880.        0.        0. 27586207.
 19928571. 19928571. 20357143. 20357143. 11000000.        0.        0.
 23487629.  4000000.        0.        0.        0.        0.        0.
        0.        0.        0.  2891467.        0.  9423869.        0.
 40806300.        0. 17000000.        0.        0.        0.        0.
        0.        0.  2528233.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.