In [246]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from joblib import dump

In [194]:
#import data and view data frame
data = pd.read_csv(
    "../Resources/player_info_reduced.csv")

data = pd.DataFrame(data)

data.columns



Index(['Unnamed: 0', 'player_name', '0304_salary_adjusted',
       '0405_salary_adjusted', '0506_salary_adjusted', '0607_salary_adjusted',
       '0708_salary_adjusted', '0809_salary_adjusted', '0910_salary_adjusted',
       '1011_salary_adjusted', '1112_salary_adjusted', '1213_salary_adjusted',
       '1314_salary_adjusted', '1415_salary_adjusted', '1516_salary_adjusted',
       '1617_salary_adjusted', '1718_salary_adjusted', '1819_salary_adjusted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '2526_salary_predicted', '2627_salary_predicted', 'CONTRACT_START',
       'CONTRACT_END', 'AVG_SALARY', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS',
       'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB',
       'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', '+/-'],
      dtype='object')

In [195]:
# Save Target columns 
target = data[['2526_salary_predicted', '2627_salary_predicted', '1920_salary_adjusted','2021_salary_adjusted', 
        '2122_salary_adjusted', '2223_salary_adjusted','2324_salary', '2425_salary_predicted','1819_salary_adjusted']]

# Ensure data types are appropriate
target = target.astype(float)

# isolate features
features = data.drop(columns=['Unnamed: 0', 'player_name','2526_salary_predicted', '2627_salary_predicted', 'AVG_SALARY',
                         '1920_salary_adjusted','2021_salary_adjusted', '2122_salary_adjusted', '2223_salary_adjusted',
                         '2324_salary', '2425_salary_predicted'])

# Ensure data types are appropriate
features = pd.get_dummies(features).astype(float)

target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [196]:
# Create scaler instance
scaler = StandardScaler()

# # separate preprocessed data into features and target arrays
# y = data['1819_salary_adjusted']np.reshape(-1,1)
# x = data.drop(columns='1819_salary_adjusted')

# y = np.reshapeX[:,1].reshape(-1,1)

#fit standardscaler to features and then scale data
data_scaler_x = scaler.fit(features)
scaled_x = data_scaler_x.transform(features)

#fit standardscaler to targets and then scale data
data_scaler_y = scaler.fit(target)
scaled_y = data_scaler_y.transform(target)
scaled_y.shape

(199, 9)

In [197]:
#Isolate target values
y1 = scaled_y[:,8]


# split preprocessed data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y1, random_state=8)

In [209]:
# create linear regression model
reg = SGDRegressor(alpha=0.001, penalty=None)
reg.fit(X_train, y_train)


X_test

array([[-0.18699978, -0.25159379, -0.2902111 , ..., -0.61594114,
        -0.5563093 , -1.38204281],
       [-0.18699978, -0.25159379, -0.2902111 , ..., -0.66219483,
        -1.08659793, -0.07430076],
       [-0.18699978, -0.25159379, -0.2902111 , ...,  1.83550459,
        -0.07281084,  1.26866666],
       ...,
       [-0.18699978, -0.25159379, -0.2902111 , ..., -0.01464313,
         1.6740223 ,  0.18548638],
       [-0.18699978, -0.25159379, -0.2902111 , ..., -0.68532168,
         0.58225159, -0.95493514],
       [-0.18699978, -0.25159379, -0.2902111 , ...,  1.25733342,
         0.95657297,  0.49370841]])

In [199]:
# examine the error rate of our models predictions
error = y_test - reg.predict(X_test)
error

array([ 0.05669398,  0.10346433, -0.09914245, -0.05066857, -0.06702902,
       -0.17842097, -0.0706557 , -0.03121507,  0.02391737, -0.16386644,
        0.3492552 , -0.20126041, -0.07161554, -0.05222593,  0.11649955,
        0.56784745,  0.2122353 ,  0.00976054,  0.56440074, -0.09447099,
        0.09143484, -0.02244478, -0.05700027,  0.04439217, -0.02790478,
       -0.01997723, -0.04588122, -0.10245889,  0.08601163,  0.07109885,
        0.17893128,  0.20397616,  0.0031014 , -0.11245167, -0.09959995,
        0.04349514,  0.26579732,  0.05162081,  0.19687921,  0.00792622,
       -0.11697337, -0.43341066,  0.05519011,  0.07705919,  0.00344275,
       -0.14009629, -0.06370521,  0.01127063,  0.13569087,  0.03006024])

In [247]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # setting squared=False gives RMSE directly
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

dump(reg, 'sgd_model_NBA_Salary.joblib')

Mean Absolute Error: 0.6297552805457559
Mean Squared Error: 0.6671254046756894
Root Mean Squared Error: 0.8167774511307774
R-squared: 0.3772565782125539


['sgd_model_NBA_Salary.joblib']

## These results indicate that the model can predict with great accuracy, player salaries for the 2018-2019 season. Prediction  R^2 is high at 0.88

## Will export this as our model

## now to repeat this with minmaxscaler

In [219]:
# Adjust the data scaling 
# Create scaler instance
scaler2 = MinMaxScaler()


#fit MinMaxScaler to features and then scale data
data_scaler_2x = scaler2.fit(features)
scaled_2x = data_scaler_2x.transform(features)

#fit MinMaxScaler to targets and then scale data
data_scaler_2y = scaler2.fit(target)
scaled_2y = data_scaler_2y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [224]:
#Isolate new target values
y1 = scaled_2y[:,8]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_data2x, y1, random_state=22)



In [226]:
# create linear regression model
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [227]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.06539014517464721
Mean Squared Error: 0.009078517058704699
Root Mean Squared Error: 0.009078517058704699
R-squared: 0.8642799627851064


### Minmaxscaler is slightly benficial in reducing error. begin testing on further years

In [231]:
# Adjust the data scaling 
# Create scaler instance
scaler2 = MinMaxScaler()


#fit MinMaxScaler to features and then scale data
data_scaler_2x = scaler2.fit(features)
scaled_2x = data_scaler_2x.transform(features)

#fit MinMaxScaler to targets and then scale data
data_scaler_2y = scaler2.fit(target)
scaled_2y = data_scaler_2y.transform(target)
target.columns

Index(['2526_salary_predicted', '2627_salary_predicted',
       '1920_salary_adjusted', '2021_salary_adjusted', '2122_salary_adjusted',
       '2223_salary_adjusted', '2324_salary', '2425_salary_predicted',
       '1819_salary_adjusted'],
      dtype='object')

In [232]:
#Isolate new target values
y2 = scaled_y[:,2]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_data2x, y2, random_state=71)



In [241]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [242]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.5075138893585455
Mean Squared Error: 0.487281518574296
Root Mean Squared Error: 0.487281518574296
R-squared: 0.5400920491162864


## Model loses accuracy but error remains low for the 2019-2020 season

In [243]:
#Isolate new target values
y2 = scaled_y[:,3]

# REPEAT - separate preprocessed data into features and target arrays using re-scaled data
X_train, X_test, y_train, y_test = train_test_split(scaled_data2x, y2, random_state=71)


In [244]:
# create linear regression model 
reg = SGDRegressor(alpha=0.001, penalty= None)
reg.fit(X_train, y_train)


reg

In [245]:
# Test model
y_pred = reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)  
r2 = r2_score(y_test, y_pred)

#Print Results
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

Mean Absolute Error: 0.6297552805457559
Mean Squared Error: 0.6671254046756894
Root Mean Squared Error: 0.6671254046756894
R-squared: 0.3772565782125539


## Accuracy continues to drop for the 2020-2021 season and loss increases
