In [142]:
import pandas as pd

# Path to your CSV file
file_path = 'NBA_Player_Salary_2022_2023.csv'  # Replace with your file path

# Load the CSV file into a DataFrame
game = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
game.head()

Unnamed: 0.1,Unnamed: 0,Player Name,Salary,Position,Age,Team,GP,GS,MP,FG,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,0,Stephen Curry,48070014,PG,34,GSW,56,56,34.7,10.0,...,12.5,31.0,5.8,2.0,7.8,0.192,7.5,0.1,7.5,4.7
1,1,John Wall,47345760,PG,32,LAC,34,3,22.2,4.1,...,17.1,27.0,-0.4,0.7,0.3,0.02,-0.8,-0.4,-1.2,0.1
2,2,Russell Westbrook,47080179,PG,34,LAL/LAC,73,24,29.1,5.9,...,18.4,27.7,-0.6,2.6,1.9,0.044,0.3,-0.1,0.2,1.2
3,3,LeBron James,44474988,PF,38,LAL,55,54,35.5,11.1,...,11.6,33.3,3.2,2.4,5.6,0.138,5.5,0.6,6.1,4.0
4,4,Kevin Durant,44119845,PF,34,BRK/PHO,47,47,35.6,10.3,...,13.4,30.7,4.7,2.1,6.8,0.194,6.0,1.2,7.1,3.9


In [143]:
pip install pandas scikit-learn statsmodels

Note: you may need to restart the kernel to use updated packages.


# Set Up

In [144]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score

# Load the Dataset

In [145]:
# Load the dataset
df = pd.read_csv('NBA_Player_Salary_2022_2023.csv')
df = df[df['Salary'] > 1000000]

# Feature Engineering

In [146]:
# Create new feature: Game Started / Game Played
df['GS/GP'] = df['GS'] / df['GP']

# Handling missing values
df.fillna(df.mean(), inplace=True)

# Handling infinite values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(df.mean(), inplace=True)

df['Position'] = df['Position'].str.split('-').str[0]

  df.fillna(df.mean(), inplace=True)
  df.fillna(df.mean(), inplace=True)


# Feature Selection

In [147]:
selected_features = ['Position', 'Age', 'FGA', 'TOV%', 'PTS', 'PER', 'VORP', 'FG%', 'USG%', 'GS/GP']
X = df[selected_features]
y = df['Salary']

In [148]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), ['Position']),
        ('num', StandardScaler(), ['Age', 'FGA', 'TOV%', 'PTS', 'PER', 'VORP', 'FG%', 'USG%', 'GS/GP'])
    ])

# Split the dataset into training and test sets (70:30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [153]:
# Preprocess the training data
X_train_transformed = preprocessor.fit_transform(X_train)
X_train_transformed = np.nan_to_num(X_train_transformed)

# Preprocess the test data
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed = np.nan_to_num(X_test_transformed)

In [150]:
# Fit the GLM model and proceed with predictions and evaluation
glm_model = sm.GLM(y_train, sm.add_constant(X_train_transformed), family=sm.families.Gaussian())
glm_results = glm_model.fit()

In [151]:
# Predict on the test set and evaluate
y_pred = glm_results.predict(sm.add_constant(X_test_transformed))
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 38244011656143.516
R-squared: 0.7315872068392756


In [152]:
print(glm_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Salary   No. Observations:                  274
Model:                            GLM   Df Residuals:                      260
Model Family:                Gaussian   Df Model:                           13
Link Function:               identity   Scale:                      4.4051e+13
Method:                          IRLS   Log-Likelihood:                -4685.6
Date:                Thu, 07 Dec 2023   Deviance:                   1.1453e+16
Time:                        18:38:28   Pearson chi2:                 1.15e+16
No. Iterations:                     3   Pseudo R-squ. (CS):             0.7941
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const       7.894e+06   3.37e+05     23.427      0.0