In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load the raw player/season table; the path is relative to the notebook's working directory
df = pd.read_csv('data/all_seasons.csv')

###

In [3]:
# Preview the raw frame (pandas shows only the first and last rows of a long frame)
df

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Randy Livingston,HOU,22.0,193.04,94.800728,Louisiana State,USA,1996,2,...,3.9,1.5,2.4,0.3,0.042,0.071,0.169,0.487,0.248,1996-97
1,1,Gaylon Nickerson,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,USA,1994,2,...,3.8,1.3,0.3,8.9,0.030,0.111,0.174,0.497,0.043,1996-97
2,2,George Lynch,VAN,26.0,203.20,103.418976,North Carolina,USA,1993,1,...,8.3,6.4,1.9,-8.2,0.106,0.185,0.175,0.512,0.125,1996-97
3,3,George McCloud,LAL,30.0,203.20,102.058200,Florida State,USA,1989,1,...,10.2,2.8,1.7,-2.7,0.027,0.111,0.206,0.527,0.125,1996-97
4,4,George Zidek,DEN,23.0,213.36,119.748288,UCLA,USA,1995,1,...,2.8,1.7,0.3,-14.1,0.102,0.169,0.195,0.500,0.064,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12839,12839,Joel Embiid,PHI,29.0,213.36,127.005760,Kansas,Cameroon,2014,1,...,33.1,10.2,4.2,8.8,0.057,0.243,0.370,0.655,0.233,2022-23
12840,12840,John Butler Jr.,POR,20.0,213.36,86.182480,Florida State,USA,Undrafted,Undrafted,...,2.4,0.9,0.6,-16.1,0.012,0.065,0.102,0.411,0.066,2022-23
12841,12841,John Collins,ATL,25.0,205.74,102.511792,Wake Forest,USA,2017,1,...,13.1,6.5,1.2,-0.2,0.035,0.180,0.168,0.593,0.052,2022-23
12842,12842,Jericho Sims,NYK,24.0,208.28,113.398000,Texas,USA,2021,2,...,3.4,4.7,0.5,-6.7,0.117,0.175,0.074,0.780,0.044,2022-23


In [4]:
# Count missing values per column before deciding how to handle them
df.isnull().sum()

Unnamed: 0              0
player_name             0
team_abbreviation       0
age                     0
player_height           0
player_weight           0
college              1854
country                 0
draft_year              0
draft_round             0
draft_number            0
gp                      0
pts                     0
reb                     0
ast                     0
net_rating              0
oreb_pct                0
dreb_pct                0
usg_pct                 0
ts_pct                  0
ast_pct                 0
season                  0
dtype: int64

In [5]:
# Drop every row that contains a missing value. Per the null counts above, only
# `college` has missing entries (1854 rows), so those are the rows removed here.
df = df.dropna()

In [6]:
# Columns that are neither model inputs nor prediction targets in this notebook
columns_to_delete = [
    'Unnamed: 0',
    'player_name',
    'country',
    'draft_round',
    'oreb_pct',
    'dreb_pct',
    'usg_pct',
    'ts_pct',
    'ast_pct',
]

# axis=1 drops by column label; the result is a new frame without those columns
df = df.drop(columns_to_delete, axis=1)

In [7]:
# Inspect the frame after the column drop
df

Unnamed: 0,team_abbreviation,age,player_height,player_weight,college,draft_year,draft_number,gp,pts,reb,ast,net_rating,season
0,HOU,22.0,193.04,94.800728,Louisiana State,1996,42,64,3.9,1.5,2.4,0.3,1996-97
1,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,1994,34,4,3.8,1.3,0.3,8.9,1996-97
2,VAN,26.0,203.20,103.418976,North Carolina,1993,12,41,8.3,6.4,1.9,-8.2,1996-97
3,LAL,30.0,203.20,102.058200,Florida State,1989,7,64,10.2,2.8,1.7,-2.7,1996-97
4,DEN,23.0,213.36,119.748288,UCLA,1995,22,52,2.8,1.7,0.3,-14.1,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12839,PHI,29.0,213.36,127.005760,Kansas,2014,3,66,33.1,10.2,4.2,8.8,2022-23
12840,POR,20.0,213.36,86.182480,Florida State,Undrafted,Undrafted,19,2.4,0.9,0.6,-16.1,2022-23
12841,ATL,25.0,205.74,102.511792,Wake Forest,2017,19,71,13.1,6.5,1.2,-0.2,2022-23
12842,NYK,24.0,208.28,113.398000,Texas,2021,58,52,3.4,4.7,0.5,-6.7,2022-23


In [8]:
# Keep only the starting year of each season label (e.g. '1996-97' -> 1996) as an integer.
# Casting to str first keeps this cell re-runnable: on a second run the column is
# already integer, and the .str accessor would otherwise raise.
df['season'] = df['season'].astype(str).str.split('-').str[0].astype(int)

In [9]:
# Inspect the frame after `season` was reduced to its starting year
df

Unnamed: 0,team_abbreviation,age,player_height,player_weight,college,draft_year,draft_number,gp,pts,reb,ast,net_rating,season
0,HOU,22.0,193.04,94.800728,Louisiana State,1996,42,64,3.9,1.5,2.4,0.3,1996
1,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,1994,34,4,3.8,1.3,0.3,8.9,1996
2,VAN,26.0,203.20,103.418976,North Carolina,1993,12,41,8.3,6.4,1.9,-8.2,1996
3,LAL,30.0,203.20,102.058200,Florida State,1989,7,64,10.2,2.8,1.7,-2.7,1996
4,DEN,23.0,213.36,119.748288,UCLA,1995,22,52,2.8,1.7,0.3,-14.1,1996
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12839,PHI,29.0,213.36,127.005760,Kansas,2014,3,66,33.1,10.2,4.2,8.8,2022
12840,POR,20.0,213.36,86.182480,Florida State,Undrafted,Undrafted,19,2.4,0.9,0.6,-16.1,2022
12841,ATL,25.0,205.74,102.511792,Wake Forest,2017,19,71,13.1,6.5,1.2,-0.2,2022
12842,NYK,24.0,208.28,113.398000,Texas,2021,58,52,3.4,4.7,0.5,-6.7,2022


In [10]:
# Keep only drafted players: both draft columns use the string 'Undrafted' as a
# sentinel, so a row is kept only when neither column carries it.
condition = (df['draft_year'] != 'Undrafted') & (df['draft_number'] != 'Undrafted')

# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[condition].copy()

# With the sentinel rows gone, both columns hold numeric strings and can be cast
df['draft_year'] = df['draft_year'].astype(int)
df['draft_number'] = df['draft_number'].astype(int)

In [11]:
# List the distinct draft positions that remain after filtering.
# NOTE(review): the output includes 0, which does not look like a real pick
# number - confirm what it encodes in the source data.
df['draft_number'].unique()

array([ 42,  34,  12,   7,  22,  47,   4,   1,   3,  11,  33,  26,  35,
        17,  25,  28,  49,  23,   2,  15,  27,  37,  38,  13,  18,   9,
        21,  43,  14,  10,   6,  57,  52,  32,  39,  46,  29,  16,  45,
        30,  20,  48,   5,  79,  41,  51,  36,  75,  56,   8,  24, 165,
        19,  58,  40,  44, 139,  54, 124,  50, 160,  63,  31,  69,  53,
        55, 120,  87,  66,  82,  60,  59,  78,   0])

In [12]:
# Seasons elapsed since the draft: season starting year minus draft year.
# assign() builds a new frame instead of writing a column into a frame that may be
# a filtered slice; 'season' and 'draft_year' are then dropped because the derived
# column replaces them.
df = (
    df.assign(years_in_nba=df['season'] - df['draft_year'])
      .drop(columns=['season', 'draft_year'])
)

In [13]:
# Inspect the frame with the new `years_in_nba` column appended at the end
df

Unnamed: 0,team_abbreviation,age,player_height,player_weight,college,draft_number,gp,pts,reb,ast,net_rating,years_in_nba
0,HOU,22.0,193.04,94.800728,Louisiana State,42,64,3.9,1.5,2.4,0.3,0
1,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,34,4,3.8,1.3,0.3,8.9,2
2,VAN,26.0,203.20,103.418976,North Carolina,12,41,8.3,6.4,1.9,-8.2,3
3,LAL,30.0,203.20,102.058200,Florida State,7,64,10.2,2.8,1.7,-2.7,7
4,DEN,23.0,213.36,119.748288,UCLA,22,52,2.8,1.7,0.3,-14.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12836,BKN,31.0,198.12,99.790240,Virginia,33,74,7.6,2.2,1.4,-1.0,8
12838,TOR,23.0,198.12,92.986360,Iowa,41,9,1.0,0.4,0.3,1.0,1
12839,PHI,29.0,213.36,127.005760,Kansas,3,66,33.1,10.2,4.2,8.8,8
12841,ATL,25.0,205.74,102.511792,Wake Forest,19,71,13.1,6.5,1.2,-0.2,5


In [14]:
column_to_move = 'years_in_nba'

# Current column order as a plain Python list
columns = list(df.columns)

# Pull the column out of its current slot and re-insert it at index 6, i.e. as the
# 7th column, directly after 'draft_number'
columns.insert(6, columns.pop(columns.index(column_to_move)))

# Select the columns in the new order
df = df[columns]

In [15]:
# Inspect the frame after moving `years_in_nba` next to `draft_number`
df

Unnamed: 0,team_abbreviation,age,player_height,player_weight,college,draft_number,years_in_nba,gp,pts,reb,ast,net_rating
0,HOU,22.0,193.04,94.800728,Louisiana State,42,0,64,3.9,1.5,2.4,0.3
1,WAS,28.0,190.50,86.182480,Northwestern Oklahoma,34,2,4,3.8,1.3,0.3,8.9
2,VAN,26.0,203.20,103.418976,North Carolina,12,3,41,8.3,6.4,1.9,-8.2
3,LAL,30.0,203.20,102.058200,Florida State,7,7,64,10.2,2.8,1.7,-2.7
4,DEN,23.0,213.36,119.748288,UCLA,22,1,52,2.8,1.7,0.3,-14.1
...,...,...,...,...,...,...,...,...,...,...,...,...
12836,BKN,31.0,198.12,99.790240,Virginia,33,8,74,7.6,2.2,1.4,-1.0
12838,TOR,23.0,198.12,92.986360,Iowa,41,1,9,1.0,0.4,0.3,1.0
12839,PHI,29.0,213.36,127.005760,Kansas,3,8,66,33.1,10.2,4.2,8.8
12841,ATL,25.0,205.74,102.511792,Wake Forest,19,5,71,13.1,6.5,1.2,-0.2


In [16]:
# Identify the top 20 colleges
top_colleges = df['college'].value_counts().nlargest(20).index

# Replace colleges not in the top 20 with 'other'
df.loc[~df['college'].isin(top_colleges), 'college'] = 'other'
unique_colleges = df['college'].unique()
print(unique_colleges)

['Louisiana State' 'other' 'North Carolina' 'UCLA' 'Michigan' 'Duke'
 'Kansas' 'Indiana' 'Michigan State' 'Georgetown' 'Syracuse'
 'Georgia Tech' 'Maryland' 'Kentucky' 'Villanova' 'Arizona' 'Connecticut'
 'Florida' 'Wake Forest' 'Texas' 'Stanford']


In [17]:
# Inspect the frame after rare colleges were replaced by 'other'
df

Unnamed: 0,team_abbreviation,age,player_height,player_weight,college,draft_number,years_in_nba,gp,pts,reb,ast,net_rating
0,HOU,22.0,193.04,94.800728,Louisiana State,42,0,64,3.9,1.5,2.4,0.3
1,WAS,28.0,190.50,86.182480,other,34,2,4,3.8,1.3,0.3,8.9
2,VAN,26.0,203.20,103.418976,North Carolina,12,3,41,8.3,6.4,1.9,-8.2
3,LAL,30.0,203.20,102.058200,other,7,7,64,10.2,2.8,1.7,-2.7
4,DEN,23.0,213.36,119.748288,UCLA,22,1,52,2.8,1.7,0.3,-14.1
...,...,...,...,...,...,...,...,...,...,...,...,...
12836,BKN,31.0,198.12,99.790240,other,33,8,74,7.6,2.2,1.4,-1.0
12838,TOR,23.0,198.12,92.986360,other,41,1,9,1.0,0.4,0.3,1.0
12839,PHI,29.0,213.36,127.005760,Kansas,3,8,66,33.1,10.2,4.2,8.8
12841,ATL,25.0,205.74,102.511792,Wake Forest,19,5,71,13.1,6.5,1.2,-0.2


In [18]:
# List the distinct team abbreviations present in the filtered data
unique_teams = df['team_abbreviation'].unique()
print(unique_teams)

['HOU' 'WAS' 'VAN' 'LAL' 'DEN' 'ORL' 'CHH' 'MIL' 'DET' 'POR' 'DAL' 'UTA'
 'SEA' 'BOS' 'IND' 'SAS' 'MIA' 'NJN' 'LAC' 'GSW' 'PHI' 'NYK' 'TOR' 'ATL'
 'PHX' 'MIN' 'CHI' 'SAC' 'CLE' 'MEM' 'NOH' 'CHA' 'NOK' 'OKC' 'BKN' 'NOP']


In [19]:
# Check column dtypes; any column still reported as `object` needs encoding before modelling
df.dtypes

team_abbreviation     object
age                  float64
player_height        float64
player_weight        float64
college               object
draft_number           int32
years_in_nba           int32
gp                     int64
pts                  float64
reb                  float64
ast                  float64
net_rating           float64
dtype: object

In [20]:
# The two columns that are still object dtype (see the dtypes output above)
categorical_columns = ['team_abbreviation', 'college']

# Convert categorical variables to dummy/indicator variables: each listed column is
# replaced by one indicator column per distinct value
df = pd.get_dummies(df, columns=categorical_columns)

In [21]:
# Split the data into features (X) and target variables (y)
X = df.drop(columns=['gp', 'pts', 'reb', 'ast', 'net_rating'])
y = df[['gp', 'pts', 'reb', 'ast', 'net_rating']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Loop over candidate models to see which works best for each regression target

In [22]:
def train_evaluate_model(model, X_train, y_train, X_test, y_test, target_variable):
    """Fit ``model`` on one target column and score it on the test split.

    The model is fitted in place, so the object passed in is trained when this
    function returns.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_train, X_test : feature tables used for fitting and for prediction
    y_train, y_test : target tables indexable by ``target_variable``
    target_variable : key of the target column to fit and to score against

    Returns
    -------
    tuple
        ``(mse, r2)``: the results of ``mean_squared_error`` and ``r2_score``
        comparing the true test values with the model's predictions.
    """
    model.fit(X_train, y_train[target_variable])
    predicted = model.predict(X_test)
    actual = y_test[target_variable]
    return mean_squared_error(actual, predicted), r2_score(actual, predicted)

# Models for each target variable
target_variables = ['gp', 'pts', 'reb', 'ast', 'net_rating']

# (label printed in the report, factory returning a fresh unfitted estimator).
# A new estimator is built for every target so no fitted state carries over.
# The randomized ensembles get a fixed seed so the comparison is reproducible,
# matching the seeded train/test split.
model_factories = [
    ("Random Forest", lambda: RandomForestRegressor(random_state=42)),
    ("SVR", SVR),
    ("KNeighborsRegressor", KNeighborsRegressor),
    ("GradientBoostingRegressor", lambda: GradientBoostingRegressor(random_state=42)),
    ("Linear Regression", LinearRegression),
    ("Lasso", Lasso),
]

for target_variable in target_variables:
    print(f"Results for {target_variable}:")

    # Fit and score every candidate on this single target
    for model_name, make_model in model_factories:
        mse, r2 = train_evaluate_model(
            make_model(), X_train, y_train, X_test, y_test, target_variable
        )
        print(f"{model_name} - MSE: {mse}, R2: {r2}")

    print("\n")

Results for gp:
Random Forest - MSE: 474.3113240551304, R2: 0.14844363159043483
SVR - MSE: 556.9952884228273, R2: -3.291035595376357e-06
KNeighborsRegressor - MSE: 510.3105442176871, R2: 0.08381231533782085
GradientBoostingRegressor - MSE: 492.83930461394635, R2: 0.11517936181981792
Linear Regression - MSE: 528.7460578094974, R2: 0.050714056435230526
Lasso - MSE: 529.3371664345449, R2: 0.04965280765504643


Results for pts:
Random Forest - MSE: 12.389272679138319, R2: 0.6494601465738776
SVR - MSE: 28.828306019250434, R2: 0.18433709320745295
KNeighborsRegressor - MSE: 17.754302040816324, R2: 0.4976629705188057
GradientBoostingRegressor - MSE: 20.678683105509144, R2: 0.4149210585173301
Linear Regression - MSE: 26.686147482509654, R2: 0.2449469416571567
Lasso - MSE: 28.73044522416425, R2: 0.18710594894693766


Results for reb:
Random Forest - MSE: 2.044579490006299, R2: 0.667443633200197
SVR - MSE: 4.478364068005121, R2: 0.2715820094341349
KNeighborsRegressor - MSE: 2.9779081632653064, R2

In [29]:
# Fit a single multi-output forest on all target columns at once. The fixed seed
# makes the fitted model (which is saved afterwards) reproducible across runs.
rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions: one column per target, in the column order of y_train
rf_predictions = rf_model.predict(X_test)

# Calculate error metrics for each target variable. The target order is taken from
# y_train itself so that prediction column i is always scored against the true
# column it was trained on, even if the target list changes upstream.
target_variables = list(y_train.columns)
for i, target_variable in enumerate(target_variables):
    mse = mean_squared_error(y_test[target_variable], rf_predictions[:, i])
    r2 = r2_score(y_test[target_variable], rf_predictions[:, i])
    print(f"Random Forest - {target_variable} - MSE: {mse}, R2: {r2}")

Random Forest - gp - MSE: 475.2711336167801, R2: 0.14672043439224147
Random Forest - pts - MSE: 13.319590356009071, R2: 0.6231379055080941
Random Forest - reb - MSE: 2.331737454333586, R2: 0.6207366160452826
Random Forest - ast - MSE: 1.2230186561476442, R2: 0.6826560136832726
Random Forest - net_rating - MSE: 93.64692575592089, R2: -0.00061871264555724


In [30]:
# Save the trained multi-output random forest to a file in the current working
# directory; joblib.dump returns the list of file names it wrote
joblib.dump(rf_model, 'nba_random_forest_model.joblib')

['nba_random_forest_model.joblib']