In [1]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import os

In [7]:
full_game_df = pd.read_csv(os.path.join("feature_eng_data_iter_2.csv"))

In [8]:
sig_cols = list(np.load("sig_cols_ml_iteration_2.npy"))

In [9]:
target = full_game_df["+/-"]

In [10]:
full_game_df.head()

Unnamed: 0,G,Date,Age,Tm,game_location,Opp,game_result,GS,MP,FG,...,More than 3 Fouls,FTR-Above-Avg,Less-Than-3-TOV,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc
0,1,2003-10-29,18,CLE,False,SAC,-14,True,42,12.0,...,False,False,True,0.595238,0.333333,14.638524,117.260788,True,False,True
1,2,2003-10-30,18,CLE,False,PHO,-9,True,40,8.0,...,False,False,False,0.525,0.391667,2.413371,83.732057,False,False,False
2,3,2003-11-01,18,CLE,False,POR,-19,True,39,3.0,...,False,False,True,0.205128,0.136364,2.74459,53.763441,False,False,False
3,4,2003-11-05,18,CLE,True,DEN,-4,True,41,3.0,...,False,False,True,0.170732,0.166667,6.645524,61.188811,False,False,False
4,5,2003-11-07,18,CLE,False,IND,-1,True,43,8.0,...,False,True,False,0.534884,0.46875,1.242708,81.908832,False,True,False


In [11]:
sig_df = full_game_df[sig_cols]

In [12]:
sig_df.head()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,Over-3-BLKs-and-STLs,Played-Above-Avg-Min,Above-Avg-FT%,Career-Stage,Above-Avg-PTS-Per-POSS,More-BLKs-Than-STLs,More-AST-Than-TOV,PTS-Cat,2P%-Above-50%,More than 3 Fouls
0,0.595238,0.333333,14.638524,117.260788,True,False,True,-14,0.6,24.7,...,True,True,False,rising,True,False,True,below 35,True,False
1,0.525,0.391667,2.413371,83.732057,False,False,False,-9,0.471,14.7,...,False,True,False,rising,False,False,True,below 35,True,False
2,0.205128,0.136364,2.74459,53.763441,False,False,False,-19,0.25,5.0,...,False,True,True,rising,False,False,True,below 20,False,False
3,0.170732,0.166667,6.645524,61.188811,False,False,False,-4,0.273,11.2,...,True,True,True,rising,False,True,True,below 20,False,False
4,0.534884,0.46875,1.242708,81.908832,False,True,False,-1,0.444,9.0,...,False,True,True,rising,False,False,False,below 35,False,False


In [13]:
sig_df.replace(True,1,inplace=True)
sig_df.replace(False,0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [14]:
sig_df.select_dtypes(include=["int64"]).columns

Index(['game_result'], dtype='object')

In [15]:
for col in sig_df.select_dtypes(include=["int64"]).columns:
    sig_df[col] = sig_df[col].apply(lambda x: float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
sig_df.dtypes

PTS-Per-Min               float64
%-Avg                     float64
PER-Per-TOV               float64
off_rating                float64
off_rating_above_avg      float64
More-Than-Avg-POSS        float64
Above-Avg-GmSc            float64
game_result               float64
FG%                       float64
GmSc                      float64
TS%                       float64
GmSc-Per-PTS              float64
PTS-Per-POSS              float64
game_result-Per-PTS       float64
PER                       float64
game_location             float64
Opp                        object
Won                       float64
High-3P%-Shooting         float64
High-FG%-Shooting         float64
Over-3-BLKs-and-STLs      float64
Played-Above-Avg-Min      float64
Above-Avg-FT%             float64
Career-Stage               object
Above-Avg-PTS-Per-POSS    float64
More-BLKs-Than-STLs       float64
More-AST-Than-TOV         float64
PTS-Cat                    object
2P%-Above-50%             float64
More than 3 Fo

In [17]:
def label_bin(cat_col,df):
    lb = LabelBinarizer()
    data = lb.fit_transform(df[cat_col])
    classes = lb.classes_
    new_df = pd.DataFrame(data,columns=classes,dtype=float)
    for col in new_df.columns:
        df[col] = new_df[col]
    return lb

In [18]:
label_bins = []
for object_col in sig_df.select_dtypes(include=["object"]).columns:
    object_lb = label_bin(object_col,sig_df)
    object_col_lb_pair = (object_col,object_lb)
    label_bins.append(object_col_lb_pair)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [19]:
sig_df.tail()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
1255,1.027778,0.464357,10.446618,117.088608,1.0,1.0,1.0,10.0,0.571,31.3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1256,0.823529,0.393773,18.12825,120.8981,1.0,0.0,1.0,9.0,0.412,25.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1257,0.852941,0.529692,11.770794,110.687023,1.0,1.0,1.0,-2.0,0.545,24.7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1258,0.470588,0.309667,2.522094,69.93007,0.0,0.0,0.0,2.0,0.316,9.6,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1259,0.588235,0.45,4.925272,94.87666,0.0,0.0,0.0,-15.0,0.467,14.3,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [21]:
label_bins

[('Opp', LabelBinarizer()),
 ('Career-Stage', LabelBinarizer()),
 ('PTS-Cat', LabelBinarizer())]

In [22]:
sig_df.select_dtypes(include=["object"])

Unnamed: 0,Opp,Career-Stage,PTS-Cat
0,SAC,rising,below 35
1,PHO,rising,below 35
2,POR,rising,below 20
3,DEN,rising,below 20
4,IND,rising,below 35
...,...,...,...
1255,MIL,decline,above 35
1256,LAC,decline,below 35
1257,BRK,decline,below 35
1258,LAC,decline,below 20


In [23]:
split = StratifiedShuffleSplit(n_splits=1,test_size=0.33)

In [24]:
ml_models_performances = []

<h2>Perform Machine Learning with Opposition Team Feature Included</h2>

In [25]:
std_scaler = StandardScaler()

In [26]:
one_hot_encoded_sig_df = sig_df.select_dtypes(exclude=["object"])

In [27]:
one_hot_encoded_sig_df.columns

Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [28]:
std_scale_sig_data = std_scaler.fit_transform(one_hot_encoded_sig_df.values)

In [29]:
std_scale_sig_df = pd.DataFrame(std_scale_sig_data,columns=one_hot_encoded_sig_df.columns)

In [30]:
std_scale_sig_df.head()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
0,-0.635326,-0.661242,0.467196,0.480218,0.97337,-0.97337,0.962604,-1.420272,0.846395,0.318334,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
1,-1.005199,-0.248814,-0.919494,-1.059284,-1.027358,-0.97337,-1.038849,-1.035821,-0.304266,-0.975332,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
2,-2.689642,-2.053853,-0.881924,-2.43532,-1.027358,-0.97337,-1.038849,-1.804724,-2.275553,-2.230188,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
3,-2.870773,-1.839605,-0.439444,-2.094377,-1.027358,-0.97337,-1.038849,-0.65137,-2.070397,-1.428115,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
4,-0.953152,0.296179,-1.052281,-1.142999,-1.027358,1.027358,-1.038849,-0.420699,-0.545102,-1.712722,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837


In [31]:
print(f"There are {len(std_scale_sig_df.columns)} columns")
std_scale_sig_df.columns

There are 68 columns


Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [32]:
X, y = std_scale_sig_df.values, target

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

<h4>Linear Regression</h4>

In [34]:
ln = LinearRegression()

In [35]:
ln.fit(X_train,y_train)

LinearRegression()

In [36]:
y_pred = ln.predict(X_test)

In [37]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.37 points


In [38]:
ml_performance = ("lin_reg_w_opp_teams_w/out_pca",rmse)

In [39]:
ml_models_performances.append(ml_performance)

<h4>SGD Regression</h4>

In [40]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [41]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [42]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [43]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'squared_loss'}

In [44]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [45]:
y_pred = best_sgd_reg.predict(X_test)

In [46]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.41 points


In [47]:
ml_performance = ("sgd_reg_w_opp_teams_w/out_pca",rmse)

In [48]:
ml_models_performances.append(ml_performance)

<h4>Random Forest Regression</h4>

In [49]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [50]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [51]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [52]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 50}

In [53]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [54]:
y_pred = best_rand_forest_reg.predict(X_test)

In [55]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.63 points


In [56]:
ml_performance = ("rand_forest_reg_w_opp_teams_w/out_pca",rmse)

In [57]:
ml_models_performances.append(ml_performance)

<h4>Elastic Net Regression</h4>

In [58]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [59]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [60]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [61]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [62]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [63]:
y_pred = best_elastic_net_reg.predict(X_test)

In [64]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.92 points


In [65]:
ml_performance = ("elastic_net_reg_w_opp_teams_w/out_pca",rmse)

In [66]:
ml_models_performances.append(ml_performance)

<h3>Perform PCA to see if there is any performance improvements</h3>

In [67]:
pca = PCA()

In [68]:
pca.fit_transform(one_hot_encoded_sig_df.values)

array([[-5.98477896e+00,  2.06242204e+01,  3.31527466e+00, ...,
         4.14072055e-15, -9.61510991e-15, -3.22636612e-15],
       [ 3.03538080e+01,  6.41256875e+00, -2.19963501e+00, ...,
         4.59793656e-15, -2.66561156e-15,  7.75295086e-15],
       [ 6.45456817e+01,  7.37135727e+00,  5.00338535e+00, ...,
         3.27243014e-15,  2.64471418e-15, -2.47737826e-15],
       ...,
       [-5.13312941e+00,  7.50926672e+00,  9.69286671e-01, ...,
         6.30929269e-16,  2.52268247e-17,  1.26553867e-16],
       [ 4.20338247e+01, -8.29259835e+00, -1.15840290e-01, ...,
         2.46393615e-16, -8.47394234e-16, -2.00844010e-15],
       [ 2.11091955e+01,  1.49456703e+01, -1.47886592e+00, ...,
         1.83158706e-15,  5.90287679e-16, -3.06797676e-15]])

In [69]:
sum(pca.explained_variance_ratio_)

1.0

In [70]:
cum_sum = np.cumsum(pca.explained_variance_ratio_)

In [71]:
max_comp_idx = np.argmax(cum_sum>0.95)

In [72]:
pca = PCA(n_components=max_comp_idx+1)

In [73]:
pca_transformed_sig_data = pca.fit_transform(one_hot_encoded_sig_df.values)

In [74]:
X, y = pca_transformed_sig_data, target

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [76]:
ln = LinearRegression()

In [77]:
ln.fit(X_train,y_train)

LinearRegression()

In [78]:
y_pred = ln.predict(X_test)

In [79]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.71 points


In [80]:
ml_performance = ("lin_reg_w_opp_teams_w_pca",rmse)

In [81]:
ml_models_performances.append(ml_performance)

<h4>SGD Regression</h4>

In [82]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [83]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [84]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [85]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'epsilon_insensitive'}

In [86]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [87]:
y_pred = best_sgd_reg.predict(X_test)

In [88]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.06 points


In [89]:
ml_performance = ("sgd_reg_w_opp_teams_w_pca",rmse)

In [90]:
ml_models_performances.append(ml_performance)

<h4>Random Forest Regression</h4>

In [91]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [92]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [93]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [94]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [95]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [96]:
y_pred = best_rand_forest_reg.predict(X_test)

In [97]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.03 points


In [98]:
ml_performance = ("rand_forest_reg_w_opp_teams_w_pca",rmse)

In [99]:
ml_models_performances.append(ml_performance)

<h4>Elastic Net Regression</h4>

In [100]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [101]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [102]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [103]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [104]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [105]:
y_pred = best_elastic_net_reg.predict(X_test)

In [106]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.72 points


In [107]:
ml_performance = ("elastic_net_reg_w_opp_teams_w_pca",rmse)

In [108]:
ml_models_performances.append(ml_performance)

<h2>Remove Opposing Team Columns</h2>

In [109]:
sig_df.columns

Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Opp', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Career-Stage',
       'Above-Avg-PTS-Per-POSS', 'More-BLKs-Than-STLs', 'More-AST-Than-TOV',
       'PTS-Cat', '2P%-Above-50%', 'More than 3 Fouls', 'ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS', 'decline', 'prime', 'rising', 'above 35', 'below 20',
       'below 35'],
      dtype='object')

In [110]:
teams_lists = ['ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS']

In [111]:
non_opp_cols = list(set(sig_df.columns)-set(teams_lists))

In [112]:
sig_df_no_opp = sig_df[non_opp_cols].select_dtypes(exclude=["object"])

In [113]:
sig_df_no_opp.head()

Unnamed: 0,off_rating_above_avg,GmSc-Per-PTS,prime,2P%-Above-50%,%-Avg,above 35,Above-Avg-GmSc,rising,High-3P%-Shooting,Above-Avg-PTS-Per-POSS,...,off_rating,Won,PER-Per-TOV,PER,game_result-Per-PTS,TS%,More-BLKs-Than-STLs,More-Than-Avg-POSS,High-FG%-Shooting,More-AST-Than-TOV
0,1.0,0.988,0.0,1.0,0.333333,0.0,1.0,1.0,0.0,1.0,...,117.260788,0.0,14.638524,29.277048,-0.56,0.583431,0.0,0.0,1.0,1.0
1,0.0,0.7,0.0,1.0,0.391667,0.0,0.0,1.0,0.0,0.0,...,83.732057,0.0,2.413371,16.8936,-0.428571,0.516605,0.0,0.0,0.0,1.0
2,0.0,0.625,0.0,0.0,0.136364,0.0,0.0,1.0,0.0,0.0,...,53.763441,0.0,2.74459,5.489179,-2.375,0.30888,0.0,0.0,0.0,1.0
3,0.0,1.6,0.0,0.0,0.166667,0.0,0.0,1.0,0.0,0.0,...,61.188811,0.0,6.645524,13.291049,-0.571429,0.305011,1.0,0.0,0.0,1.0
4,0.0,0.391304,0.0,0.0,0.46875,0.0,0.0,1.0,1.0,0.0,...,81.908832,0.0,1.242708,8.698953,-0.043478,0.539273,0.0,1.0,0.0,0.0


In [114]:
std_scaler = StandardScaler()

In [115]:
sig_no_opp_data = std_scaler.fit_transform(sig_df_no_opp.values)

In [116]:
X, y = sig_no_opp_data, target

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [118]:
ln = LinearRegression()

In [119]:
ln.fit(X_train,y_train)

LinearRegression()

In [120]:
y_pred = ln.predict(X_test)

In [121]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.64 points


In [122]:
ml_performance = ("lin_reg_w/out_opp_teams_w/out_pca",rmse)

In [123]:
ml_models_performances.append(ml_performance)

<h4>SGD Regression</h4>

In [124]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [125]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [126]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [127]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'squared_loss'}

In [128]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [129]:
y_pred = best_sgd_reg.predict(X_test)

In [130]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.61 points


In [131]:
ml_performance = ("sgd_reg_w/out_opp_teams_w/out_pca",rmse)

In [132]:
ml_models_performances.append(ml_performance)

<h4>Random Forest Regression</h4>

In [133]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [134]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [135]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [136]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 100}

In [137]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [138]:
y_pred = best_rand_forest_reg.predict(X_test)

In [139]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.93 points


In [140]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w/out_pca",rmse)

In [141]:
ml_models_performances.append(ml_performance)

<h4>Elastic Net Regression</h4>

In [142]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [143]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [144]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [145]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [146]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [147]:
y_pred = best_elastic_net_reg.predict(X_test)

In [148]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.91 points


In [149]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w/out_pca",rmse)

In [150]:
ml_models_performances.append(ml_performance)

<h3>Performance Analysis with PCA</h3>

In [151]:
pca = PCA()

In [152]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [153]:
pca_cumsum = np.cumsum(pca.explained_variance_ratio_)

In [154]:
max_comp_idx = np.argmax(pca_cumsum>0.95)

In [155]:
pca = PCA(n_components=max_comp_idx+1)

In [156]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [157]:
X, y = sig_df_no_opp_w_pca, target

In [158]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [159]:
ml_models_performances.append(ml_performance)

<h4>Linear Regression</h4>

In [160]:
ln = LinearRegression()

In [161]:
ln.fit(X_train,y_train)

LinearRegression()

In [162]:
y_pred = ln.predict(X_test)

In [163]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.71 points


In [164]:
ml_performance = ("lin_reg_w/out_opp_teams_w_pca",rmse)

In [165]:
ml_models_performances.append(ml_performance)

<h4>SGD Regression</h4>

In [166]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [167]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [168]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [169]:
sgd_grid_search.best_params_

{'epsilon': 0.001, 'loss': 'epsilon_insensitive'}

In [170]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [171]:
y_pred = best_sgd_reg.predict(X_test)

In [172]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.74 points


In [173]:
ml_performance = ("sgd_reg_w/out_opp_teams_w_pca",rmse)

In [174]:
ml_models_performances.append(ml_performance)

In [175]:
ml_models_performances.append(ml_performance)

<h4>Random Forest Regression</h4>

In [176]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [177]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [178]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [179]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [180]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [181]:
y_pred = best_rand_forest_reg.predict(X_test)

In [182]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.04 points


In [183]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w_pca",rmse)

In [184]:
ml_models_performances.append(ml_performance)

<h4>Elastic Net Regression</h4>

In [185]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [186]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [187]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [188]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [189]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [190]:
y_pred = best_elastic_net_reg.predict(X_test)

In [191]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.72 points


In [192]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w_pca",rmse)

In [193]:
ml_models_performances.append(ml_performance)

<h2>Viewing Error Scores For All the ML Models</h2>

In [194]:
for ml_perf in ml_models_performances:
    print(f"{ml_perf[0]}: {round(ml_perf[1],3)}")

lin_reg_w_opp_teams_w/out_pca: 6.374
sgd_reg_w_opp_teams_w/out_pca: 6.414
rand_forest_reg_w_opp_teams_w/out_pca: 6.629
elastic_net_reg_w_opp_teams_w/out_pca: 6.925
lin_reg_w_opp_teams_w_pca: 6.712
sgd_reg_w_opp_teams_w_pca: 7.062
rand_forest_reg_w_opp_teams_w_pca: 7.034
elastic_net_reg_w_opp_teams_w_pca: 6.718
lin_reg_w/out_opp_teams_w/out_pca: 6.643
sgd_reg_w/out_opp_teams_w/out_pca: 6.605
rand_forest_reg_w/out_opp_teams_w/out_pca: 6.931
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
lin_reg_w/out_opp_teams_w_pca: 6.712
sgd_reg_w/out_opp_teams_w_pca: 6.744
sgd_reg_w/out_opp_teams_w_pca: 6.744
rand_forest_reg_w/out_opp_teams_w_pca: 7.039
elastic_net_reg_w/out_opp_teams_w_pca: 6.718
