In [183]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import os

In [184]:
full_game_df = pd.read_csv(os.path.join("feature_eng_data_iter_2.csv"))

In [185]:
sig_cols = list(np.load("sig_cols_ml_iteration_2.npy"))

In [186]:
target = full_game_df["+/-"]

In [187]:
full_game_df.head()

Unnamed: 0,G,Date,Age,Tm,game_location,Opp,game_result,GS,MP,FG,...,More than 3 Fouls,FTR-Above-Avg,Less-Than-3-TOV,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc
0,1,2003-10-29,18,CLE,False,SAC,-14,True,42,12.0,...,False,False,True,0.595238,0.333333,14.638524,117.260788,True,False,True
1,2,2003-10-30,18,CLE,False,PHO,-9,True,40,8.0,...,False,False,False,0.525,0.391667,2.413371,83.732057,False,False,False
2,3,2003-11-01,18,CLE,False,POR,-19,True,39,3.0,...,False,False,True,0.205128,0.136364,2.74459,53.763441,False,False,False
3,4,2003-11-05,18,CLE,True,DEN,-4,True,41,3.0,...,False,False,True,0.170732,0.166667,6.645524,61.188811,False,False,False
4,5,2003-11-07,18,CLE,False,IND,-1,True,43,8.0,...,False,True,False,0.534884,0.46875,1.242708,81.908832,False,True,False


In [188]:
sig_df = full_game_df[sig_cols]

In [189]:
sig_df.head()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,Over-3-BLKs-and-STLs,Played-Above-Avg-Min,Above-Avg-FT%,Career-Stage,Above-Avg-PTS-Per-POSS,More-BLKs-Than-STLs,More-AST-Than-TOV,PTS-Cat,2P%-Above-50%,More than 3 Fouls
0,0.595238,0.333333,14.638524,117.260788,True,False,True,-14,0.6,24.7,...,True,True,False,rising,True,False,True,below 35,True,False
1,0.525,0.391667,2.413371,83.732057,False,False,False,-9,0.471,14.7,...,False,True,False,rising,False,False,True,below 35,True,False
2,0.205128,0.136364,2.74459,53.763441,False,False,False,-19,0.25,5.0,...,False,True,True,rising,False,False,True,below 20,False,False
3,0.170732,0.166667,6.645524,61.188811,False,False,False,-4,0.273,11.2,...,True,True,True,rising,False,True,True,below 20,False,False
4,0.534884,0.46875,1.242708,81.908832,False,True,False,-1,0.444,9.0,...,False,True,True,rising,False,False,False,below 35,False,False


In [190]:
sig_df.replace(True,1,inplace=True)
sig_df.replace(False,0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [191]:
sig_df.select_dtypes(include=["int64"]).columns

Index(['game_result'], dtype='object')

In [192]:
for col in sig_df.select_dtypes(include=["int64"]).columns:
    sig_df[col] = sig_df[col].apply(lambda x: float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [193]:
sig_df.dtypes

PTS-Per-Min               float64
%-Avg                     float64
PER-Per-TOV               float64
off_rating                float64
off_rating_above_avg      float64
More-Than-Avg-POSS        float64
Above-Avg-GmSc            float64
game_result               float64
FG%                       float64
GmSc                      float64
TS%                       float64
GmSc-Per-PTS              float64
PTS-Per-POSS              float64
game_result-Per-PTS       float64
PER                       float64
game_location             float64
Opp                        object
Won                       float64
High-3P%-Shooting         float64
High-FG%-Shooting         float64
Over-3-BLKs-and-STLs      float64
Played-Above-Avg-Min      float64
Above-Avg-FT%             float64
Career-Stage               object
Above-Avg-PTS-Per-POSS    float64
More-BLKs-Than-STLs       float64
More-AST-Than-TOV         float64
PTS-Cat                    object
2P%-Above-50%             float64
More than 3 Fo

In [194]:
def label_bin(cat_col,df):
    lb = LabelBinarizer()
    data = lb.fit_transform(df[cat_col])
    classes = lb.classes_
    new_df = pd.DataFrame(data,columns=classes,dtype=float)
    for col in new_df.columns:
        df[col] = new_df[col]
    return lb

In [195]:
label_bins = []
for object_col in sig_df.select_dtypes(include=["object"]).columns:
    object_lb = label_bin(object_col,sig_df)
    object_col_lb_pair = (object_col,object_lb)
    label_bins.append(object_col_lb_pair)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [196]:
sig_df.tail()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
1255,1.027778,0.464357,10.446618,117.088608,1.0,1.0,1.0,10.0,0.571,31.3,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1256,0.823529,0.393773,18.12825,120.8981,1.0,0.0,1.0,9.0,0.412,25.8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1257,0.852941,0.529692,11.770794,110.687023,1.0,1.0,1.0,-2.0,0.545,24.7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1258,0.470588,0.309667,2.522094,69.93007,0.0,0.0,0.0,2.0,0.316,9.6,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1259,0.588235,0.45,4.925272,94.87666,0.0,0.0,0.0,-15.0,0.467,14.3,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [197]:
label_bins

[('Opp', LabelBinarizer()),
 ('Career-Stage', LabelBinarizer()),
 ('PTS-Cat', LabelBinarizer())]

In [198]:
sig_df.select_dtypes(include=["object"])

Unnamed: 0,Opp,Career-Stage,PTS-Cat
0,SAC,rising,below 35
1,PHO,rising,below 35
2,POR,rising,below 20
3,DEN,rising,below 20
4,IND,rising,below 35
...,...,...,...
1255,MIL,decline,above 35
1256,LAC,decline,below 35
1257,BRK,decline,below 35
1258,LAC,decline,below 20


In [199]:
split = StratifiedShuffleSplit(n_splits=1,test_size=0.33)

In [200]:
ml_models_performances = []
ml_model_accuracies = []

<h2>Perform Machine Learning with Opposition Team Feature Included</h2>

In [201]:
std_scaler = StandardScaler()

In [202]:
one_hot_encoded_sig_df = sig_df.select_dtypes(exclude=["object"])

In [203]:
one_hot_encoded_sig_df.columns

Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [204]:
std_scale_sig_data = std_scaler.fit_transform(one_hot_encoded_sig_df.values)

In [205]:
std_scale_sig_df = pd.DataFrame(std_scale_sig_data,columns=one_hot_encoded_sig_df.columns)

In [206]:
std_scale_sig_df.head()

Unnamed: 0,PTS-Per-Min,%-Avg,PER-Per-TOV,off_rating,off_rating_above_avg,More-Than-Avg-POSS,Above-Avg-GmSc,game_result,FG%,GmSc,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
0,-0.635326,-0.661242,0.467196,0.480218,0.97337,-0.97337,0.962604,-1.420272,0.846395,0.318334,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
1,-1.005199,-0.248814,-0.919494,-1.059284,-1.027358,-0.97337,-1.038849,-1.035821,-0.304266,-0.975332,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
2,-2.689642,-2.053853,-0.881924,-2.43532,-1.027358,-0.97337,-1.038849,-1.804724,-2.275553,-2.230188,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
3,-2.870773,-1.839605,-0.439444,-2.094377,-1.027358,-0.97337,-1.038849,-0.65137,-2.070397,-1.428115,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
4,-0.953152,0.296179,-1.052281,-1.142999,-1.027358,1.027358,-1.038849,-0.420699,-0.545102,-1.712722,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837


In [207]:
print(f"There are {len(std_scale_sig_df.columns)} columns")
std_scale_sig_df.columns

There are 68 columns


Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [208]:
X, y = std_scale_sig_df.values, target

In [209]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

<h4>Linear Regression</h4>

In [210]:
ln = LinearRegression()

In [211]:
ln.fit(X_train,y_train)

LinearRegression()

In [212]:
y_pred = ln.predict(X_test)

In [213]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.75 points


In [214]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 71.5229%


In [215]:
ml_performance = ("lin_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("lin_reg_w_opp_teams_w/out_pca",score)

In [216]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [217]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [218]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [219]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [220]:
sgd_grid_search.best_params_

{'epsilon': 0.01, 'loss': 'squared_loss'}

In [221]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [222]:
y_pred = best_sgd_reg.predict(X_test)

In [223]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.77 points


In [224]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 71.3858%


In [225]:
ml_performance = ("sgd_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("sgd_reg_w_opp_teams_w/out_pca",score)

In [226]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [227]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [228]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [229]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [230]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [231]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [232]:
y_pred = best_rand_forest_reg.predict(X_test)

In [233]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.95 points


In [234]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 69.7935%


In [235]:
ml_performance = ("rand_forest_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("rand_forest_reg_w_opp_teams_w/out_pca",score)

In [236]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [237]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [238]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [239]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [240]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [241]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [242]:
y_pred = best_elastic_net_reg.predict(X_test)

In [243]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.71 points


In [244]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 71.8467%


In [245]:
ml_performance = ("elastic_net_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("elastic_net_reg_w_opp_teams_w/out_pca",score)

In [246]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h3>Perform PCA to see if there is any performance improvements</h3>

In [247]:
pca = PCA()

In [248]:
pca.fit_transform(one_hot_encoded_sig_df.values)

array([[-5.98477896e+00,  2.06242204e+01,  3.31527466e+00, ...,
         4.14072055e-15, -9.61510991e-15, -3.22636612e-15],
       [ 3.03538080e+01,  6.41256875e+00, -2.19963501e+00, ...,
         4.59793656e-15, -2.66561156e-15,  7.75295086e-15],
       [ 6.45456817e+01,  7.37135727e+00,  5.00338535e+00, ...,
         3.27243014e-15,  2.64471418e-15, -2.47737826e-15],
       ...,
       [-5.13312941e+00,  7.50926672e+00,  9.69286671e-01, ...,
         6.30929269e-16,  2.52268247e-17,  1.26553867e-16],
       [ 4.20338247e+01, -8.29259835e+00, -1.15840290e-01, ...,
         2.46393615e-16, -8.47394234e-16, -2.00844010e-15],
       [ 2.11091955e+01,  1.49456703e+01, -1.47886592e+00, ...,
         1.83158706e-15,  5.90287679e-16, -3.06797676e-15]])

In [249]:
sum(pca.explained_variance_ratio_)

1.0

In [250]:
cum_sum = np.cumsum(pca.explained_variance_ratio_)

In [251]:
max_comp_idx = np.argmax(cum_sum>0.95)

In [252]:
pca = PCA(n_components=max_comp_idx+1)

In [253]:
pca_transformed_sig_data = pca.fit_transform(one_hot_encoded_sig_df.values)

In [254]:
X, y = pca_transformed_sig_data, target

In [255]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [256]:
ln = LinearRegression()

In [257]:
ln.fit(X_train,y_train)

LinearRegression()

In [258]:
y_pred = ln.predict(X_test)

In [259]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.71 points


In [260]:
score = ln.score(X_test,y_test)
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 0.7327%


In [261]:
ml_performance = ("lin_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("lin_reg_w_opp_teams_w_pca",score)

In [262]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [263]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [264]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [265]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [266]:
sgd_grid_search.best_params_

{'epsilon': 0.001, 'loss': 'epsilon_insensitive'}

In [267]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [268]:
y_pred = best_sgd_reg.predict(X_test)

In [269]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.98 points


In [270]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 71.0658%


In [271]:
ml_performance = ("sgd_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("sgd_reg_w_opp_teams_w_pca",score)

In [272]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [273]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [274]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [275]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [276]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [277]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [278]:
y_pred = best_rand_forest_reg.predict(X_test)

In [279]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.04 points


In [280]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 70.5844%


In [281]:
ml_performance = ("rand_forest_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("rand_forest_reg_w_opp_teams_w_pca",score)

In [282]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [283]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [284]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [285]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [286]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [287]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [288]:
y_pred = best_elastic_net_reg.predict(X_test)

In [289]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.72 points


In [290]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 73.2257%


In [291]:
ml_performance = ("elastic_net_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("elastic_net_reg_w_opp_teams_w_pca",score)

In [292]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h2>Remove Opposing Team Columns</h2>

In [293]:
sig_df.columns

Index(['PTS-Per-Min', '%-Avg', 'PER-Per-TOV', 'off_rating',
       'off_rating_above_avg', 'More-Than-Avg-POSS', 'Above-Avg-GmSc',
       'game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Opp', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Career-Stage',
       'Above-Avg-PTS-Per-POSS', 'More-BLKs-Than-STLs', 'More-AST-Than-TOV',
       'PTS-Cat', '2P%-Above-50%', 'More than 3 Fouls', 'ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS', 'decline', 'prime', 'rising', 'above 35', 'below 20',
       'below 35'],
      dtype='object')

In [294]:
teams_lists = ['ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS']

In [295]:
non_opp_cols = list(set(sig_df.columns)-set(teams_lists))

In [296]:
sig_df_no_opp = sig_df[non_opp_cols].select_dtypes(exclude=["object"])

In [297]:
sig_df_no_opp.head()

Unnamed: 0,PTS-Per-Min,More-Than-Avg-POSS,More than 3 Fouls,below 35,Won,High-FG%-Shooting,rising,PER,FG%,TS%,...,2P%-Above-50%,prime,off_rating_above_avg,%-Avg,above 35,GmSc,High-3P%-Shooting,More-AST-Than-TOV,More-BLKs-Than-STLs,Over-3-BLKs-and-STLs
0,0.595238,0.0,0.0,1.0,0.0,1.0,1.0,29.277048,0.6,0.583431,...,1.0,0.0,1.0,0.333333,0.0,24.7,0.0,1.0,0.0,1.0
1,0.525,0.0,0.0,1.0,0.0,0.0,1.0,16.8936,0.471,0.516605,...,1.0,0.0,0.0,0.391667,0.0,14.7,0.0,1.0,0.0,0.0
2,0.205128,0.0,0.0,0.0,0.0,0.0,1.0,5.489179,0.25,0.30888,...,0.0,0.0,0.0,0.136364,0.0,5.0,0.0,1.0,0.0,0.0
3,0.170732,0.0,0.0,0.0,0.0,0.0,1.0,13.291049,0.273,0.305011,...,0.0,0.0,0.0,0.166667,0.0,11.2,0.0,1.0,1.0,1.0
4,0.534884,1.0,0.0,1.0,0.0,0.0,1.0,8.698953,0.444,0.539273,...,0.0,0.0,0.0,0.46875,0.0,9.0,1.0,0.0,0.0,0.0


In [298]:
std_scaler = StandardScaler()

In [299]:
sig_no_opp_data = std_scaler.fit_transform(sig_df_no_opp.values)

In [300]:
X, y = sig_no_opp_data, target

In [301]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [302]:
ln = LinearRegression()

In [303]:
ln.fit(X_train,y_train)

LinearRegression()

In [304]:
y_pred = ln.predict(X_test)

In [305]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.62 points


In [306]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 73.9959%


In [307]:
ml_performance = ("lin_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("lin_reg_w/out_opp_teams_w/out_pca",score)

In [308]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [309]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [310]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [311]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [312]:
sgd_grid_search.best_params_

{'epsilon': 0.001, 'loss': 'squared_loss'}

In [313]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [314]:
y_pred = best_sgd_reg.predict(X_test)

In [315]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.65 points


In [316]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 73.7635%


In [317]:
ml_performance = ("sgd_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("sgd_reg_w/out_opp_teams_w/out_pca",score)

In [318]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [319]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [320]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [321]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [322]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [323]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [324]:
y_pred = best_rand_forest_reg.predict(X_test)

In [325]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.85 points


In [326]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 72.1334%


In [327]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("rand_forest_reg_w/out_opp_teams_w/out_pca",score)

In [328]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [329]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [330]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [331]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [332]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [333]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [334]:
y_pred = best_elastic_net_reg.predict(X_test)

In [335]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.91 points


In [336]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 71.6427%


In [337]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("elastic_net_reg_w/out_opp_teams_w/out_pca",score)

In [338]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h3>Performance Analysis with PCA</h3>

In [339]:
pca = PCA()

In [340]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [341]:
pca_cumsum = np.cumsum(pca.explained_variance_ratio_)

In [342]:
max_comp_idx = np.argmax(pca_cumsum>0.95)

In [343]:
pca = PCA(n_components=max_comp_idx+1)

In [344]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [345]:
X, y = sig_df_no_opp_w_pca, target

In [346]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [347]:
ml_models_performances.append(ml_performance)

<h4>Linear Regression</h4>

In [348]:
ln = LinearRegression()

In [349]:
ln.fit(X_train,y_train)

LinearRegression()

In [350]:
y_pred = ln.predict(X_test)

In [351]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.71 points


In [352]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 73.2679%


In [353]:
ml_performance = ("lin_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("lin_reg_w/out_opp_teams_w_pca",score)

In [354]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [355]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [356]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [357]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [358]:
sgd_grid_search.best_params_

{'epsilon': 0.01, 'loss': 'epsilon_insensitive'}

In [359]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [360]:
y_pred = best_sgd_reg.predict(X_test)

In [361]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.77 points


In [362]:
ml_performance = ("sgd_reg_w/out_opp_teams_w_pca",rmse)

In [363]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 72.8185%


In [364]:
ml_performance = ("sgd_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("sgd_reg_w/out_opp_teams_w_pca",score)

In [365]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [366]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [367]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [368]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [369]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [370]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [371]:
y_pred = best_rand_forest_reg.predict(X_test)

In [372]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.04 points


In [373]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 70.6186%


In [374]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("rand_forest_reg_w/out_opp_teams_w_pca",score)

In [375]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [376]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [377]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [378]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [379]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [380]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [381]:
y_pred = best_elastic_net_reg.predict(X_test)

In [382]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.72 points


In [383]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 73.2257%


In [384]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("elastic_net_reg_w/out_opp_teams_w_pca",score)

In [385]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h2>Viewing Error Scores For All the ML Models</h2>

In [386]:
for ml_perf in ml_models_performances:
    print(f"{ml_perf[0]}: {round(ml_perf[1],3)}")

lin_reg_w_opp_teams_w/out_pca: 6.753
sgd_reg_w_opp_teams_w/out_pca: 6.769
rand_forest_reg_w_opp_teams_w/out_pca: 6.955
elastic_net_reg_w_opp_teams_w/out_pca: 6.714
lin_reg_w_opp_teams_w_pca: 6.712
sgd_reg_w_opp_teams_w_pca: 6.983
rand_forest_reg_w_opp_teams_w_pca: 7.041
elastic_net_reg_w_opp_teams_w_pca: 6.718
lin_reg_w/out_opp_teams_w/out_pca: 6.62
sgd_reg_w/out_opp_teams_w/out_pca: 6.65
rand_forest_reg_w/out_opp_teams_w/out_pca: 6.853
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
lin_reg_w/out_opp_teams_w_pca: 6.712
sgd_reg_w/out_opp_teams_w_pca: 6.768
rand_forest_reg_w/out_opp_teams_w_pca: 7.037
elastic_net_reg_w/out_opp_teams_w_pca: 6.718


In [387]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 73.2257%


In [388]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("elastic_net_reg_w/out_opp_teams_w_pca",score)

In [389]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h2>Viewing Error/ Accuracy Scores For All the ML Models</h2>

In [390]:
for ml_perf in ml_models_performances:
    print(f"{ml_perf[0]}: {round(ml_perf[1],3)}")

lin_reg_w_opp_teams_w/out_pca: 6.753
sgd_reg_w_opp_teams_w/out_pca: 6.769
rand_forest_reg_w_opp_teams_w/out_pca: 6.955
elastic_net_reg_w_opp_teams_w/out_pca: 6.714
lin_reg_w_opp_teams_w_pca: 6.712
sgd_reg_w_opp_teams_w_pca: 6.983
rand_forest_reg_w_opp_teams_w_pca: 7.041
elastic_net_reg_w_opp_teams_w_pca: 6.718
lin_reg_w/out_opp_teams_w/out_pca: 6.62
sgd_reg_w/out_opp_teams_w/out_pca: 6.65
rand_forest_reg_w/out_opp_teams_w/out_pca: 6.853
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.913
lin_reg_w/out_opp_teams_w_pca: 6.712
sgd_reg_w/out_opp_teams_w_pca: 6.768
rand_forest_reg_w/out_opp_teams_w_pca: 7.037
elastic_net_reg_w/out_opp_teams_w_pca: 6.718
elastic_net_reg_w/out_opp_teams_w_pca: 6.718


In [392]:
for ml_acc in ml_model_accuracies:
    print(f"{ml_acc[0]}: {round(ml_acc[1],3)}%")

lin_reg_w_opp_teams_w/out_pca: 71.523%
sgd_reg_w_opp_teams_w/out_pca: 71.386%
rand_forest_reg_w_opp_teams_w/out_pca: 69.793%
elastic_net_reg_w_opp_teams_w/out_pca: 71.847%
lin_reg_w_opp_teams_w_pca: 0.733%
sgd_reg_w_opp_teams_w_pca: 71.066%
rand_forest_reg_w_opp_teams_w_pca: 70.584%
elastic_net_reg_w_opp_teams_w_pca: 73.226%
lin_reg_w/out_opp_teams_w/out_pca: 73.996%
sgd_reg_w/out_opp_teams_w/out_pca: 73.764%
rand_forest_reg_w/out_opp_teams_w/out_pca: 72.133%
elastic_net_reg_w/out_opp_teams_w/out_pca: 71.643%
lin_reg_w/out_opp_teams_w_pca: 73.268%
sgd_reg_w/out_opp_teams_w_pca: 72.819%
rand_forest_reg_w/out_opp_teams_w_pca: 70.619%
elastic_net_reg_w/out_opp_teams_w_pca: 73.226%
elastic_net_reg_w/out_opp_teams_w_pca: 73.226%
