In [227]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import os

In [228]:
full_game_df = pd.read_csv(os.path.join("../","data","combined_game_data.csv"))

In [229]:
sig_cols = list(np.load(os.path.join("../","sig_cols.npy")))

In [230]:
target = full_game_df["+/-"]

In [231]:
full_game_df.head()

Unnamed: 0,G,Date,Age,Tm,game_location,Opp,game_result,GS,MP,FG,...,More-BLKs-Than-STLs,More-AST-Than-TOV,PTS-Cat,2P,2PA,2P%,2P%-Above-50%,More than 3 Fouls,FTR-Above-Avg,Less-Than-3-TOV
0,1,2003-10-29,18,CLE,0,SAC,-14,1.0,42,12.0,...,False,True,below 35,12.0,18.0,0.666667,True,False,False,True
1,2,2003-10-30,18,CLE,0,PHO,-9,1.0,40,8.0,...,False,True,below 35,7.0,12.0,0.583333,True,False,False,False
2,3,2003-11-01,18,CLE,0,POR,-19,1.0,39,3.0,...,False,True,below 20,3.0,11.0,0.272727,False,False,False,True
3,4,2003-11-05,18,CLE,1,DEN,-4,1.0,41,3.0,...,True,True,below 20,3.0,9.0,0.333333,False,False,False,True
4,5,2003-11-07,18,CLE,0,IND,-1,1.0,43,8.0,...,False,False,below 35,7.0,16.0,0.4375,False,False,True,False


In [232]:
sig_df = full_game_df[sig_cols]

In [233]:
sig_df.head()

Unnamed: 0,game_result,FG%,GmSc,TS%,GmSc-Per-PTS,PTS-Per-POSS,game_result-Per-PTS,PER,game_location,Opp,...,Over-3-BLKs-and-STLs,Played-Above-Avg-Min,Above-Avg-FT%,Career-Stage,Above-Avg-PTS-Per-POSS,More-BLKs-Than-STLs,More-AST-Than-TOV,PTS-Cat,2P%-Above-50%,More than 3 Fouls
0,-14,0.6,24.7,0.583431,0.988,1.172608,-0.56,29.277048,0,SAC,...,True,True,False,rising,True,False,True,below 35,True,False
1,-9,0.471,14.7,0.516605,0.7,0.837321,-0.428571,16.8936,0,PHO,...,False,True,False,rising,False,False,True,below 35,True,False
2,-19,0.25,5.0,0.30888,0.625,0.537634,-2.375,5.489179,0,POR,...,False,True,True,rising,False,False,True,below 20,False,False
3,-4,0.273,11.2,0.305011,1.6,0.611888,-0.571429,13.291049,1,DEN,...,True,True,True,rising,False,True,True,below 20,False,False
4,-1,0.444,9.0,0.539273,0.391304,0.819088,-0.043478,8.698953,0,IND,...,False,True,True,rising,False,False,False,below 35,False,False


In [234]:
sig_df.replace(True,1,inplace=True)
sig_df.replace(False,0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [235]:
sig_df.select_dtypes(include=["int64"]).columns

Index(['game_result', 'game_location'], dtype='object')

In [236]:
for col in sig_df.select_dtypes(include=["int64"]).columns:
    sig_df[col] = sig_df[col].apply(lambda x: float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [237]:
sig_df.dtypes

game_result               float64
FG%                       float64
GmSc                      float64
TS%                       float64
GmSc-Per-PTS              float64
PTS-Per-POSS              float64
game_result-Per-PTS       float64
PER                       float64
game_location             float64
Opp                        object
Won                       float64
High-3P%-Shooting         float64
High-FG%-Shooting         float64
Over-3-BLKs-and-STLs      float64
Played-Above-Avg-Min      float64
Above-Avg-FT%             float64
Career-Stage               object
Above-Avg-PTS-Per-POSS    float64
More-BLKs-Than-STLs       float64
More-AST-Than-TOV         float64
PTS-Cat                    object
2P%-Above-50%             float64
More than 3 Fouls         float64
dtype: object

In [238]:
def label_bin(cat_col,df):
    lb = LabelBinarizer()
    data = lb.fit_transform(df[cat_col])
    classes = lb.classes_
    new_df = pd.DataFrame(data,columns=classes,dtype=float)
    for col in new_df.columns:
        df[col] = new_df[col]
    return lb

In [239]:
label_bins = []
for object_col in sig_df.select_dtypes(include=["object"]).columns:
    object_lb = label_bin(object_col,sig_df)
    object_col_lb_pair = (object_col,object_lb)
    label_bins.append(object_col_lb_pair)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [240]:
sig_df.tail()

Unnamed: 0,game_result,FG%,GmSc,TS%,GmSc-Per-PTS,PTS-Per-POSS,game_result-Per-PTS,PER,game_location,Opp,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
1255,10.0,0.571,31.3,0.657778,0.845946,1.170886,0.27027,41.786472,1.0,MIL,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1256,9.0,0.412,25.8,0.591966,0.921429,1.208981,0.321429,36.2565,0.0,LAC,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1257,-2.0,0.545,24.7,0.594872,0.851724,1.10687,-0.068966,35.312382,1.0,BRK,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1258,2.0,0.316,9.6,0.401003,0.6,0.699301,0.125,12.610471,1.0,LAC,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1259,-15.0,0.467,14.3,0.545703,0.715,0.948767,-0.75,19.701088,0.0,TOR,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [241]:
label_bins

[('Opp', LabelBinarizer()),
 ('Career-Stage', LabelBinarizer()),
 ('PTS-Cat', LabelBinarizer())]

In [242]:
sig_df.select_dtypes(include=["object"])

Unnamed: 0,Opp,Career-Stage,PTS-Cat
0,SAC,rising,below 35
1,PHO,rising,below 35
2,POR,rising,below 20
3,DEN,rising,below 20
4,IND,rising,below 35
...,...,...,...
1255,MIL,decline,above 35
1256,LAC,decline,below 35
1257,BRK,decline,below 35
1258,LAC,decline,below 20


In [243]:
split = StratifiedShuffleSplit(n_splits=1,test_size=0.33)

In [244]:
ml_models_performances = []
ml_model_accuracies = []

<h2>Perform Machine Learning with Opposition Team Feature Included</h2>

In [245]:
std_scaler = StandardScaler()

In [246]:
one_hot_encoded_sig_df = sig_df.select_dtypes(exclude=["object"])

In [247]:
one_hot_encoded_sig_df.columns

Index(['game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [248]:
std_scale_sig_data = std_scaler.fit_transform(one_hot_encoded_sig_df.values)

In [249]:
std_scale_sig_df = pd.DataFrame(std_scale_sig_data,columns=one_hot_encoded_sig_df.columns)

In [250]:
std_scale_sig_df.head()

Unnamed: 0,game_result,FG%,GmSc,TS%,GmSc-Per-PTS,PTS-Per-POSS,game_result-Per-PTS,PER,game_location,Won,...,SEA,TOR,UTA,WAS,decline,prime,rising,above 35,below 20,below 35
0,-1.420272,0.846395,0.318334,0.027535,0.942241,0.480218,-1.126608,0.098135,-1.00957,-1.406671,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
1,-1.035821,-0.304266,-0.975332,-0.576938,-0.652382,-1.059284,-0.920912,-1.119288,-1.00957,-1.406671,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837
2,-1.804724,-2.275553,-2.230188,-2.455923,-1.067648,-2.43532,-3.967223,-2.240462,-1.00957,-1.406671,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
3,-0.65137,-2.070397,-1.428115,-2.490924,4.330813,-2.094377,-1.144495,-1.473457,0.990521,-1.406671,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,2.192362,-1.411692
4,-0.420699,-0.545102,-1.712722,-0.371893,-2.361593,-1.142999,-0.318212,-1.924908,-1.00957,-1.406671,...,-0.079936,-0.213643,-0.161427,-0.205387,-0.386827,-0.766095,1.0,-0.439525,-0.456129,0.70837


In [251]:
print(f"There are {len(std_scale_sig_df.columns)} columns")
std_scale_sig_df.columns

There are 61 columns


Index(['game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Above-Avg-PTS-Per-POSS',
       'More-BLKs-Than-STLs', 'More-AST-Than-TOV', '2P%-Above-50%',
       'More than 3 Fouls', 'ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CHO', 'CLE',
       'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',
       'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI',
       'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR', 'UTA', 'WAS', 'decline',
       'prime', 'rising', 'above 35', 'below 20', 'below 35'],
      dtype='object')

In [252]:
X, y = std_scale_sig_df.values, target

In [253]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

<h4>Linear Regression</h4>

In [254]:
ln = LinearRegression()

In [255]:
ln.fit(X_train,y_train)

LinearRegression()

In [256]:
y_pred = ln.predict(X_test)

In [257]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 7.07 points


In [258]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 67.0197%


In [259]:
ml_performance = ("lin_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("lin_reg_w_opp_teams_w/out_pca",score)

In [260]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [261]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [262]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [263]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [264]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'squared_loss'}

In [265]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [266]:
y_pred = best_sgd_reg.predict(X_test)

In [267]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.07 points


In [268]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 67.0274%


In [269]:
ml_performance = ("sgd_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("sgd_reg_w_opp_teams_w/out_pca",score)

In [270]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [271]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [272]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [273]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [274]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [275]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [276]:
y_pred = best_rand_forest_reg.predict(X_test)

In [277]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.93 points


In [278]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 68.2621%


In [279]:
ml_performance = ("rand_forest_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("rand_forest_reg_w_opp_teams_w/out_pca",score)

In [280]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [281]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [282]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [283]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [284]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [285]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [286]:
y_pred = best_elastic_net_reg.predict(X_test)

In [287]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.9 points


In [288]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 68.5711%


In [289]:
ml_performance = ("elastic_net_reg_w_opp_teams_w/out_pca",rmse)
ml_accuracy = ("elastic_net_reg_w_opp_teams_w/out_pca",score)

In [290]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h3>Perform PCA to see if there is any performance improvements</h3>

In [291]:
pca = PCA()

In [292]:
pca.fit_transform(one_hot_encoded_sig_df.values)

array([[ 1.22980724e+01, -1.40787345e+01,  2.42832188e-01, ...,
        -6.90215163e-15, -3.20754327e-15,  6.21976989e-16],
       [ 1.91805886e+01,  1.15836357e+00, -1.02894998e-01, ...,
        -5.37548508e-15,  2.84616663e-15,  3.73714869e-16],
       [ 3.66022808e+01,  5.70160028e+00,  1.21759167e+00, ...,
         8.65901084e-15, -1.50315547e-15, -2.67354734e-15],
       ...,
       [-2.94536778e-02, -9.42423770e+00,  3.17168525e+00, ...,
         2.48691146e-16,  4.29883964e-16, -1.64573798e-16],
       [ 1.52195511e+01,  1.34507127e+01,  4.33292335e-01, ...,
         1.10134257e-15, -7.19467988e-16,  7.15628363e-17],
       [ 2.22283309e+01, -4.13636504e+00,  2.64395164e+00, ...,
         5.91824373e-17, -7.99715897e-16,  6.07131875e-17]])

In [293]:
sum(pca.explained_variance_ratio_)

0.9999999999999999

In [294]:
cum_sum = np.cumsum(pca.explained_variance_ratio_)

In [295]:
max_comp_idx = np.argmax(cum_sum>0.95)

In [296]:
pca = PCA(n_components=max_comp_idx+1)

In [297]:
pca_transformed_sig_data = pca.fit_transform(one_hot_encoded_sig_df.values)

In [298]:
X, y = pca_transformed_sig_data, target

In [299]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [300]:
ln = LinearRegression()

In [301]:
ln.fit(X_train,y_train)

LinearRegression()

In [302]:
y_pred = ln.predict(X_test)

In [303]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.68 points


In [304]:
score = ln.score(X_test,y_test)
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 0.7349%


In [305]:
ml_performance = ("lin_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("lin_reg_w_opp_teams_w_pca",score)

In [306]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [307]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [308]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [309]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [310]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'epsilon_insensitive'}

In [311]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [312]:
y_pred = best_sgd_reg.predict(X_test)

In [313]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.72 points


In [314]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 73.1672%


In [315]:
ml_performance = ("sgd_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("sgd_reg_w_opp_teams_w_pca",score)

In [316]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [317]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [318]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [319]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [320]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 100}

In [321]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [322]:
y_pred = best_rand_forest_reg.predict(X_test)

In [323]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.4 points


In [324]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 67.5109%


In [325]:
ml_performance = ("rand_forest_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("rand_forest_reg_w_opp_teams_w_pca",score)

In [326]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [327]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [328]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [329]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [330]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.25}

In [331]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [332]:
y_pred = best_elastic_net_reg.predict(X_test)

In [333]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.69 points


In [334]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 73.4836%


In [335]:
ml_performance = ("elastic_net_reg_w_opp_teams_w_pca",rmse)
ml_accuracy = ("elastic_net_reg_w_opp_teams_w_pca",score)

In [336]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h2>Remove Opposing Team Columns</h2>

In [337]:
sig_df.columns

Index(['game_result', 'FG%', 'GmSc', 'TS%', 'GmSc-Per-PTS', 'PTS-Per-POSS',
       'game_result-Per-PTS', 'PER', 'game_location', 'Opp', 'Won',
       'High-3P%-Shooting', 'High-FG%-Shooting', 'Over-3-BLKs-and-STLs',
       'Played-Above-Avg-Min', 'Above-Avg-FT%', 'Career-Stage',
       'Above-Avg-PTS-Per-POSS', 'More-BLKs-Than-STLs', 'More-AST-Than-TOV',
       'PTS-Cat', '2P%-Above-50%', 'More than 3 Fouls', 'ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS', 'decline', 'prime', 'rising', 'above 35', 'below 20',
       'below 35'],
      dtype='object')

In [338]:
teams_lists = ['ATL', 'BOS', 'BRK',
       'CHA', 'CHI', 'CHO', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
       'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NJN', 'NOH', 'NOK', 'NOP',
       'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'SEA', 'TOR',
       'UTA', 'WAS']

In [339]:
non_opp_cols = list(set(sig_df.columns)-set(teams_lists))

In [340]:
sig_df_no_opp = sig_df[non_opp_cols].select_dtypes(exclude=["object"])

In [341]:
sig_df_no_opp.head()

Unnamed: 0,PTS-Per-POSS,Won,More-AST-Than-TOV,PER,above 35,High-FG%-Shooting,game_result-Per-PTS,Played-Above-Avg-Min,prime,More than 3 Fouls,...,Above-Avg-FT%,below 20,game_location,GmSc-Per-PTS,TS%,Above-Avg-PTS-Per-POSS,below 35,GmSc,More-BLKs-Than-STLs,Over-3-BLKs-and-STLs
0,1.172608,0.0,1.0,29.277048,0.0,1.0,-0.56,1.0,0.0,0.0,...,0.0,0.0,0.0,0.988,0.583431,1.0,1.0,24.7,0.0,1.0
1,0.837321,0.0,1.0,16.8936,0.0,0.0,-0.428571,1.0,0.0,0.0,...,0.0,0.0,0.0,0.7,0.516605,0.0,1.0,14.7,0.0,0.0
2,0.537634,0.0,1.0,5.489179,0.0,0.0,-2.375,1.0,0.0,0.0,...,1.0,1.0,0.0,0.625,0.30888,0.0,0.0,5.0,0.0,0.0
3,0.611888,0.0,1.0,13.291049,0.0,0.0,-0.571429,1.0,0.0,0.0,...,1.0,1.0,1.0,1.6,0.305011,0.0,0.0,11.2,1.0,1.0
4,0.819088,0.0,0.0,8.698953,0.0,0.0,-0.043478,1.0,0.0,0.0,...,1.0,0.0,0.0,0.391304,0.539273,0.0,1.0,9.0,0.0,0.0


In [342]:
std_scaler = StandardScaler()

In [343]:
sig_no_opp_data = std_scaler.fit_transform(sig_df_no_opp.values)

In [344]:
X, y = sig_no_opp_data, target

In [345]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

<h4>Linear Regression</h4>

In [346]:
ln = LinearRegression()

In [347]:
ln.fit(X_train,y_train)

LinearRegression()

In [348]:
y_pred = ln.predict(X_test)

In [349]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.63 points


In [350]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 73.9381%


In [351]:
ml_performance = ("lin_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("lin_reg_w/out_opp_teams_w/out_pca",score)

In [352]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [353]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [354]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [355]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [356]:
sgd_grid_search.best_params_

{'epsilon': 0.1, 'loss': 'squared_loss'}

In [357]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [358]:
y_pred = best_sgd_reg.predict(X_test)

In [359]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.64 points


In [360]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 73.861%


In [361]:
ml_performance = ("sgd_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("sgd_reg_w/out_opp_teams_w/out_pca",score)

In [362]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [363]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [364]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [365]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [366]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [367]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [368]:
y_pred = best_rand_forest_reg.predict(X_test)

In [369]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.93 points


In [370]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 71.5052%


In [371]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("rand_forest_reg_w/out_opp_teams_w/out_pca",score)

In [372]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [373]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [374]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [375]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [376]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.75}

In [377]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [378]:
y_pred = best_elastic_net_reg.predict(X_test)

In [379]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.91 points


In [380]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 71.6826%


In [381]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w/out_pca",rmse)
ml_accuracy = ("elastic_net_reg_w/out_opp_teams_w/out_pca",score)

In [382]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h3>Performance Analysis with PCA</h3>

In [383]:
pca = PCA()

In [384]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [385]:
pca_cumsum = np.cumsum(pca.explained_variance_ratio_)

In [386]:
max_comp_idx = np.argmax(pca_cumsum>0.95)

In [387]:
pca = PCA(n_components=max_comp_idx+1)

In [388]:
sig_df_no_opp_w_pca = pca.fit_transform(sig_df_no_opp)

In [389]:
X, y = sig_df_no_opp_w_pca, target

In [390]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [391]:
ml_models_performances.append(ml_performance)

<h4>Linear Regression</h4>

In [392]:
ln = LinearRegression()

In [393]:
ln.fit(X_train,y_train)

LinearRegression()

In [394]:
y_pred = ln.predict(X_test)

In [395]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for linear regression is {round(rmse,2)} points")

The root mean squared error for linear regression is 6.68 points


In [396]:
score = ln.score(X_test,y_test) * 100
print(f"The accuracy for linear regression is {round(score,4)}%")

The accuracy for linear regression is 73.4874%


In [397]:
ml_performance = ("lin_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("lin_reg_w/out_opp_teams_w_pca",score)

In [398]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>SGD Regression</h4>

In [399]:
sgd_param_grid = [
    {
        "loss":["squared_loss","huber","epsilon_insensitive","squared_epsilon_insensitive"],
        "epsilon":[0.1,0.01,0.001]
    }
]

In [400]:
sgd_grid_search = GridSearchCV(SGDRegressor(),sgd_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [401]:
sgd_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=SGDRegressor(), n_jobs=-1,
             param_grid=[{'epsilon': [0.1, 0.01, 0.001],
                          'loss': ['squared_loss', 'huber',
                                   'epsilon_insensitive',
                                   'squared_epsilon_insensitive']}],
             scoring='neg_root_mean_squared_error')

In [402]:
sgd_grid_search.best_params_

{'epsilon': 0.01, 'loss': 'epsilon_insensitive'}

In [403]:
best_sgd_reg = sgd_grid_search.best_estimator_

In [404]:
y_pred = best_sgd_reg.predict(X_test)

In [405]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 6.72 points


In [406]:
score = best_sgd_reg.score(X_test,y_test) * 100
print(f"The accuracy for sgd regression is {round(score,4)}%")

The accuracy for sgd regression is 73.2026%


In [407]:
ml_performance = ("sgd_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("sgd_reg_w/out_opp_teams_w_pca",score)

In [408]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Random Forest Regression</h4>

In [409]:
rand_forest_param_grid = [
    {
        "n_estimators":[50,100,500],
        "criterion":["mse","mae"]
    }
]

In [410]:
rand_forest_grid_search = GridSearchCV(RandomForestRegressor(),rand_forest_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [411]:
rand_forest_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid=[{'criterion': ['mse', 'mae'],
                          'n_estimators': [50, 100, 500]}],
             scoring='neg_root_mean_squared_error')

In [412]:
rand_forest_grid_search.best_params_

{'criterion': 'mae', 'n_estimators': 500}

In [413]:
best_rand_forest_reg = rand_forest_grid_search.best_estimator_

In [414]:
y_pred = best_rand_forest_reg.predict(X_test)

In [415]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for SGD regression is {round(rmse,2)} points")

The root mean squared error for SGD regression is 7.3 points


In [416]:
score = best_rand_forest_reg.score(X_test,y_test) * 100
print(f"The accuracy for random forest regression is {round(score,4)}%")

The accuracy for random forest regression is 68.3846%


In [417]:
ml_performance = ("rand_forest_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("rand_forest_reg_w/out_opp_teams_w_pca",score)

In [418]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h4>Elastic Net Regression</h4>

In [419]:
elastic_net_param_grid = [
    {
        "l1_ratio":[0.25,0.5,0.75]
    }
]

In [420]:
elastic_net_grid_search = GridSearchCV(ElasticNet(),elastic_net_param_grid,cv=5,scoring="neg_root_mean_squared_error",n_jobs=-1)

In [421]:
elastic_net_grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=ElasticNet(), n_jobs=-1,
             param_grid=[{'l1_ratio': [0.25, 0.5, 0.75]}],
             scoring='neg_root_mean_squared_error')

In [422]:
elastic_net_grid_search.best_params_

{'l1_ratio': 0.25}

In [423]:
best_elastic_net_reg = elastic_net_grid_search.best_estimator_

In [424]:
y_pred = best_elastic_net_reg.predict(X_test)

In [425]:
rmse = mean_squared_error(y_test,y_pred,squared=False)
print(f"The root mean squared error for Elastic Net regression is {round(rmse,2)} points")

The root mean squared error for Elastic Net regression is 6.69 points


In [426]:
score = best_elastic_net_reg.score(X_test,y_test) * 100
print(f"The accuracy for elastic net regression is {round(score,4)}%")

The accuracy for elastic net regression is 73.4837%


In [427]:
ml_performance = ("elastic_net_reg_w/out_opp_teams_w_pca",rmse)
ml_accuracy = ("elastic_net_reg_w/out_opp_teams_w_pca",score)

In [428]:
ml_models_performances.append(ml_performance)
ml_model_accuracies.append(ml_accuracy)

<h2>Viewing Error/ Accuracy Scores For All the ML Models</h2>

In [429]:
for ml_perf in ml_models_performances:
    print(f"{ml_perf[0]}: {round(ml_perf[1],3)}")

lin_reg_w_opp_teams_w/out_pca: 7.069
sgd_reg_w_opp_teams_w/out_pca: 7.068
rand_forest_reg_w_opp_teams_w/out_pca: 6.934
elastic_net_reg_w_opp_teams_w/out_pca: 6.901
lin_reg_w_opp_teams_w_pca: 6.685
sgd_reg_w_opp_teams_w_pca: 6.725
rand_forest_reg_w_opp_teams_w_pca: 7.4
elastic_net_reg_w_opp_teams_w_pca: 6.685
lin_reg_w/out_opp_teams_w/out_pca: 6.628
sgd_reg_w/out_opp_teams_w/out_pca: 6.637
rand_forest_reg_w/out_opp_teams_w/out_pca: 6.93
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.908
elastic_net_reg_w/out_opp_teams_w/out_pca: 6.908
lin_reg_w/out_opp_teams_w_pca: 6.685
sgd_reg_w/out_opp_teams_w_pca: 6.72
rand_forest_reg_w/out_opp_teams_w_pca: 7.3
elastic_net_reg_w/out_opp_teams_w_pca: 6.685


In [430]:
for ml_acc in ml_model_accuracies:
    print(f"{ml_acc[0]}: {round(ml_acc[1],3)}%")

lin_reg_w_opp_teams_w/out_pca: 67.02%
sgd_reg_w_opp_teams_w/out_pca: 67.027%
rand_forest_reg_w_opp_teams_w/out_pca: 68.262%
elastic_net_reg_w_opp_teams_w/out_pca: 68.571%
lin_reg_w_opp_teams_w_pca: 0.735%
sgd_reg_w_opp_teams_w_pca: 73.167%
rand_forest_reg_w_opp_teams_w_pca: 67.511%
elastic_net_reg_w_opp_teams_w_pca: 73.484%
lin_reg_w/out_opp_teams_w/out_pca: 73.938%
sgd_reg_w/out_opp_teams_w/out_pca: 73.861%
rand_forest_reg_w/out_opp_teams_w/out_pca: 71.505%
elastic_net_reg_w/out_opp_teams_w/out_pca: 71.683%
lin_reg_w/out_opp_teams_w_pca: 73.487%
sgd_reg_w/out_opp_teams_w_pca: 73.203%
rand_forest_reg_w/out_opp_teams_w_pca: 68.385%
elastic_net_reg_w/out_opp_teams_w_pca: 73.484%
