## Modeling Yards

This notebook builds out a model to predict yardage.

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import metrics

np.random.seed(7)

In [3]:
df = pd.read_csv('../data/receiving_train.csv', index_col = 'Player')
df.dropna(inplace = True)
df.head()

Unnamed: 0_level_0,Tm,Age,G,GS,Tgt,Rec,Yds,TD,1D,YBC,...,Yds_-2_year,TD_-2_year,Rnd,Pick,Pos,YrsPlayed,Tgt_target,Rec_target,Yds_target,TD_target
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stefon Diggs,BUF,27,16,15,166,127,1535,8,73.0,1071,...,1021.0,9.0,5.0,146.0,WR,5.0,164,103,1225,10
Davante Adams,GNB,28,14,14,149,115,1374,18,73.0,777,...,1386.0,13.0,2.0,53.0,WR,6.0,169,123,1553,11
DeAndre Hopkins,ARI,28,16,16,160,115,1407,6,75.0,873,...,1572.0,11.0,1.0,27.0,WR,7.0,64,42,572,8
Darren Waller,LVR,28,16,15,145,107,1196,9,69.0,624,...,75.0,0.0,6.0,204.0,TE,5.0,93,55,665,2
Travis Kelce,KAN,31,15,15,145,105,1416,11,79.0,829,...,1336.0,10.0,3.0,63.0,TE,7.0,134,92,1125,9


In [4]:
df.shape

(985, 44)

### Baseline Score

In [5]:
df[['Tgt_target', 'Rec_target', 'Yds_target', 'TD_target']].mean()

Tgt_target     43.713706
Rec_target     29.422335
Yds_target    329.638579
TD_target       2.082234
dtype: float64

MSE? RMSE? Figure it out?

## Models

In [6]:
X = df.drop(columns = ['Tm', 'Pos', 'Player-additional', 'Tgt_target', 'Rec_target', 'Yds_target', 'TD_target', 'Year'])
y = df['Yds_target']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7)

In [8]:
lr = LinearRegression()

lr.fit(X_train, y_train)

print(f'Training Score: {lr.score(X_train, y_train)}')
print(f'Testing Score: {lr.score(X_test, y_test)}')

Training Score: 0.5789262927852644
Testing Score: 0.5331506133606787


In [9]:
knn = KNeighborsRegressor()

knn.fit(X_train, y_train)

print(f'Training Score: {knn.score(X_train, y_train)}')
print(f'Testing Score: {knn.score(X_test, y_test)}')

Training Score: 0.6239769433789424
Testing Score: 0.5020546069229741


In [10]:
dt = DecisionTreeRegressor()

dt.fit(X_train, y_train)

print(f'Training Score: {dt.score(X_train, y_train)}')
print(f'Testing Score: {dt.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.05657321808376281


In [11]:
rf = RandomForestRegressor()

rf.fit(X_train, y_train)

print(f'Training Score: {rf.score(X_train, y_train)}')
print(f'Testing Score: {rf.score(X_test, y_test)}')

Training Score: 0.9291951815545912
Testing Score: 0.5540393492840742


In [12]:
gr = GradientBoostingRegressor()

gr.fit(X_train, y_train)

print(f'Training Score: {gr.score(X_train, y_train)}')
print(f'Testing Score: {gr.score(X_test, y_test)}')

Training Score: 0.8421994439551886
Testing Score: 0.5407777849575314


In [13]:
ada = AdaBoostRegressor()

ada.fit(X_train, y_train)

print(f'Training Score: {ada.score(X_train, y_train)}')
print(f'Testing Score: {ada.score(X_test, y_test)}')

Training Score: 0.5718522955975213
Testing Score: 0.43307685841830457


## Ridge

Add Gridsearching

In [4]:
X = df.drop(columns = ['Tm', 'Pos', 'Player-additional', 'Tgt_target', 'Rec_target', 'Yds_target', 'TD_target', 'Year'])
y = df['Yds_target']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 7)

In [6]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

Ridge with Default alpha = 1.0

In [29]:
ridge = Ridge()

ridge.fit(X_train_sc, y_train)

print(f'Training Score: {ridge.score(X_train_sc, y_train)}')
print(f'Testing Score: {ridge.score(X_test_sc, y_test)}')

Training Score: 0.578773885432096
Testing Score: 0.5350012120007033


Find the best alpha

In [30]:
alphas = np.logspace(0, 5, 100)
ridge_cv = RidgeCV(alphas = alphas, cv = 5)
ridge_cv.fit(X_train_sc, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       6.89261210e+03, 7.74263683e+03, 8.69749003e+03, 9.77009957e+03,
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5)

In [31]:
ridge_cv.alpha_

166.81005372000593

In [32]:
ridge_cv.best_score_

0.5282910200355797

In [33]:
print(f'Training Score: {ridge_cv.score(X_train_sc, y_train)}')
print(f'Testing Score: {ridge_cv.score(X_test_sc, y_test)}')

Training Score: 0.564014046463213
Testing Score: 0.5333751173068081


## Lasso

Add Gridsearching

In [34]:
lasso = Lasso()

lasso.fit(X_train_sc, y_train)

print(f'Training Score: {lasso.score(X_train_sc, y_train)}')
print(f'Testing Score: {lasso.score(X_test_sc, y_test)}')

Training Score: 0.5740223742098975
Testing Score: 0.5412448614517376


In [36]:
l_alphas = np.logspace(-3, 0, 100)
lasso_cv = LassoCV(alphas = l_alphas, max_iter = 10000)
lasso_cv.fit(X_train_sc, y_train)

LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.09326033, 0.1       , 0.10722672, 0.1149757 , 0.12328467,
       0.13219411, 0.14174742, 0.15199111, 0.16297508, 0.17475284,
       0.18738174, 0.2009233 , 0.21544347, 0.23101297, 0.24770764,
       0.26560878, 0.28480359, 0.30538555, 0.32745492, 0.35111917,
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]),
        max_iter=10000)

In [37]:
lasso_cv.alpha_

0.8697490026177834

In [39]:
print(f'Training Score: {lasso_cv.score(X_train_sc, y_train)}')
print(f'Testing Score: {lasso_cv.score(X_test_sc, y_test)}')

Training Score: 0.5748385949178121
Testing Score: 0.5404734895851897


In [48]:
lasso_coefs = lasso_cv.coef_
lasso_coefs

array([  4.2928085 ,  -3.24277948, -15.51993144,  13.16296717,
        -0.        , 141.74467733,   9.35046946,  12.20664128,
        39.00095486,  -0.        ,   0.        ,   3.76436035,
        17.45343001,   2.65578764,   3.05300148,  -2.77404496,
        -0.89244985, -42.62625116, -11.53924148, -10.03511738,
        14.51759765,   3.1837112 ,  -1.86289095, -43.13293356,
        53.04938287,  37.28299393,  12.11321262,  -8.08734838,
         0.        ,  -0.        , -20.01727526,  50.78429477,
        10.892106  ,  -0.        , -29.21846684, -55.05416681])

In [50]:
lasso_coefs = pd.Series(lasso_coefs, X.columns)

In [51]:
lasso_coefs[lasso_coefs != 0].sort_values(ascending = False)

Yds            141.744677
Rec_-1_year     53.049383
Yds_-2_year     50.784295
YBC             39.000955
Yds_-1_year     37.282994
ADOT            17.453430
AllPro          14.517598
Tgt             13.162967
1D              12.206641
TD_-1_year      12.113213
TD_-2_year      10.892106
TD               9.350469
Age              4.292808
YAC/R            3.764360
G_-1_year        3.183711
Rec/Br           3.053001
BrkTkl           2.655788
Drop%           -0.892450
GS_-1_year      -1.862891
Drop            -2.774045
G               -3.242779
G_-2_year       -8.087348
ProBowl        -10.035117
Rat            -11.539241
GS             -15.519931
Rec_-2_year    -20.017275
Pick           -29.218467
Int            -42.626251
Tgt_-1_year    -43.132934
YrsPlayed      -55.054167
dtype: float64

One Standard Deviation increase in Yds will increase target yard projections by 141.745

## ElasticNet?

Does this need to be scaled? Gridsearch?

In [52]:
enet_alphas = np.linspace(0.5, 1.0, 100)
enet_ratio = 0.5
enet_model = ElasticNetCV(alphas = enet_alphas, l1_ratio = enet_ratio, cv=5, max_iter = 100000)
enet_model = enet_model.fit(X_train, y_train)

print(f'Training Score: {enet_model.score(X_train, y_train)}')
print(f'Testing Score: {enet_model.score(X_test, y_test)}')

Training Score: 0.5750718201139815
Testing Score: 0.548132156462948


## Principal Component Analysis

Figure out how this works!

In [7]:
pca = PCA(random_state = 7)

In [8]:
X_train_sc = pca.fit_transform(X_train)
X_test_sc = pca.transform(X_test)

In [12]:
var_exp = pca.explained_variance_ratio_
print(f'Explained variance (first 20 components): {np.round(var_exp[:20], 3)}')

print('')

cum_var_exp = np.cumsum(var_exp)
print(f'Cumulative explained variance (first 20 components): {np.round(cum_var_exp[:20], 3)}')

Explained variance (first 12 components): [0.772 0.105 0.066 0.04  0.015 0.001 0.    0.    0.    0.    0.    0.   ]

Cumulative explained variance (first 12 components): [0.772 0.877 0.942 0.982 0.997 0.998 0.999 0.999 0.999 1.    1.    1.   ]


In [31]:
pca = PCA(n_components = 16, random_state = 7)

X_train_sc = pca.fit_transform(X_train)
X_test_sc = pca.transform(X_test)

lr2 = LinearRegression()

lr2.fit(X_train_sc, y_train)

var_exp = pca.explained_variance_ratio_
print(f'Explained variance (first 20 components): {np.round(var_exp[:20], 3)}')
print(f'Cumulative explained variance (first 20 components): {np.round(cum_var_exp[:20], 3)}')

print(f'Training Score: {round(lr2.score(X_train_sc, y_train),4)}')
print(f'Testing Score: {round(lr2.score(X_test_sc, y_test),4)}')

Explained variance (first 20 components): [0.772 0.105 0.066 0.04  0.015 0.001 0.    0.    0.    0.    0.    0.
 0.    0.    0.    0.   ]
Cumulative explained variance (first 20 components): [0.772 0.877 0.942 0.982 0.997 0.998 0.999 0.999 0.999 1.    1.    1.
 1.    1.    1.    1.    1.    1.    1.    1.   ]
Training Score: 0.5555
Testing Score: 0.5644
