# Final Models and Predictions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor 
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn import metrics

# Seed NumPy's global random number generator once, up front, so repeated
# top-to-bottom runs of the notebook start from the same random state.
SEED = 7
np.random.seed(SEED)

In [2]:
# Read the train and test tables (indexed by the 'Player' column) and discard
# every row that contains at least one missing value.
df_train = pd.read_csv('../data/receiving_train.csv', index_col = 'Player').dropna()
df_test = pd.read_csv('../data/receiving_test.csv', index_col = 'Player').dropna()

## Yardage Model

For our final Yardage model, I used a Linear Regression model. On our training set, it had a training score of .579 and a testing score of .533. I chose this model because it was not as overfit as many of our other models, while still giving a decent testing score compared to them.

In [3]:
# Features: everything in the training frame except the identifier columns,
# the year, and the four *_target columns.
non_feature_cols = ['Tm', 'Pos', 'Player-additional', 'Tgt_target', 'Rec_target', 'Yds_target', 'TD_target', 'Year']
X = df_train.drop(non_feature_cols, axis = 'columns')
# Target for this section: the 'Yds_target' column.
y = df_train.loc[:, 'Yds_target']

In [4]:
# Fit a linear regression on the full training data and report its
# in-sample score (the value returned by the estimator's .score()).
lr = LinearRegression().fit(X, y)
train_score = lr.score(X, y)
print(f'Training Score: {train_score}')

Training Score: 0.5752758598185246


In [5]:
# Strip the non-feature columns from the test frame, then predict with the fitted model.
test_features = df_test.drop(['Tm', 'Pos', 'Player-additional', 'Year'], axis = 'columns')
preds = lr.predict(test_features)

In [6]:
# One row per test-set player: predictions sorted from highest to lowest,
# rounded to whole numbers, in a column named 'Yards'.
yard_preds = (
    pd.DataFrame(preds, index = df_test.index)
    .sort_values(by = 0, ascending = False)
    .round()
    .rename(columns = {0: 'Yards'})
)
yard_preds.head(20)

Unnamed: 0_level_0,Yards
Player,Unnamed: 1_level_1
Cooper Kupp,1365.0
Justin Jefferson,1359.0
Davante Adams,1240.0
Mike Williams,1111.0
Stefon Diggs,1089.0
D.K. Metcalf,1062.0
Tyler Lockett,1035.0
Deebo Samuel,1016.0
Ja'Marr Chase,1010.0
A.J. Brown,977.0


## Touchdowns Model

For our final Touchdown model, I used a Stacking model that combined RandomForest, GradientBoost, AdaBoost, and Lasso, and then fed those results into a Linear Regression model. On our training set, it had a training score of .332 and a testing score of .314. I chose this model because all of our touchdown models had relatively low testing scores, but this model did not appear to be overfit and scored at around the same level as the rest of the models.

In [7]:
# Features: everything in the training frame except the identifier columns,
# the year, and the four *_target columns.
non_feature_cols = ['Tm', 'Pos', 'Player-additional', 'Tgt_target', 'Rec_target', 'Yds_target', 'TD_target', 'Year']
X = df_train.drop(non_feature_cols, axis = 'columns')
# Target for this section: the 'TD_target' column.
y = df_train.loc[:, 'TD_target']

In [8]:
# Lasso is the only base learner wrapped in a pipeline: its inputs are
# standardised before LassoCV is fit.
lasso_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', LassoCV())
])

# First-level estimators whose predictions feed the final estimator.
level1_models = [
    ('rf', RandomForestRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('ada', AdaBoostRegressor()),
    ('lasso_pipe', lasso_pipe)
]

# Stack the base learners under a linear-regression final estimator,
# fit on the full training data, and report the in-sample score.
stack = StackingRegressor(
    estimators = level1_models,
    final_estimator = LinearRegression()
)
stack.fit(X, y)

train_score = stack.score(X, y)
print(f'Training Score: {train_score}')

Training Score: 0.3704333930233602


In [9]:
# Strip the non-feature columns from the test frame, then predict with the fitted stack.
test_features = df_test.drop(['Tm', 'Pos', 'Player-additional', 'Year'], axis = 'columns')
preds = stack.predict(test_features)

In [10]:
# One row per test-set player: predictions sorted from highest to lowest,
# rounded to whole numbers, in a column named 'Touchdowns'.
TD_preds = (
    pd.DataFrame(preds, index = df_test.index)
    .sort_values(by = 0, ascending = False)
    .round()
    .rename(columns = {0: 'Touchdowns'})
)
TD_preds.head(20)

Unnamed: 0_level_0,Touchdowns
Player,Unnamed: 1_level_1
Cooper Kupp,9.0
Davante Adams,8.0
Justin Jefferson,8.0
D.K. Metcalf,7.0
Mike Evans,7.0
Tyreek Hill,7.0
Stefon Diggs,7.0
Mark Andrews,7.0
Tyler Lockett,7.0
Travis Kelce,6.0


## Receptions Model

For our final Receptions model, I used a Linear Regression model. On our training set, it had a training score of .542 and a testing score of .474. I chose this model because it had a relatively high testing score and did not appear too overfit.

In [11]:
# Features: everything in the training frame except the identifier columns,
# the year, and the four *_target columns.
non_feature_cols = ['Tm', 'Pos', 'Player-additional', 'Tgt_target', 'Rec_target', 'Yds_target', 'TD_target', 'Year']
X = df_train.drop(non_feature_cols, axis = 'columns')
# Target for this section: the 'Rec_target' column.
y = df_train.loc[:, 'Rec_target']

In [17]:
# Fit a linear regression on the full training data and report its
# in-sample score (the value returned by the estimator's .score()).
lr = LinearRegression().fit(X, y)
train_score = lr.score(X, y)
print(f'Training Score: {train_score}')

Training Score: 0.5336970755809434


In [18]:
# Strip the non-feature columns from the test frame, then predict with the fitted model.
test_features = df_test.drop(['Tm', 'Pos', 'Player-additional', 'Year'], axis = 'columns')
preds = lr.predict(test_features)

In [19]:
# One row per test-set player: predictions sorted from highest to lowest,
# rounded to whole numbers, in a column named 'Receptions'.
rec_preds = (
    pd.DataFrame(preds, index = df_test.index)
    .sort_values(by = 0, ascending = False)
    .round()
    .rename(columns = {0: 'Receptions'})
)
rec_preds.head(20)

Unnamed: 0_level_0,Receptions
Player,Unnamed: 1_level_1
Cooper Kupp,116.0
Davante Adams,107.0
Justin Jefferson,100.0
Stefon Diggs,85.0
Chris Godwin,84.0
Tyler Lockett,84.0
Mike Williams,82.0
Deebo Samuel,81.0
Diontae Johnson,79.0
Tyreek Hill,79.0


## Final Predictions

The code below combines the predictions from our three models into a single table and then exports it as `predictions.csv` in the data folder. Below are the top 20 players in our projection, sorted by Receptions.

In [20]:
# Left-join the yardage and touchdown frames onto the receptions frame by
# player index, so the result keeps the row order of rec_preds.
index_join = dict(how = 'left', left_index = True, right_index = True)
final_preds = (
    rec_preds
    .merge(yard_preds, **index_join)
    .merge(TD_preds, **index_join)
)
final_preds.head(20)

Unnamed: 0_level_0,Receptions,Yards,Touchdowns
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Cooper Kupp,116.0,1365.0,9.0
Davante Adams,107.0,1240.0,8.0
Justin Jefferson,100.0,1359.0,8.0
Stefon Diggs,85.0,1089.0,7.0
Chris Godwin,84.0,972.0,5.0
Tyler Lockett,84.0,1035.0,7.0
Mike Williams,82.0,1111.0,5.0
Deebo Samuel,81.0,1016.0,5.0
Diontae Johnson,79.0,893.0,6.0
Tyreek Hill,79.0,969.0,7.0


In [26]:
# NOTE(review): the export below is commented out, so running this notebook
# does not write '../data/predictions.csv'. Uncomment the line to (re)generate
# the file from final_preds.
#final_preds.to_csv('../data/predictions.csv', index = True)