# NFL RECEIVING YARDAGE PREDICTOR

Big data is becoming a big part of our world and sports are no exception. Teams are increasingly using advanced analytics to find an edge over their competition.

One such example of this is using modeling to find out which metrics are most predictive of future success. For example, what are the elements of wide receiver play in the NFL that are most important?

**Problem to be solved:** Do past receiving yards do a good job of predicting future receiving yards or are there other metrics that can do a better job?

**Key Metrics**

*   Receiving yards -- how many yards a player gains 
*   Targets -- how often the player is thrown the ball
*   Completed pass/completion/reception -- when a player successfully catches a target
*   Air Yards -- how far in the air the ball travels before the player attempts to catch the ball
*   aDOT -- Average depth of target, or air yards per target
*   Yardage after catch (YAC) -- the yardage the receiver gains after the catch
*   Implied team total -- how many points a player's team is projected to score in the given game; this is derived from betting markets


**Methodology**


1.  Build a backwards-looking expected yardage model, based on inputs such as air yardage and field position.
2.   Assess how this model and/or other factors predict future receiving yards.

**Data Sources**

*   *nflverse (https://nflverse.nflverse.com/)*
*   *nfl_data_py (https://github.com/cooperdff/nfl_data_py)*












## 1. Data Prep

In [None]:
#install nfl_data_py package
!pip install nfl_data_py


In [None]:
import nfl_data_py as nfl


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
import nfl_data_py as nfl

%matplotlib inline

In [None]:
#Calculate the market-implied team total, derived from the betting lines

def total_finder(home_or_away,home_total,away_total):
    """Pick the implied total that belongs to one side of a game.

    home_or_away: 'home' selects home_total; any other value selects away_total.
    home_total: implied total for the home team.
    away_total: implied total for the away team.
    Returns the selected total unchanged.
    """
    return home_total if home_or_away == 'home' else away_total

In [None]:
#load in ten years worth of data for NFL pass attempts

# Collect one frame of pass attempts per season, then combine them once at the end.
all_throws = []

for YEAR in tqdm(range(2012,2022)):
    data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/'
                       'play_by_play_' + str(YEAR) + '.csv.gz',
                       compression='gzip', low_memory=False)

    #we only want regular season data
    #(copy so the column assignments below write to an independent frame)
    data = data.loc[data.season_type=='REG'].copy()
    data['season'] = YEAR

    #keep pass/run/no_play rows that have an EPA value and a known possession side
    keep = (data.play_type.isin(['no_play','pass','run'])
            & data.epa.notna()
            & data['posteam_type'].notna())
    data = data.loc[keep].copy()

    data.loc[data['pass']==1, 'play_type'] = 'pass'
    data.loc[data.rush==1, 'play_type'] = 'run'

    data.reset_index(drop=True, inplace=True)

    #two-point attempts are excluded
    data = data[data['two_point_attempt']==0].copy()

    #derive implied team total from betting market data
    data['home_implied_total'] = abs(data['total_line']/2 + data['spread_line']/2)
    data['away_implied_total'] = abs(data['total_line']/2 - data['spread_line']/2)

    #vectorized form of the per-row lookup: 'home' -> home total, anything else -> away total
    data['implied_posteam_total'] = np.where(data['posteam_type'] == 'home',
                                             data['home_implied_total'],
                                             data['away_implied_total'])

    #we only want throws, aka plays with air yardage (no running plays, sacks, etc.)
    throws = data[data['air_yards'].notna()]

    df = throws[['receiver_player_name','posteam','game_id','complete_pass','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit']]
    all_throws.append(df)

#concatenate once, after every season has been loaded (previously re-done on each iteration)
all_throws_df = pd.concat(all_throws)






## 2. Play-by-Play Yardage Predictor

In [None]:
#Drop throws without a listed targeted receiver.
#.copy() gives an independent frame, so adding a column below does not
#trigger pandas' SettingWithCopyWarning.
all_throws = all_throws_df.dropna(subset=['receiver_player_name']).copy()

#Create column for throws to the middle of the field (1 for throws to middle, 0 for throws elsewhere)
all_throws['middle'] = (all_throws['pass_location']=='middle')*1

We now have all the relevant features for our model. They are the following:


*   **Down** (whether a play comes on first, second, third or fourth down)
*   **Yardline_100** (where on the field a play begins)
*   **Yards to go** (how many yards a team needs for a new first down or touchdown)
*   **Implied team total**
*   **Air yards**
*   **Middle**
*   **QB_hit** (Whether or not the passer was hit while throwing)

The target variable is **yards_gained**.






In [None]:
model_df = all_throws[['yards_gained','down','yardline_100','ydstogo','implied_posteam_total','air_yards','middle','qb_hit']]

model_df.corr()[['yards_gained']].sort_values('yards_gained',ascending=False)

In [None]:

X = model_df.drop(columns={'yards_gained'})
y = model_df['yards_gained']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)


In [None]:
#hyperparameters tuned via grid search (see the GridSearchCV cells below)
from sklearn.ensemble import GradientBoostingRegressor
play_by_play_model= GradientBoostingRegressor(max_depth=5, min_samples_split=1500, n_estimators=50).fit(X_train, y_train)

In [None]:
play_by_play_model.score(X_train, y_train)


In [None]:
play_by_play_model.score(X_test, y_test)

In [None]:
import pickle
pickle.dump(play_by_play_model, open('play_by_play.pkl', 'wb'))


In [None]:
# Table of the fitted model's feature importances, highest first.
# Feature names come from X's columns, which the code pairs positionally
# with play_by_play_model.feature_importances_.
feature_list = list(X.columns)

feature_df = pd.DataFrame({'Feature': feature_list,
                           'Importance': play_by_play_model.feature_importances_})
feature_df = feature_df.sort_values('Importance',ascending = False)

# already sorted; display as-is
feature_df

The model tells us that air yards are by far the most important element of predicting how many yards a pass will gain, followed by field position.

In [None]:

param_grid = {'n_estimators':range(10,100,20),
             'max_depth':range(1,21,4), 'min_samples_split':range(500,2001,500)}

gsearch1 = sklearn.model_selection.GridSearchCV(GradientBoostingRegressor(), param_grid, cv=3,refit = True, verbose = 3)

gsearch1.fit(X_train, y_train)

In [None]:
print("\n The best parameters across ALL searched params:\n",gsearch1.best_params_)

In [None]:
parameters = {'learning_rate': [0.0001, 0.001, 0.01],
                  'subsample'    : [0.9, 0.5, 0.2, 0.1],
                  'n_estimators' : [25, 50, 100,500,1000],
                  'max_depth'    : [1,2,5,10],
              'min_samples_split':[100,500,1000,1500]
                 }

In [None]:
grid_GBR = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid = parameters, cv = 2, n_jobs=-1,verbose=3)

grid_GBR.fit(X_train, y_train)


print("\n The best estimator across ALL searched params:\n",grid_GBR.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)

In [None]:
all_throws['xYards']=play_by_play_model.predict(X)


In [None]:
all_throws['xYards'].corr(all_throws['yards_gained'])

In [None]:
all_throws['xYards'].corr(all_throws['yards_gained'])> 0.3334010562798367

In [None]:
#look at receivers for this year
all_throws = []

for YEAR in range(2022,2023):
    data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(YEAR) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = data.loc[data.season_type=='REG']
    data['season'] = YEAR
    data = data.loc[(data.play_type.isin(['no_play','pass','run'])) & (data.epa.isna()==False)]

    data = data[data['posteam_type'].isna()==False]

    data.loc[data['pass']==1, 'play_type'] = 'pass'
    data.loc[data.rush==1, 'play_type'] = 'run'

    data.reset_index(drop=True, inplace=True)

    data = data[data['two_point_attempt']==0]

    #derive each side's implied total from the betting lines
    data['home_implied_total'] = abs(data['total_line']/2 + data['spread_line']/2)
    data['away_implied_total'] = abs(data['total_line']/2 - data['spread_line']/2)

    #vectorized form of the per-row lookup: 'home' -> home total, anything else -> away total
    data['implied_posteam_total'] = np.where(data['posteam_type'] == 'home',
                                             data['home_implied_total'],
                                             data['away_implied_total'])
    throws = data[data['air_yards'].isna()==False]
    df = throws[['receiver_player_name','posteam','game_id','complete_pass','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit','yards_after_catch','xyac_mean_yardage','cp','cpoe']]
    all_throws.append(df)
    current_szn_df=pd.concat(all_throws)


In [None]:
current_szn_df = current_szn_df.dropna(subset=['receiver_player_name'])

#Create column for throws to the middle (1 for throws to middle, 0 for throws elsewhere)                               
current_szn_df['middle'] = (current_szn_df['pass_location']=='middle')*1 

In [None]:
current_szn_df = current_szn_df[['receiver_player_name','week','posteam','yards_gained','down','yardline_100','ydstogo','implied_posteam_total','air_yards','middle','qb_hit']].set_index(['receiver_player_name','posteam'])

In [None]:
current_x = current_szn_df.drop(['yards_gained','week'],axis=1)

In [None]:
current_szn_df['xYards']=play_by_play_model.predict(current_x)
current_szn_df = current_szn_df.reset_index()

receivers = current_szn_df.groupby(['receiver_player_name','posteam']).agg({'down':'count','air_yards':'sum','yards_gained':'sum','xYards':'sum','week':'nunique'})
                                                                            
                                                                            
                                                                            #'complete_pass':'sum','cp':'sum','yards_after_catch':'sum','xyac_mean_yardage':'sum'})

receivers.rename(columns={'down':'targets','week':'games'},inplace=True)

In [None]:
receivers['Diff'] = (receivers['yards_gained'] - receivers['xYards']).round(1)

print('First 10 weeks of the 2022 season:')
print('Correlation between actual yards and predicted yards gained for receivers: {}'.format((receivers['yards_gained'].corr(receivers['xYards']).round(2))))

### 2a. Leaders, Overachievers and Underachievers

In [None]:
receivers.sort_values('xYards',ascending=False).head(10)[['games','targets','yards_gained','xYards','Diff']].round(1)

In [None]:
receivers.sort_values('Diff',ascending=False).head(10)[['games','targets','yards_gained','xYards','Diff']].round(1)

In [None]:
receivers.sort_values('Diff',ascending=True).head(10)[['targets','yards_gained','xYards','Diff']]

## 3. Predicting future performance

Now that we have a model for expected yardage on each play, we can see how well it predicts future performance, while also looking at other factors that predict future performance.

### 3. Year n to Year n+1

In [None]:
#load in ten years worth of data for NFL pass attempts

all_throws = []
n_szn = []
plus_1_szn = []

for YEAR in tqdm(range(2012,2022)):
    data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(YEAR) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = data.loc[data.season_type=='REG']
    data['season'] = YEAR
    data = data.loc[(data.play_type.isin(['no_play','pass','run'])) & (data.epa.isna()==False)]

    data = data[data['posteam_type'].isna()==False]

    data.loc[data['pass']==1, 'play_type'] = 'pass'
    data.loc[data.rush==1, 'play_type'] = 'run'

    data.reset_index(drop=True, inplace=True)

    data = data[data['two_point_attempt']==0]

    #derive each side's implied total from the betting lines
    data['home_implied_total'] = abs(data['total_line']/2 + data['spread_line']/2)
    data['away_implied_total'] = abs(data['total_line']/2 - data['spread_line']/2)

    #vectorized form of the per-row lookup: 'home' -> home total, anything else -> away total
    data['implied_posteam_total'] = np.where(data['posteam_type'] == 'home',
                                             data['home_implied_total'],
                                             data['away_implied_total'])
    throws = data[data['air_yards'].isna()==False]
    df = throws[['receiver_player_name','posteam','game_id','complete_pass','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','cp','xyac_mean_yardage','qb_hit','yards_after_catch']]
  
  #apply model to the season's stats

    current_szn_df = df.dropna(subset=['receiver_player_name'])

#Create column for throws to the middle (1 for throws to middle, 0 for throws elsewhere)                               
    current_szn_df['middle'] = (current_szn_df['pass_location']=='middle')*1 
    current_szn_df['goal_to_go'] = (current_szn_df['yardline_100']<=10)*1 


    current_szn_df = current_szn_df[['receiver_player_name','posteam','yards_gained','down','yardline_100','ydstogo','implied_posteam_total','air_yards','middle','qb_hit','complete_pass','yards_after_catch','cp','xyac_mean_yardage','goal_to_go']].set_index(['receiver_player_name','posteam'])
    current_x = current_szn_df.drop(['yards_gained','goal_to_go','complete_pass','yards_after_catch','cp','xyac_mean_yardage'],axis=1)
    current_szn_df['xYards']=play_by_play_model.predict(current_x)
    current_szn_df = current_szn_df.reset_index()

#create separate DF for YAC, looking only at completed passes; then merge this back with main DF
    yac_df = current_szn_df[current_szn_df['complete_pass']==1]
    yac_df = yac_df.groupby(['receiver_player_name','posteam']).sum()
    yac_df['yac/rec'] = yac_df['yards_after_catch']/yac_df['complete_pass']
    yac_df['xyac/rec'] = yac_df['xyac_mean_yardage']/yac_df['complete_pass']
    yac_df =yac_df[['xyac/rec','yac/rec','yards_after_catch','xyac_mean_yardage']]

    receivers = current_szn_df.groupby(['receiver_player_name','posteam']).agg({'down':'count','air_yards':'mean','yards_gained':'sum','xYards':'sum','complete_pass':'sum','cp':'mean','qb_hit':'mean','yardline_100':'mean','goal_to_go':'mean'})
    receivers['catch_rate'] = receivers['complete_pass']/receivers['down']                                                                         




    receivers.rename(columns={'down':'targets'},inplace=True)



    receivers['Diff'] = receivers['yards_gained'] - receivers['xYards']
    receivers = receivers.merge(yac_df,right_index=True,left_index=True)


    receivers['Szn'] = YEAR
  
  
  
  
  
  
  
    n_szn.append(receivers)



    n_szn_df=pd.concat(n_szn)
    n_minus_one_df = pd.concat(n_szn)
    n_minus_one_df['Szn'] = n_minus_one_df['Szn'] + 1
    n_minus_one_df.set_index('Szn',append=True,inplace=True)
    n_szn_df.set_index('Szn',append=True,inplace=True)



In [None]:
n_minus_one_df = n_minus_one_df[n_minus_one_df['targets']>=50]

n_szn_df = n_szn_df[n_szn_df['targets']>=50]

In [None]:
df = n_minus_one_df.merge(n_szn_df,left_index=True,right_index=True)

In [None]:


df['cpoe_x'] = df['catch_rate_x'] - df['cp_x']

df['cpoe_y'] = df['catch_rate_y'] - df['cp_y']

#df

In [None]:
# Every column whose name contains '_x', plus next season's actual yardage
# so it can serve as the correlation target.
season_n_list = [name for name in df.columns if "_x" in name] + ['yards_gained_y']

In [None]:
df[season_n_list].corr().sort_values('yards_gained_y',ascending=False)[['yards_gained_y']].round(2)

*Among previous season stats (denoted with the suffix '_x'), actual yardage gained and expected yardage gained have a similar correlation with next season yardage gained.*

**What explains the difference between xYards and actual yardage gained?**

In [None]:
df['xYards/target_x'] = df['xYards_x']/df['targets_x']
df['xYards/target_y'] = df['xYards_y']/df['targets_y']

df['Yards/target_x'] = df['yards_gained_x']/df['targets_x']
df['Yards/target_y'] = df['yards_gained_y']/df['targets_y']

df['yac_oe_x'] = df['yac/rec_x'] - df['xyac/rec_x']
df['yac_oe_y'] = df['yac/rec_y'] - df['xyac/rec_y']

In [None]:
df.corr().sort_values('Diff_y',ascending=False)[['Diff_y']]

**Year-to-year stability of various metrics**

In [None]:
metric_list = ['yards_gained','xYards','xYards/target','Yards/target','yac_oe','qb_hit','Diff','complete_pass','catch_rate','targets','air_yards','cp','cpoe','xyac/rec','yac/rec']

In [None]:
# Correlation between each metric's '_x' column and its '_y' column, rounded to 2 decimals.
# A comprehension keeps the intermediate names local, so the modelling target `y`
# defined earlier in the notebook is no longer overwritten by a column-name string.
corr_dict = {
    metric: round(df[metric + '_x'].corr(df[metric + '_y']), 2)
    for metric in metric_list
}


In [None]:
pd.DataFrame([corr_dict]).transpose().sort_values(0,ascending=False).rename(columns={0:'r'})

**There is more stability in expected yardage from one year to the next (r=0.60) than actual yardage (0.55).** The delta between the two metrics ("Diff") is not very stable from one year to the next (0.21).

This makes sense, since air yards are among the most stable metrics from one year to the next (r=0.91), and air yards are the most important element of expected yards and expected yards per target (air yards also are a big part of other metrics with high stability, such as completion probability (CP) and xYards after catch per reception).

Also, metrics like completion percentage above expectation\* (CPOE) and yards-after-catch above expectation\* (YAC_OE) that make up much of the in-season difference between xYards and actual yards are not as stable from one year to the next (between season n and season n+1, r=0.38 for yac_oe, 0.29 for CPOE).


*CPOE and YAC_OE derive from the nflverse's built-in expected completion and expected YAC models. Further reading: https://www.opensourcefootball.com/posts/2020-09-28-nflfastr-ep-wp-and-cp-models/*

**Other conclusions:**

-A receiver's ability to earn targets seems more consistent than their ability to turn targets into catches (the year-to-year correlation for targets is 0.51, while the year-to-year correlation for completion percentage over expectation is just 0.29).

-Air yards are considerably more stable than yardage after the catch (r=0.91 for air yards year-over-year, compared to a mark of 0.38 for yards after catch above expectation).

**-Therefore, measuring "opportunity" itself (namely, the number of targets and the expected value of these targets) seems more important to making predictions than actually measuring what a receiver does with this opportunity.**

### In-season predictions

In this section, I will break down the last 10 seasons into six-week segments and see how well the data in these segments predict the following week.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import tqdm.notebook as tq


In [None]:
#correct a few formatting errors in the data

def last_name(full):
  """Return the part of an abbreviated name after its first period.

  Two names that lack the period ('DanielThomas', 'JulioJones') are mapped
  by hand. Any other name without a period is printed and returned unchanged.
  """
  manual_fixes = {'DanielThomas': 'Thomas', 'JulioJones': 'Jones'}
  if full in manual_fixes:
    return manual_fixes[full]
  if '.' in full:
    # split only on the first period so surnames containing one stay intact
    return full.split(".",1)[1]
  print(full)
  return full

In [None]:
#fix team abbreviations for teams that have moved cities

def team_fixer(team):
    """Map a relocated franchise's old abbreviation to its current one.

    'SD' -> 'LAC', 'OAK' -> 'LV', 'STL' -> 'LA'; any other value is returned as-is.
    """
    relocated = {'SD': 'LAC', 'OAK': 'LV', 'STL': 'LA'}
    return relocated.get(team, team)

In [None]:
def snap_name_changer(player):
    """Strip a trailing generational suffix from a player's full name.

    If the name contains 'Jr.', everything from ' Jr.' onward is dropped;
    otherwise, if it contains 'II', everything from ' II' onward is dropped
    (so ' III' is trimmed as well). Other names are returned unchanged.
    """
    # (marker that triggers the rule, separator the name is cut at), in priority order
    suffix_rules = (('Jr.', ' Jr.'), ('II', ' II'))
    for marker, separator in suffix_rules:
        if marker in player:
            return player.split(separator)[0]
    return player

In [None]:
#load in ten years worth of data for NFL pass attempts

szn_receivers=[]
target_list=[]
  
df_list = []
weekly_list = []
target_week = [7,8,9,10,11,12,13,14,15,16,17,18]
week_list1= []
target_weeks=[]


all_throws = []
n_szn = []
plus_1_szn = []

for YEAR in tqdm(range(2012,2022)):
    print(YEAR)
    data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(YEAR) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data = data.loc[data.season_type=='REG']
    data['season'] = YEAR
    data = data.loc[(data.play_type.isin(['no_play','pass','run'])) & (data.epa.isna()==False)]

    data = data[data['posteam_type'].isna()==False]

    data.loc[data['pass']==1, 'play_type'] = 'pass'
    data.loc[data.rush==1, 'play_type'] = 'run'

    data.reset_index(drop=True, inplace=True)

    data = data[data['two_point_attempt']==0]

    #derive each side's implied total from the betting lines
    data['home_implied_total'] = abs(data['total_line']/2 + data['spread_line']/2)
    data['away_implied_total'] = abs(data['total_line']/2 - data['spread_line']/2)

    #vectorized form of the per-row lookup: 'home' -> home total, anything else -> away total
    data['implied_posteam_total'] = np.where(data['posteam_type'] == 'home',
                                             data['home_implied_total'],
                                             data['away_implied_total'])
    throws = data[data['air_yards'].isna()==False]
    df = throws[['receiver_player_name','week','posteam','game_id','complete_pass','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','pass_touchdown','down','pass_location','season','home_implied_total','away_implied_total','posteam_type','cp','xyac_mean_yardage','qb_hit','yards_after_catch','receiver_player_id']]

    current_szn_df = df.dropna(subset=['receiver_player_name'])

    #Create column for throws to the middle (1 for throws to middle, 0 for throws elsewhere)                               
    current_szn_df['middle'] = (current_szn_df['pass_location']=='middle')*1 
    current_szn_df['goal_to_go'] = (current_szn_df['yardline_100']<=10)*1 

    current_szn_df = current_szn_df[['receiver_player_name','week','posteam','yards_gained','down','yardline_100','ydstogo','implied_posteam_total','air_yards','middle','qb_hit','complete_pass','yards_after_catch','cp','xyac_mean_yardage','goal_to_go','receiver_player_id']].set_index(['receiver_player_name','posteam'])
    current_szn_df['year'] = YEAR
  
    #apply model to the season's stats
    current_x = current_szn_df.drop(['yards_gained','year','week','goal_to_go','complete_pass','yards_after_catch','cp','xyac_mean_yardage','receiver_player_id'],axis=1)
    current_szn_df['xYards']=play_by_play_model.predict(current_x)
    current_szn_df = current_szn_df.reset_index()
  
    #create separate DF for YAC, looking only at completed passes; then merge this back with main DF
    yac_df = current_szn_df[current_szn_df['complete_pass']==1]
    weekly_yac = yac_df.groupby(['receiver_player_name','posteam','week']).agg({'complete_pass':'sum','yards_after_catch':'sum','xyac_mean_yardage':'sum'})
    
    
    #create new DF for team pass attempts to calculate metrics such as target share (targets per team attempts)
    weekly_pass_offense=current_szn_df.groupby(['posteam','week']).agg({'week':'count','air_yards':'sum'})
    weekly_pass_offense.rename(columns={'week':'team_attempts','air_yards':'team_air_yards'},inplace=True)
    team_attempts=weekly_pass_offense.reset_index()[['posteam','week','team_attempts','team_air_yards']]
    weekly_rec_group = current_szn_df.groupby(['receiver_player_name','posteam','week']).agg({'week':'max','down':'count','complete_pass':'sum','yards_gained':'sum','air_yards':'sum','xYards':'sum','cp':'sum','implied_posteam_total':'max','goal_to_go':'sum','qb_hit':'sum','middle':'sum','goal_to_go':'sum','receiver_player_id':'max'})
    weekly_rec_group.rename(columns={'week':'week_#','down':'targets'},inplace=True)
    weekly_rec_group = weekly_rec_group.merge(weekly_yac[['yards_after_catch','xyac_mean_yardage']],right_index=True,left_index=True)
    
    
    feature_receivers = weekly_rec_group.reset_index()

    feature_receivers.reset_index().set_index(['receiver_player_id','posteam','week'])
    
    week_df1 = feature_receivers.copy()
    week_df = week_df1.copy()
    week_df = week_df.reset_index().set_index(['receiver_player_name','posteam'])
    feature_df = feature_receivers.copy()
    
    
    #load in snap count data; this is important because players who go entire games without a target will not show up in the data for that game
    snap_df = nfl.import_snap_counts([YEAR])
    snap_df['team'] = snap_df['team'].apply(lambda x: team_fixer(x))
    snap_df = snap_df[snap_df['game_type']=='REG']
    snap_df = snap_df[['pfr_player_id','player','offense_snaps','offense_pct','week','team','game_id','position']]
    snap_df['player_copy'] = snap_df['player']
    snap_df.rename(columns={'player_copy':'full_name'},inplace=True)
    snap_df['full_name'] = snap_df['full_name'].apply(lambda x: snap_name_changer(x))
    
    
    snap_df['week#'] = snap_df['week']
    snap_df = snap_df[snap_df['offense_snaps']>0]

    #create a unique identifier for each player so the snap data dataframe can be merged with the main receiver dataframe
    snap_df['last_name'] = snap_df['full_name'].apply(lambda x: x.split(" ",1)[1])
    snap_df['identifier'] = (snap_df['full_name']).str[0]+"."+snap_df['last_name']+"_"+snap_df['team']+"_"+snap_df['week#'].astype(str)
    
    feature_df_snaps = feature_df.reset_index()
    feature_df_snaps.rename(columns={'posteam':'team'},inplace=True)
    feature_df_snaps['szn'] = YEAR
    
    feature_df_snaps = feature_df_snaps.set_index(['receiver_player_id','team','week'])        
    for player_id in tq.tqdm(feature_df_snaps.index):
        player = player_id[0:2]
        team = player[1]
        week_df = pd.DataFrame()
        
        week_df['week_#']=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
        new_df = feature_df_snaps.loc[player].merge(week_df,how='right')
        new_df['target_week'] = new_df['week_#']+1
        new_df['szn'] = new_df['szn'].fillna(YEAR)
        new_df['receiver_player_name'] = new_df['receiver_player_name'].fillna(new_df['receiver_player_name'].mode()[0])
        new_df['posteam'] = team
        
        new_df['identifier'] = (new_df['receiver_player_name']).str[0]+"."+new_df['receiver_player_name']+"_"+new_df['posteam']+"_"+new_df['week_#'].astype(str)
        new_df['last'] = new_df['receiver_player_name'].apply(lambda x: last_name(x))
        new_df['identifier'] = (new_df['receiver_player_name']).str[0]+"."+new_df['last']+"_"+new_df['posteam']+"_"+new_df['week_#'].astype(str)
               
        new_df = new_df.merge(snap_df[['identifier','full_name','game_id','offense_snaps','offense_pct','position']], on='identifier')
            
        #team_attempts = team_attempts.reset_index()
        
        team_attempts.rename(columns={'week':'week_#'},inplace=True)

        new_df = new_df.merge(team_attempts,on=['posteam','week_#'])
        
        new_df.fillna(0,inplace=True)


        #new_df will be the dataframe from trailing week data; 
        #the trailing period will be at a minimum three weeks worth of games and a max of six weeks
        
        
        #a copy of this dataframe will be for the target weeks
        target_df = new_df.copy()
        #print('shape: {}'.format(new_df.shape[0]))
        #if new_df.shape[0]==3:
         #weights = np.array([0.2, 0.3, 0.5])
         #new_df['weighted_xYards'] = new_df['xYards'].rolling(3).apply(lambda x: np.sum(weights*x))
         #new_df['weights'] = weights
         #new_df['unweighted_mean'] = new_df['xYards'].rolling(3,min_periods=3).mean()
         #new_df['unweighted_mean_test'] = new_df['xYards'].rolling(3,min_periods=3).mean()

          #display(new_df[['receiver_player_name','xYards','weighted_xYards','unweighted_mean','weights']])

        #if new_df.shape[0]==4:
         # weights = np.array([0.05,0.15, 0.3, 0.5])
         # new_df['weighted_xYards'] = new_df['xYards'].rolling(4).apply(lambda x: np.sum(weights*x))
          #new_df['weights'] = weights
         # new_df['unweighted_mean'] = new_df['xYards'].rolling(4,min_periods=3).mean()
         # new_df['unweighted_mean1'] = new_df['xYards'].rolling(4,min_periods=3).mean()
          #display(new_df[['receiver_player_name','xYards','weighted_xYards','unweighted_mean','unweighted_mean1']])

        #if new_df.shape[0]==5:
         # weights = np.array([0.05,0.05,0.15, 0.25, 0.5])
          #new_df['weighted_xYards'] = new_df['xYards'].rolling(5).apply(lambda x: np.sum(weights*x))
          #new_df['weights'] = weights
          #new_df['unweighted_mean'] = new_df['xYards'].rolling(5,min_periods=3).mean()
          #new_df['unweighted_mean1'] = new_df['xYards'].rolling(5,min_periods=3).mean()
          #display(new_df[['receiver_player_name','xYards','weighted_xYards','unweighted_mean','unweighted_mean1']])

        if new_df.shape[0]>=6:
          weights = np.array([0.05,0.1,0.1,0.1, 0.2, 0.45])
          new_df['weighted_xYards'] = new_df['xYards'].rolling(6,min_periods=6).apply(lambda x: np.sum(weights*x))
          new_df['weighted_yards'] = new_df['yards_gained'].rolling(6,min_periods=6).apply(lambda x: np.sum(weights*x))

          #new_df['weights'] = weights
          new_df['unweighted_xmean'] = new_df['xYards'].rolling(6,min_periods=6).mean()
          new_df['unweighted_yards_mean'] = new_df['yards_gained'].rolling(6,min_periods=6).mean()

          #display(new_df[['receiver_player_name','xYards','weighted_xYards','unweighted_mean']])

        



        #calculate six-row rolling aggregates for key metrics; each needs at least
        #three rows in the window, except games_played, which counts from the first row
        new_df['xYards/game'] = new_df['xYards'].rolling(6,min_periods=3).mean()
        new_df['xYards_median'] = new_df['xYards'].rolling(6,min_periods=3).median()

        new_df['xYards_sum'] = new_df['xYards'].rolling(6,min_periods=3).sum()
        new_df['games_played'] = new_df['game_id'].rolling(6,min_periods=1).count()
        new_df['Yards_sum'] = new_df['yards_gained'].rolling(6,min_periods=3).sum()
        new_df['Yards/game'] = new_df['yards_gained'].rolling(6,min_periods=3).mean()
        new_df['air_yards_rolling'] = new_df['air_yards'].rolling(6,min_periods=3).sum()
        #fix: roll up team_attempts here; the previous code summed team_air_yards,
        #which made this column a duplicate of team_air_yards_rolling
        new_df['team_attempts_rolling'] = new_df['team_attempts'].rolling(6,min_periods=3).sum()
        new_df['team_air_yards_rolling'] = new_df['team_air_yards'].rolling(6,min_periods=3).sum()
        new_df['complete_pass_rolling'] = new_df['complete_pass'].rolling(6,min_periods=3).sum()
        new_df['cp_rolling'] = new_df['cp'].rolling(6,min_periods=3).sum()
        new_df['yac_rolling'] = new_df['yards_after_catch'].rolling(6,min_periods=3).sum()
        new_df['xyac_rolling'] = new_df['xyac_mean_yardage'].rolling(6,min_periods=3).sum()
        new_df['targets_sum'] = new_df['targets'].rolling(6,min_periods=3).sum()
        new_df['snaps_sum'] = new_df['offense_snaps'].rolling(6,min_periods=3).sum()


        new_df = new_df.sort_values('week_#')
        
        target_df1 = pd.DataFrame()
        target_df1['target_week']=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]

        new_df = new_df.merge(target_df1,how='right').sort_values('target_week')
        new_df = new_df.fillna(method='ffill')

        df_list.append(new_df)
        target_list.append(target_df)
        
trailing_weeks = pd.concat(df_list)   
target_weeks = pd.concat(target_list)   

target_weeks = target_weeks[target_weeks['week_#']>3]
target_weeks = target_weeks.dropna()
target_weeks = target_weeks.drop(columns={'target_week'})
target_weeks.rename(columns={'week_#':'target_week'},inplace=True)


target_weeks = target_weeks.drop_duplicates()
trailing_weeks = trailing_weeks.drop_duplicates()


trailing_weeks = trailing_weeks[['receiver_player_name','posteam','szn','position','target_week','snaps_sum','games_played','xYards','xYards_sum','xYards/game','xYards_median','Yards/game','air_yards_rolling','team_attempts_rolling','team_air_yards_rolling','complete_pass_rolling','cp_rolling','yac_rolling','xyac_rolling','targets_sum','weighted_xYards','unweighted_xmean','unweighted_yards_mean','weighted_yards']]
target_weeks = target_weeks[['receiver_player_name','posteam','target_week','szn','yards_gained','offense_snaps','targets','complete_pass','xYards','cp','yards_after_catch','xyac_mean_yardage']]


target_weeks = target_weeks[target_weeks['target_week']>3]
trailing_weeks.set_index(['receiver_player_name','posteam','target_week','szn'],inplace=True)
target_weeks.set_index(['receiver_player_name','posteam','target_week','szn'],inplace=True)





In [None]:
# Merge trailing and target dataframes on their shared index
# (receiver_player_name, posteam, target_week, szn). Column names present in
# both tables get the default '_x' (trailing) / '_y' (target) suffixes.
merged = pd.merge(trailing_weeks, target_weeks, left_index=True, right_index=True)

# Keep only the first occurrence of any repeated column label.
keep_column = ~merged.columns.duplicated()
merged = merged.loc[:, keep_column]


In [None]:
# Remove exact duplicate rows, then every row that still has a missing value.
merged = merged.drop_duplicates().dropna()


In [None]:
# Spot check: list the columns available for the rows whose first index level
# (receiver_player_name) is 'R.Wayne'.
merged.loc['R.Wayne'].columns

In [None]:
# Create various features.
# Suffix convention: *_x is built from the trailing-window columns, *_y from
# the target-week columns.

targets_x = merged['targets_sum']
targets_y = merged['targets']
receptions_x = merged['complete_pass_rolling']
receptions_y = merged['complete_pass']

# Yards after catch per reception, actual and expected.
merged['yac/rec_x'] = merged['yac_rolling'] / receptions_x
merged['xyac/rec_x'] = merged['xyac_rolling'] / receptions_x

merged['yac/rec_y'] = merged['yards_after_catch'] / receptions_y
merged['xyac/rec_y'] = merged['xyac_mean_yardage'] / receptions_y

# YAC over expected, per reception.
merged['yacoe_x'] = merged['yac/rec_x'] - merged['xyac/rec_x']
merged['yacoe_y'] = merged['yac/rec_y'] - merged['xyac/rec_y']

# Share of team volume (trailing window only).
merged['target_share_x'] = targets_x / merged['team_attempts_rolling']

merged['aDOT_x'] = merged['air_yards_rolling'] / targets_x
merged['AY_share_x'] = merged['air_yards_rolling'] / merged['team_air_yards_rolling']

# Catch rate and catches over expected (raw count, then per target).
merged['catch_rate_x'] = receptions_x / targets_x
merged['catch_rate_y'] = receptions_y / targets_y

merged['catch_over_e_x'] = receptions_x - merged['cp_rolling']
merged['catch_over_e_y'] = receptions_y - merged['cp']

merged['cpoe_x'] = merged['catch_over_e_x'] / targets_x
merged['cpoe_y'] = merged['catch_over_e_y'] / targets_y

# Per-game yardage: the trailing average vs. the single target-week value.
merged['Yards/game_x'] = merged['Yards/game']
merged['Yards/game_y'] = merged['yards_gained']

merged['xYards/game_x'] = merged['xYards/game']
merged['xYards/game_y'] = merged['xYards_y']

merged['targets/game_x'] = targets_x / merged['games_played']
merged['targets/game_y'] = targets_y

# Actual minus expected yards.
merged['Diff_x'] = merged['Yards/game_x'] - merged['xYards/game_x']
merged['Diff_y'] = merged['Yards/game_y'] - merged['xYards/game_y']

In [None]:
# Expected yards per target: trailing window (_x) and target week (_y).
trailing_xyards_per_target = merged['xYards_sum'] / merged['targets_sum']
target_week_xyards_per_target = merged['xYards_y'] / merged['targets']

merged['xYards/target_x'] = trailing_xyards_per_target
merged['xYards/target_y'] = target_week_xyards_per_target


In [None]:
# Display the target-week expected yards per target as a sanity check.
merged['xYards/target_y']

In [None]:
# Correlate each trailing-window metric (*_x) with its target-week
# counterpart (*_y), keyed by the metric name without the suffix.
corr_dict = {}
# NOTE(review): 'catch_rate_x' is listed twice; the second pass only rewrites
# the same key. Confirm whether another metric was intended there.
metric_list = [
    'Yards/game_x', 'xYards/game_x', 'yacoe_x', 'xYards/target_x',
    'catch_rate_x', 'catch_rate_x', 'catch_over_e_x', 'targets/game_x',
    'Diff_x',
]

for metric in metric_list:
    base = metric[:-2]  # strip the trailing '_x'
    corr_dict[base] = round(merged[metric].corr(merged[base + '_y']), 2)

In [None]:
# One row per metric, a single column 'r', strongest correlation first.
corr_table = pd.DataFrame([corr_dict]).transpose()
corr_table = corr_table.rename(columns={0: 'r'})
corr_table.sort_values('r', ascending=False)

As was the case when looking at things from one year to the next, within the season, expected yards are more consistent than actual yardage.

Also, opportunity metrics (xYards/target and target volume) are more consistent than efficiency metrics (catch rate and yards-after-catch stats). This is also evident in how weakly the gap between actual yards and xYards in the trailing-week group correlates with the same gap in the target-week group.



In [None]:
# Rank the trailing-window candidates by their correlation with the
# target-week yardage.
candidate_cols = [
    'weighted_xYards', 'weighted_yards', 'Yards/game_x', 'xYards/game_x',
    'yacoe_x', 'xYards/target_x', 'catch_rate_x', 'Diff_x',
    'catch_over_e_x', 'targets/game_x',
]
yards_corr = merged[candidate_cols + ['yards_gained']].corr()[['yards_gained']]
yards_corr.sort_values('yards_gained', ascending=False)

In [None]:
# Work on a copy so the modelling steps below leave `merged` untouched.
merged_model = merged.copy()

In [None]:
#from sklearn.ensemble import HistGradientBoostingRegressor

Before applying these findings to a model, we should first look at how receiving yardage differs across positions.

In [None]:
def position_handler(pos):
    """Collapse a raw position label into WR, RB, TE, FB or 'other'.

    The four core labels pass through unchanged, 'RB/W' and 'HB' count as
    'RB', 'WR/R' counts as 'WR', and every other value maps to 'other'.
    """
    if pos in ('WR', 'RB', 'TE', 'FB'):
        return pos
    if pos in ('RB/W', 'HB'):
        return 'RB'
    return 'WR' if pos == 'WR/R' else 'other'

In [None]:
# Normalise every raw position label to WR / RB / TE / FB / other.
merged_model['position'] = merged_model['position'].apply(position_handler)

In [None]:
# Print the mean target-week yardage for each position group.
for pos in merged_model['position'].unique():
    positional_model = merged_model.loc[merged_model['position'] == pos]
    yards_per_game = round(positional_model['yards_gained'].mean(), 1)
    print(f'{pos}: {yards_per_game}')



Wide receivers gain much more yardage on average than other positions, so the model should account for this.

Also, since non-WR/TE/RB/FB targets are rare, we should drop them to ensure the model is more generalizable.

In [None]:
# One-hot encode position, then drop the rows outside WR/TE/RB/FB.

# Pin the indicator columns up front (same order get_dummies produces) so a
# category that is absent from the data still gets an all-zero column instead
# of a KeyError on 'other' below or on 'RB'/'WR'/'FB' when modelling.
position_columns = ['FB', 'RB', 'TE', 'WR', 'other']
one_hot = pd.get_dummies(merged_model['position']).reindex(
    columns=position_columns, fill_value=0
)

# one_hot has exactly merged_model's index, so concatenating side by side
# keeps the rows as they are. An index-based join would multiply rows
# whenever the index contains repeated labels.
merged_model = pd.concat(
    [merged_model.drop(columns='position'), one_hot], axis=1
)

# drop non WR/TE/RB/FB
merged_model = merged_model[merged_model['other'] == 0]

In [None]:
# Baseline: linear regression of target-week yards on the trailing expected
# and actual yards per game.
linear_feature_cols = ['xYards/game', 'Yards/game']

# Drop incomplete rows from features and target together. Dropping from X
# alone would leave y with more rows than X and make train_test_split fail
# on the length mismatch.
linear_data = merged_model[linear_feature_cols + ['yards_gained']].dropna()
X = linear_data[linear_feature_cols]
y = linear_data['yards_gained']
linear_features = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
linear_model = LinearRegression().fit(X_train, y_train)

# Built-in round() works whether score() returns a plain float or a NumPy
# scalar; calling .round() on a plain float raises AttributeError.
print('xYards mean:', round(linear_model.score(X_test, y_test), 3))

In [None]:
# Gradient-boosted model: trailing yards per game plus position indicators
# (TE has no indicator column among the features).
# Feature sets tried earlier: ['xYards/game', 'Yards/game'],
# ['xYards/game', 'RB', 'WR', 'FB'], ['xYards_median'].
# tune max bins maybe
model_feature_cols = ['xYards/game', 'Yards/game', 'RB', 'WR', 'FB']

# Drop incomplete rows from features and target together. Dropping from X
# alone would leave y with more rows than X and make train_test_split fail
# on the length mismatch.
model_data = merged_model[model_feature_cols + ['yards_gained']].dropna()
X = model_data[model_feature_cols]
y = model_data['yards_gained']
features = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Depth-1 trees (stumps). Other settings tried: library defaults, and
# learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.5.
in_season_model = GradientBoostingRegressor(
    max_depth=1, min_samples_split=2, n_estimators=100
).fit(X_train, y_train)

# Built-in round() works whether score() returns a plain float or a NumPy
# scalar; calling .round() on a plain float raises AttributeError.
print('xYards mean:', round(in_season_model.score(X_test, y_test), 3))

In [None]:
# Built-in round() works whether score() returns a plain float or a NumPy
# scalar; calling .round() on a plain float raises AttributeError.
print('r^2:', round(in_season_model.score(X_test, y_test), 3))

# Pair each fitted importance with its feature name, most important first.
feature_list = list(X.columns)
feature_df = pd.DataFrame({
    'Feature': feature_list,
    'Importance': in_season_model.feature_importances_,
}).sort_values('Importance', ascending=False)

feature_df

In [None]:
# Persist the fitted boosting model. The context manager closes the file
# handle even if pickling fails; the bare open() never closed it.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(in_season_model, model_file)


In [None]:
# Hyper-parameter grid for the gradient boosting search:
# 6 * 4 * 4 * 4 = 384 combinations.
parameters = dict(
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    subsample=[0.9, 0.5, 0.2, 0.1],
    n_estimators=[100, 500, 1000, 1500],
    max_depth=[4, 6, 8, 10],
)

In [None]:
# Exhaustive search over `parameters` with 2-fold cross-validation, using all
# available cores (n_jobs=-1).
grid_GBR = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=3,
)
grid_GBR.fit(X_train, y_train)

# Report the winning configuration.
search_report = (
    ("\n The best estimator across ALL searched params:\n", grid_GBR.best_estimator_),
    ("\n The best score across ALL searched params:\n", grid_GBR.best_score_),
    ("\n The best parameters across ALL searched params:\n", grid_GBR.best_params_),
)
for heading, found in search_report:
    print(heading, found)

In [None]:
# Test-set score of the best estimator found by the grid search. Built-in
# round() works whether score() returns a plain float or a NumPy scalar;
# calling .round() on a plain float raises AttributeError.
print('xYards mean:', round(grid_GBR.score(X_test, y_test), 3))

In [None]:
# Persist the fitted linear model. The context manager closes the file
# handle even if pickling fails; the bare open() never closed it.
with open('linear_model.pkl', 'wb') as model_file:
    pickle.dump(linear_model, model_file)


In [None]:
# Predicted target-week yards for the held-out test rows.
y_pred = in_season_model.predict(X_test)

In [None]:
import math

# Root-mean-square error of y_pred against the held-out y_test.
residuals = np.subtract(y_pred, y_test)
MSE = np.square(residuals).mean()
RMSE = math.sqrt(MSE)

# NOTE(review): the label says "Betting line", but y_pred comes from
# in_season_model -- confirm the label is intended.
print("Root Mean Square Error - Betting line:")
print(round(RMSE, 1))

In [None]:
# Store the model's predictions next to the features. Predict from the fitted
# feature columns only, so this cell can be re-run after extra columns
# (pYards, yards_gained, error) have been added to X_test.
X_test['pYards'] = in_season_model.predict(X_test[features])







In [None]:
# Attach the actual target-week yards to the test rows.
X_test['yards_gained'] = y_test

# Correlation matrix of the inputs, the prediction and the actual outcome.
summary_cols = ['xYards/game', 'Yards/game', 'RB', 'WR', 'FB', 'pYards', 'yards_gained']
X_test[summary_cols].corr()

In [None]:
# Signed prediction error: positive when the model over-predicts.
X_test['error'] = X_test['pYards']-X_test['yards_gained']

In [None]:
# Correlation of the signed error with the yardage inputs and the prediction.
X_test[['xYards/game','Yards/game','pYards','error']].corr()