In [1]:
# %%
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sns
import pickle



# %%
YEARS = [2018, 2019, 2020, 2021, 2022, 2023,2024]

# %%
data_all = pd.DataFrame()

def calculate_seconds(row):
    if row['qtr'] != 5:
        return 3600 - row['game_seconds_remaining']
    else:
        return 600 - row['game_seconds_remaining'] + 3600


def get_quarter_value(dataf):
    if 'END QUARTER' in dataf['desc']:
        return dataf['level_0']
    else:
        return None

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(i) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data_all = pd.concat([data_all,i_data])

ppr = 1

data = data_all.loc[data_all.season_type=='REG']
#data = data_all.loc[(data_all.play_type.isin(['no_play','pass','run'])) & (data_all.epa.isna()==False)]
#data.loc[data['pass']==1, 'play_type'] = 'pass'
#data.loc[data.rush==1, 'play_type'] = 'run'
data.reset_index(drop=True, inplace=True)
data['turnover'] = data['interception'] + data['fumble_lost']
data = data.dropna(subset=['posteam'])
data['inside_10'] = (data['yardline_100'] < 10).astype(int)
data['20+_play'] = (data['yards_gained'] > 19).astype(int)
data['short_pass'] = (data['air_yards'] < 10).astype(int)
data['medium_pass'] = ((data['air_yards'] > 9)&(data['air_yards']<20)).astype(int)
data['deep_pass'] = (data['air_yards'] > 19).astype(int)
data['end_zone_target'] = (data['yardline_100'] - data['air_yards']) <= 0
data['fantasy_points'] = (
    data['complete_pass'] * ppr +          # 1 point per completion
    data['touchdown'] * 6 +           # 6 points per touchdown
    data['yards_gained'] * 0.1        # 0.1 points per yard gained
)
data['distance_to_EZ_after_target'] = data['yardline_100'] - data['air_yards']


  data['turnover'] = data['interception'] + data['fumble_lost']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['turnover'] = data['interception'] + data['fumble_lost']
  data['inside_10'] = (data['yardline_100'] < 10).astype(int)
  data['20+_play'] = (data['yards_gained'] > 19).astype(int)
  data['short_pass'] = (data['air_yards'] < 10).astype(int)
  data['medium_pass'] = ((data['air_yards'] > 9)&(data['air_yards']<20)).astype(int)
  data['deep_pass'] = (data['air_yards'] > 19).astype(int)
  data['end_zone_target'] = (data['yardline_100'] - data['air_yards']) <= 0
  data['fantasy_points'] = (
  data['distance_to_EZ_after_target'] = data['yardline_100'] - data['air_yards']


In [2]:
def total_finder(home_or_away,home_total,away_total):
    if home_or_away == 'home':
        total = home_total
    else:
        total = away_total 
    return total

In [3]:
    data.reset_index(drop=True, inplace=True)

    data = data[data['two_point_attempt']==0]


    # derive implied team total from betting market data
    data['home_implied_total'] = abs(data['total_line'] / 2 + data['spread_line'] / 2)
    data['away_implied_total'] = abs(data['total_line'] / 2 - data['spread_line'] / 2)

    # Use list comprehension with zip for more efficient row-wise operations
    data['implied_posteam_total'] = [
    total_finder(has_ball, home_number, away_number)
        for has_ball, home_number, away_number in zip(data['posteam_type'], data['home_implied_total'], data['away_implied_total'])
]

    


  data['home_implied_total'] = abs(data['total_line'] / 2 + data['spread_line'] / 2)
  data['away_implied_total'] = abs(data['total_line'] / 2 - data['spread_line'] / 2)
  data['implied_posteam_total'] = [


In [4]:
    
    # we only want throws to a receiver, aka plays with air yardage (no running plays, sacks, throwaways etc.)
    throws = data[data['air_yards'].notna()]
    # only data before the current szn
    throws = throws[throws['season']!=2024]
    throws = throws[throws['receiver_player_name'].notna()]
    throws = throws[throws['pass_location'].notna()]

    
    df = throws[['receiver_player_name','receiver_player_id','posteam','pass','cp','game_id','complete_pass','inside_10','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','fantasy_points','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit','end_zone_target', 'distance_to_EZ_after_target']]


# Season-over-season Comparison

In [5]:
def load_models():
    with open('yardage_model.pkl', 'rb') as file:
        yardage_model = pickle.load(file)
    
    with open('touchdown_model.pkl', 'rb') as file:
        touchdown_model = pickle.load(file)

    return yardage_model, touchdown_model

In [10]:
yardage_model, touchdown_model = load_models()

In [11]:
df['season'].value_counts()

season
2021    18055
2023    17483
2020    17307
2022    17305
2018    17172
2019    17140
Name: count, dtype: int64

In [12]:
new_predictors = [
    'air_yards', 'yardline_100', 'ydstogo',
    'down', 'pass_location', 'season', 'qb_hit', 'end_zone_target', 'distance_to_EZ_after_target'
]

test_df = df[new_predictors]

test_df.head()

Unnamed: 0,air_yards,yardline_100,ydstogo,down,pass_location,season,qb_hit,end_zone_target,distance_to_EZ_after_target
2,8.0,80.0,15,1.0,right,2018,0.0,False,72.0
5,4.0,39.0,10,1.0,right,2018,0.0,False,35.0
6,-3.0,39.0,10,2.0,left,2018,0.0,False,42.0
7,24.0,39.0,10,3.0,left,2018,0.0,False,15.0
10,1.0,1.0,1,3.0,right,2018,0.0,True,0.0


In [13]:
test_df = pd.get_dummies(test_df, columns=['pass_location'], drop_first=True)


# Add predictions to the new dataset (optional)
df['xYards'] = yardage_model.predict(test_df)
df['xTDs'] = touchdown_model.predict(test_df)
#df['xFPs'] = best_model.predict(test_df)
df['compositeXFP'] = df['cp'] * ppr + df['xTDs'] * 6 + df['xYards'] * 0.1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xYards'] = yardage_model.predict(test_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xTDs'] = touchdown_model.predict(test_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['compositeXFP'] = df['cp'] * ppr + df['xTDs'] * 6 + df['xYards'] * 0.1


In [14]:
df[['compositeXFP','fantasy_points']].corr()

Unnamed: 0,compositeXFP,fantasy_points
compositeXFP,1.0,0.115942
fantasy_points,0.115942,1.0


In [16]:
receivers = df.groupby(['receiver_player_id','posteam','season']).agg({'receiver_player_name':'max','pass':'sum','xTDs':'sum','pass_touchdown':'sum','xYards':'sum','yards_gained':'sum','cp':'sum','complete_pass':'sum','fantasy_points':'sum','compositeXFP':'sum'}).sort_values('compositeXFP',ascending=False)

In [17]:
receivers

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,complete_pass,fantasy_points,compositeXFP
receiver_player_id,posteam,season,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00-0032765,NO,2019,M.Thomas,185,3,9.0,1457.892822,1725.0,128.447433,149.0,375.5,292.236724
00-0033908,LA,2021,C.Kupp,191,3,16.0,1441.202759,1947.0,129.081419,145.0,441.7,291.201697
00-0033040,MIA,2023,T.Hill,171,5,13.0,1470.026978,1799.0,107.142703,119.0,382.9,284.145402
00-0031381,LV,2022,D.Adams,180,3,14.0,1525.546143,1516.0,112.967685,100.0,341.6,283.522297
00-0033040,MIA,2022,T.Hill,170,2,7.0,1577.953247,1710.0,107.030899,119.0,332.0,276.826224
...,...,...,...,...,...,...,...,...,...,...,...,...
00-0035704,DEN,2021,D.Lock,1,0,0.0,-0.447261,1.0,0.562315,1.0,1.1,0.517589
00-0033057,CLE,2022,J.Conklin,1,0,0.0,0.462784,0.0,0.454621,0.0,0.0,0.500900
00-0033537,HOU,2019,D.Watson,1,0,1.0,1.134666,6.0,0.355940,1.0,7.6,0.469407
00-0036386,NYG,2021,A.Thomas,1,0,1.0,0.743284,2.0,0.368205,1.0,7.2,0.442533


In [18]:
receivers_qual = receivers[receivers['pass']>=20]

In [19]:
receivers

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,complete_pass,fantasy_points,compositeXFP
receiver_player_id,posteam,season,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00-0032765,NO,2019,M.Thomas,185,3,9.0,1457.892822,1725.0,128.447433,149.0,375.5,292.236724
00-0033908,LA,2021,C.Kupp,191,3,16.0,1441.202759,1947.0,129.081419,145.0,441.7,291.201697
00-0033040,MIA,2023,T.Hill,171,5,13.0,1470.026978,1799.0,107.142703,119.0,382.9,284.145402
00-0031381,LV,2022,D.Adams,180,3,14.0,1525.546143,1516.0,112.967685,100.0,341.6,283.522297
00-0033040,MIA,2022,T.Hill,170,2,7.0,1577.953247,1710.0,107.030899,119.0,332.0,276.826224
...,...,...,...,...,...,...,...,...,...,...,...,...
00-0035704,DEN,2021,D.Lock,1,0,0.0,-0.447261,1.0,0.562315,1.0,1.1,0.517589
00-0033057,CLE,2022,J.Conklin,1,0,0.0,0.462784,0.0,0.454621,0.0,0.0,0.500900
00-0033537,HOU,2019,D.Watson,1,0,1.0,1.134666,6.0,0.355940,1.0,7.6,0.469407
00-0036386,NYG,2021,A.Thomas,1,0,1.0,0.743284,2.0,0.368205,1.0,7.2,0.442533


In [20]:
receivers_qual.reset_index().sort_values(['receiver_player_id', 'season'])

Unnamed: 0,receiver_player_id,posteam,season,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,complete_pass,fantasy_points,compositeXFP
892,00-0021547,LAC,2018,A.Gates,45,1,2.0,310.046295,333.0,27.173962,28.0,79.3,64.178593
382,00-0022127,DAL,2019,J.Witten,83,1,4.0,570.964417,529.0,57.494382,63.0,139.9,120.590826
188,00-0022921,ARI,2018,L.Fitzgerald,112,0,6.0,880.963379,734.0,72.544649,69.0,178.4,160.640988
176,00-0022921,ARI,2019,L.Fitzgerald,107,2,4.0,798.765869,835.0,73.160045,73.0,192.5,165.036634
468,00-0022921,ARI,2020,L.Fitzgerald,72,0,1.0,524.440125,409.0,52.608162,54.0,100.9,105.052176
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21,00-0039075,LA,2023,P.Nacua,160,1,6.0,1289.263794,1486.0,108.953619,105.0,289.6,243.879994
549,00-0039139,DET,2023,J.Gibbs,71,0,1.0,381.663513,316.0,56.332290,52.0,95.6,94.498642
860,00-0039144,GB,2023,L.Musgrave,46,0,1.0,345.465942,352.0,31.674964,34.0,75.2,66.221560
201,00-0039146,GB,2023,J.Reed,94,4,8.0,734.636780,793.0,59.541225,64.0,191.3,157.004907


In [23]:
df = receivers_qual.reset_index().sort_values(['receiver_player_id', 'season'])

# Create lagged columns for the next season's stats
for col in ['xTDs', 'pass_touchdown', 'xYards', 'yards_gained', 'cp', 'complete_pass','fantasy_points', 'compositeXFP']:
    df[f'{col}_next'] = df.groupby('receiver_player_id')[col].shift(-1)

# Drop rows where the next season's data is missing (last season for each player)
df_lagged = df.dropna(subset=[f'{col}_next' for col in ['xTDs', 'pass_touchdown', 'xYards', 'yards_gained', 'cp', 'complete_pass','fantasy_points', 'compositeXFP']])

In [24]:
df.head(20)

Unnamed: 0,receiver_player_id,posteam,season,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,...,fantasy_points,compositeXFP,xTDs_next,pass_touchdown_next,xYards_next,yards_gained_next,cp_next,complete_pass_next,fantasy_points_next,compositeXFP_next
892,00-0021547,LAC,2018,A.Gates,45,1,2.0,310.046295,333.0,27.173962,...,79.3,64.178593,,,,,,,,
382,00-0022127,DAL,2019,J.Witten,83,1,4.0,570.964417,529.0,57.494382,...,139.9,120.590826,,,,,,,,
188,00-0022921,ARI,2018,L.Fitzgerald,112,0,6.0,880.963379,734.0,72.544649,...,178.4,160.640988,2.0,4.0,798.765869,835.0,73.160045,73.0,192.5,165.036634
176,00-0022921,ARI,2019,L.Fitzgerald,107,2,4.0,798.765869,835.0,73.160045,...,192.5,165.036634,0.0,1.0,524.440125,409.0,52.608162,54.0,100.9,105.052176
468,00-0022921,ARI,2020,L.Fitzgerald,72,0,1.0,524.440125,409.0,52.608162,...,100.9,105.052176,,,,,,,,
599,00-0022943,NO,2018,B.Watson,46,4,2.0,365.639679,400.0,29.137302,...,87.0,89.70127,0.0,0.0,176.1409,173.0,15.45368,17.0,34.3,33.067771
1412,00-0022943,NE,2019,B.Watson,24,0,0.0,176.1409,173.0,15.45368,...,34.3,33.067771,,,,,,,,
1350,00-0023564,PHI,2018,D.Sproles,23,1,2.0,124.240898,160.0,17.396862,...,43.0,35.820953,,,,,,,,
1046,00-0024221,WAS,2018,V.Davis,36,0,2.0,312.33725,367.0,22.39595,...,73.7,53.629675,,,,,,,,
1238,00-0024243,GB,2021,M.Lewis,28,1,0.0,160.565704,214.0,19.881775,...,44.4,41.938345,,,,,,,,


In [25]:
df_lagged

Unnamed: 0,receiver_player_id,posteam,season,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,...,fantasy_points,compositeXFP,xTDs_next,pass_touchdown_next,xYards_next,yards_gained_next,cp_next,complete_pass_next,fantasy_points_next,compositeXFP_next
188,00-0022921,ARI,2018,L.Fitzgerald,112,0,6.0,880.963379,734.0,72.544649,...,178.4,160.640988,2.0,4.0,798.765869,835.0,73.160045,73.0,192.5,165.036634
176,00-0022921,ARI,2019,L.Fitzgerald,107,2,4.0,798.765869,835.0,73.160045,...,192.5,165.036634,0.0,1.0,524.440125,409.0,52.608162,54.0,100.9,105.052176
599,00-0022943,NO,2018,B.Watson,46,4,2.0,365.639679,400.0,29.137302,...,87.0,89.701270,0.0,0.0,176.140900,173.0,15.453680,17.0,34.3,33.067771
1379,00-0025394,WAS,2018,A.Peterson,26,0,1.0,143.061844,208.0,20.414077,...,46.8,34.720262,0.0,0.0,109.799454,142.0,17.924322,17.0,31.2,28.904267
1168,00-0025396,NO,2018,T.Ginn,30,0,2.0,283.107178,209.0,17.385297,...,49.9,45.696015,0.0,2.0,499.710510,421.0,32.226284,30.0,84.1,82.197335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,00-0038117,NYG,2022,W.Robinson,31,0,1.0,224.932175,227.0,22.078208,...,51.7,44.571426,0.0,1.0,527.911743,525.0,56.055348,60.0,118.5,108.846524
1251,00-0038120,NYJ,2022,Br.Hall,31,0,1.0,192.434555,218.0,21.799425,...,46.8,41.042882,0.0,4.0,498.789948,591.0,73.959531,76.0,159.1,123.838526
553,00-0038124,GB,2022,C.Watson,66,0,7.0,560.143677,611.0,38.034695,...,144.1,94.049062,2.0,5.0,439.387939,422.0,28.629502,28.0,100.2,84.568298
457,00-0038129,TB,2022,C.Otton,65,3,2.0,437.800903,391.0,45.103641,...,93.1,106.883731,0.0,4.0,453.356232,452.0,46.431071,47.0,116.2,91.766693


In [26]:
df_lagged.tail(11)

Unnamed: 0,receiver_player_id,posteam,season,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,...,fantasy_points,compositeXFP,xTDs_next,pass_touchdown_next,xYards_next,yards_gained_next,cp_next,complete_pass_next,fantasy_points_next,compositeXFP_next
511,00-0037816,GB,2022,R.Doubs,67,0,3.0,541.177368,425.0,44.964937,...,102.5,99.082675,1.0,8.0,774.804138,674.0,57.829159,59.0,174.4,141.309576
673,00-0037838,BAL,2022,I.Likely,60,0,3.0,429.49295,373.0,40.330922,...,91.3,83.280217,0.0,5.0,284.859711,411.0,27.930377,30.0,101.1,56.41635
1326,00-0038041,DAL,2022,J.Ferguson,22,1,2.0,140.835526,174.0,16.795957,...,48.4,36.87951,4.0,5.0,647.428467,761.0,71.300789,71.0,177.1,160.043635
1102,00-0038090,KC,2022,S.Moore,33,0,0.0,265.516174,250.0,23.407232,...,47.0,49.958851,0.0,1.0,288.43399,244.0,24.452009,21.0,51.4,53.295409
703,00-0038104,NE,2022,T.Thornton,45,2,2.0,401.870117,247.0,26.600075,...,58.7,78.787086,0.0,0.0,184.883804,91.0,15.103351,13.0,22.1,33.591732
1141,00-0038115,NYG,2022,D.Bellinger,35,0,2.0,223.158844,268.0,24.952157,...,68.8,47.268042,0.0,0.0,197.065216,255.0,21.681777,25.0,50.5,41.388299
1191,00-0038117,NYG,2022,W.Robinson,31,0,1.0,224.932175,227.0,22.078208,...,51.7,44.571426,0.0,1.0,527.911743,525.0,56.055348,60.0,118.5,108.846524
1251,00-0038120,NYJ,2022,Br.Hall,31,0,1.0,192.434555,218.0,21.799425,...,46.8,41.042882,0.0,4.0,498.789948,591.0,73.959531,76.0,159.1,123.838526
553,00-0038124,GB,2022,C.Watson,66,0,7.0,560.143677,611.0,38.034695,...,144.1,94.049062,2.0,5.0,439.387939,422.0,28.629502,28.0,100.2,84.568298
457,00-0038129,TB,2022,C.Otton,65,3,2.0,437.800903,391.0,45.103641,...,93.1,106.883731,0.0,4.0,453.356232,452.0,46.431071,47.0,116.2,91.766693


In [27]:
correlations = {}

# Loop through each stat to calculate its correlation with the following season
for col in ['xTDs', 'pass_touchdown', 'xYards', 'yards_gained', 'cp', 'complete_pass', 'fantasy_points', 'compositeXFP']:
    correlation = df_lagged[col].corr(df_lagged[f'{col}_next'])
    #print(df_lagged)
    correlations[col] = correlation

# Display the results
correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation with Next Season'])
print(correlations_df)

                Correlation with Next Season
xTDs                                0.265338
pass_touchdown                      0.487664
xYards                              0.712754
yards_gained                        0.687642
cp                                  0.611072
complete_pass                       0.620011
fantasy_points                      0.662647
compositeXFP                        0.667255


In [28]:
#correlations_df.drop(['xFPs'],inplace=True)

In [29]:
correlations = {}

# Loop through each stat to calculate its correlation with the following season
for metric in ['pass_touchdown', 'yards_gained', 'complete_pass','fantasy_points']:
    if metric == 'pass_touchdown':
        predictor = 'xTDs'
    if metric == 'yards_gained':
        predictor = 'xYards'
    if metric == 'complete_pass':
        predictor = 'cp'
    if metric == 'fantasy_points':
        predictor = 'compositeXFP'
    #print(f"{predictor} to predict {metric}")
    correlation = df_lagged[predictor].corr(df_lagged[f'{metric}_next'])
    correlations[metric] = correlation

# Display the results
x_stat_correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Expected Stat Correlation with Actual Next Season Target'])
print(x_stat_correlations_df)

                Expected Stat Correlation with Actual Next Season Target
pass_touchdown                                           0.272887       
yards_gained                                             0.679533       
complete_pass                                            0.604711       
fantasy_points                                           0.640119       


In [30]:
correlations_df.merge(x_stat_correlations_df,right_index=True,left_index=True,how='outer').sort_values('Correlation with Next Season',ascending=False)

Unnamed: 0,Correlation with Next Season,Expected Stat Correlation with Actual Next Season Target
xYards,0.712754,
yards_gained,0.687642,0.679533
compositeXFP,0.667255,
fantasy_points,0.662647,0.640119
complete_pass,0.620011,0.604711
cp,0.611072,
pass_touchdown,0.487664,0.272887
xTDs,0.265338,


In [31]:
df_lagged[['xTDs','pass_touchdown','pass_touchdown_next']].corr()

Unnamed: 0,xTDs,pass_touchdown,pass_touchdown_next
xTDs,1.0,0.495539,0.272887
pass_touchdown,0.495539,1.0,0.487664
pass_touchdown_next,0.272887,0.487664,1.0


In [32]:
df_lagged[['xYards','yards_gained','yards_gained_next']].corr()

Unnamed: 0,xYards,yards_gained,yards_gained_next
xYards,1.0,0.962869,0.679533
yards_gained,0.962869,1.0,0.687642
yards_gained_next,0.679533,0.687642,1.0


In [33]:
df_lagged

Unnamed: 0,receiver_player_id,posteam,season,receiver_player_name,pass,xTDs,pass_touchdown,xYards,yards_gained,cp,...,fantasy_points,compositeXFP,xTDs_next,pass_touchdown_next,xYards_next,yards_gained_next,cp_next,complete_pass_next,fantasy_points_next,compositeXFP_next
188,00-0022921,ARI,2018,L.Fitzgerald,112,0,6.0,880.963379,734.0,72.544649,...,178.4,160.640988,2.0,4.0,798.765869,835.0,73.160045,73.0,192.5,165.036634
176,00-0022921,ARI,2019,L.Fitzgerald,107,2,4.0,798.765869,835.0,73.160045,...,192.5,165.036634,0.0,1.0,524.440125,409.0,52.608162,54.0,100.9,105.052176
599,00-0022943,NO,2018,B.Watson,46,4,2.0,365.639679,400.0,29.137302,...,87.0,89.701270,0.0,0.0,176.140900,173.0,15.453680,17.0,34.3,33.067771
1379,00-0025394,WAS,2018,A.Peterson,26,0,1.0,143.061844,208.0,20.414077,...,46.8,34.720262,0.0,0.0,109.799454,142.0,17.924322,17.0,31.2,28.904267
1168,00-0025396,NO,2018,T.Ginn,30,0,2.0,283.107178,209.0,17.385297,...,49.9,45.696015,0.0,2.0,499.710510,421.0,32.226284,30.0,84.1,82.197335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,00-0038117,NYG,2022,W.Robinson,31,0,1.0,224.932175,227.0,22.078208,...,51.7,44.571426,0.0,1.0,527.911743,525.0,56.055348,60.0,118.5,108.846524
1251,00-0038120,NYJ,2022,Br.Hall,31,0,1.0,192.434555,218.0,21.799425,...,46.8,41.042882,0.0,4.0,498.789948,591.0,73.959531,76.0,159.1,123.838526
553,00-0038124,GB,2022,C.Watson,66,0,7.0,560.143677,611.0,38.034695,...,144.1,94.049062,2.0,5.0,439.387939,422.0,28.629502,28.0,100.2,84.568298
457,00-0038129,TB,2022,C.Otton,65,3,2.0,437.800903,391.0,45.103641,...,93.1,106.883731,0.0,4.0,453.356232,452.0,46.431071,47.0,116.2,91.766693


# Intra-season Comparison

In [34]:
    # we only want throws to a receiver, aka plays with air yardage (no running plays, sacks, throwaways etc.)
    throws = data[data['air_yards'].notna()]
    # only data before the current szn
    throws = throws[throws['season']!=2024]
    throws = throws[throws['receiver_player_name'].notna()]
    throws = throws[throws['pass_location'].notna()]

    
    df = throws[['receiver_player_name','receiver_player_id','posteam','pass','cp','game_id','complete_pass','inside_10','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','fantasy_points','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit','end_zone_target', 'distance_to_EZ_after_target']]


In [35]:
new_predictors = [
    'air_yards', 'yardline_100', 'ydstogo',
    'down', 'pass_location', 'season', 'qb_hit', 'end_zone_target', 'distance_to_EZ_after_target'
]

test_df = df[new_predictors]

test_df.head()

Unnamed: 0,air_yards,yardline_100,ydstogo,down,pass_location,season,qb_hit,end_zone_target,distance_to_EZ_after_target
2,8.0,80.0,15,1.0,right,2018,0.0,False,72.0
5,4.0,39.0,10,1.0,right,2018,0.0,False,35.0
6,-3.0,39.0,10,2.0,left,2018,0.0,False,42.0
7,24.0,39.0,10,3.0,left,2018,0.0,False,15.0
10,1.0,1.0,1,3.0,right,2018,0.0,True,0.0


In [36]:
    
    # we only want throws to a receiver, aka plays with air yardage (no running plays, sacks, throwaways etc.)
    throws = data[data['air_yards'].notna()]
    # only data before the current szn
    throws = throws[throws['season']!=2024]
    throws = throws[throws['receiver_player_name'].notna()]
    throws = throws[throws['pass_location'].notna()]

    
    df = throws[['receiver_player_name','receiver_player_id','posteam','pass','cp','game_id','complete_pass','inside_10','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','fantasy_points','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit','end_zone_target', 'distance_to_EZ_after_target']]


In [37]:
test_df = pd.get_dummies(test_df, columns=['pass_location'], drop_first=True)


# Add predictions to the new dataset (optional)
df['xYards'] = yardage_model.predict(test_df)
df['xTDs'] = touchdown_model.predict(test_df)
#df['xFPs'] = best_model.predict(test_df)
df['compositeXFP'] = df['cp'] * ppr + df['xTDs'] * 6 + df['xYards'] * 0.1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xYards'] = yardage_model.predict(test_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['xTDs'] = touchdown_model.predict(test_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['compositeXFP'] = df['cp'] * ppr + df['xTDs'] * 6 + df['xYards'] * 0.1


In [38]:

# Filter weeks 1-9 and weeks 10-18
first_half = df[df['week'].between(1, 9)]
second_half = df[df['week'].between(10, 18)]

# Define metrics to aggregate
metrics = ['pass','xTDs', 'pass_touchdown', 'xYards', 'yards_gained', 'cp', 'complete_pass',  'fantasy_points', 'compositeXFP']

# Aggregate metrics for each player-season for weeks 1-9
first_half_agg = first_half.groupby(['receiver_player_id', 'season'])[metrics].sum()
first_half_agg = first_half_agg.add_suffix('_wk1_9')

# Aggregate metrics for each player-season for weeks 10-18
second_half_agg = second_half.groupby(['receiver_player_id', 'season'])[metrics].sum()
second_half_agg = second_half_agg.add_suffix('_wk10_18')

# Merge the two aggregated DataFrames
df_aggregated = pd.merge(first_half_agg, second_half_agg, left_index=True, right_index=True)


In [39]:
df_aggregated = df_aggregated[(df_aggregated['pass_wk1_9'] >= 5) & (df_aggregated['pass_wk10_18'] >= 5)]


In [40]:
# Create a dictionary to store correlations for each metric
correlations = {}

# Calculate correlation for each metric between weeks 1-9 and weeks 10-18
for metric in metrics:
    correlation = df_aggregated[f'{metric}_wk1_9'].corr(df_aggregated[f'{metric}_wk10_18'])
    correlations[metric] = correlation

# Display the correlation results
correlations_df = pd.DataFrame.from_dict(correlations, orient='index', columns=['Correlation between Weeks 1-9 and 10-18'])
print(correlations_df)


                Correlation between Weeks 1-9 and 10-18
pass                                           0.712226
xTDs                                           0.128900
pass_touchdown                                 0.420054
xYards                                         0.751190
yards_gained                                   0.711520
cp                                             0.687478
complete_pass                                  0.686201
fantasy_points                                 0.688007
compositeXFP                                   0.713632


In [41]:
df_aggregated[['fantasy_points_wk1_9','compositeXFP_wk1_9','fantasy_points_wk10_18']].corr()

Unnamed: 0,fantasy_points_wk1_9,compositeXFP_wk1_9,fantasy_points_wk10_18
fantasy_points_wk1_9,1.0,0.950179,0.688007
compositeXFP_wk1_9,0.950179,1.0,0.682733
fantasy_points_wk10_18,0.688007,0.682733,1.0


In [42]:
df_aggregated[['yards_gained_wk1_9','xYards_wk1_9','yards_gained_wk10_18']].corr()

Unnamed: 0,yards_gained_wk1_9,xYards_wk1_9,yards_gained_wk10_18
yards_gained_wk1_9,1.0,0.952486,0.71152
xYards_wk1_9,0.952486,1.0,0.705179
yards_gained_wk10_18,0.71152,0.705179,1.0


In [43]:
df_aggregated[['pass_touchdown_wk1_9','xTDs_wk1_9','pass_touchdown_wk10_18']].corr()

Unnamed: 0,pass_touchdown_wk1_9,xTDs_wk1_9,pass_touchdown_wk10_18
pass_touchdown_wk1_9,1.0,0.453297,0.420054
xTDs_wk1_9,0.453297,1.0,0.221686
pass_touchdown_wk10_18,0.420054,0.221686,1.0
