In [209]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from typing import Tuple, List
import pickle
import os

In [210]:
parent_dir = os.path.dirname(os.getcwd())  # Get parent of current working directory

def load_models():

    model_path = os.path.join(parent_dir, 'yardage_model.pkl')
    with open(model_path, 'rb') as f:
        yardage_model = pickle.load(f)
    
    model_path = os.path.join(parent_dir, 'touchdown_model.pkl')
    with open(model_path, 'rb') as f:
        touchdown_model = pickle.load(f)

    return yardage_model, touchdown_model



In [211]:
YEARS = [2024]

# %%
data_all = pd.DataFrame()

def calculate_seconds(row):
    if row['qtr'] != 5:
        return 3600 - row['game_seconds_remaining']
    else:
        return 600 - row['game_seconds_remaining'] + 3600


def get_quarter_value(dataf):
    if 'END QUARTER' in dataf['desc']:
        return dataf['level_0']
    else:
        return None

for i in YEARS:  
    i_data = pd.read_csv('https://github.com/nflverse/nflverse-data/releases/download/pbp/' \
                   'play_by_play_' + str(i) + '.csv.gz',
                   compression= 'gzip', low_memory= False)

    data_all = pd.concat([data_all,i_data])

ppr = 1

data = data_all.loc[data_all.season_type=='REG']
#data = data_all.loc[(data_all.play_type.isin(['no_play','pass','run'])) & (data_all.epa.isna()==False)]
#data.loc[data['pass']==1, 'play_type'] = 'pass'
#data.loc[data.rush==1, 'play_type'] = 'run'
data.reset_index(drop=True, inplace=True)
data['turnover'] = data['interception'] + data['fumble_lost']
data = data.dropna(subset=['posteam'])
data['inside_10'] = (data['yardline_100'] < 10).astype(int)
data['20+_play'] = (data['yards_gained'] > 19).astype(int)
data['short_pass'] = (data['air_yards'] < 10).astype(int)
data['medium_pass'] = ((data['air_yards'] > 9)&(data['air_yards']<20)).astype(int)
data['deep_pass'] = (data['air_yards'] > 19).astype(int)
data['end_zone_target'] = (data['yardline_100'] - data['air_yards']) <= 0
data['fantasy_points'] = (
    data['complete_pass'] * ppr +          # 1 point per completion
    data['touchdown'] * 6 +           # 6 points per touchdown
    data['yards_gained'] * 0.1        # 0.1 points per yard gained
)
data['distance_to_EZ_after_target'] = data['yardline_100'] - data['air_yards']

  data['turnover'] = data['interception'] + data['fumble_lost']
  data['inside_10'] = (data['yardline_100'] < 10).astype(int)
  data['20+_play'] = (data['yards_gained'] > 19).astype(int)
  data['short_pass'] = (data['air_yards'] < 10).astype(int)
  data['medium_pass'] = ((data['air_yards'] > 9)&(data['air_yards']<20)).astype(int)
  data['deep_pass'] = (data['air_yards'] > 19).astype(int)
  data['end_zone_target'] = (data['yardline_100'] - data['air_yards']) <= 0
  data['fantasy_points'] = (
  data['distance_to_EZ_after_target'] = data['yardline_100'] - data['air_yards']


In [212]:
def total_finder(home_or_away,home_total,away_total):
    if home_or_away == 'home':
        total = home_total
    else:
        total = away_total 
    return total

In [213]:
    data.reset_index(drop=True, inplace=True)

    data = data[data['two_point_attempt']==0]

    data['total_plays'] = data['pass'] + data['rush']

    # derive implied team total from betting market data
    data['home_implied_total'] = abs(data['total_line'] / 2 + data['spread_line'] / 2)
    data['away_implied_total'] = abs(data['total_line'] / 2 - data['spread_line'] / 2)
    data = data[(data['play_type']=='pass')|(data['play_type']=='run')]
    # Use list comprehension with zip for more efficient row-wise operations
    data['implied_posteam_total'] = [
    total_finder(has_ball, home_number, away_number)
        for has_ball, home_number, away_number in zip(data['posteam_type'], data['home_implied_total'], data['away_implied_total'])
]


  data['total_plays'] = data['pass'] + data['rush']
  data['home_implied_total'] = abs(data['total_line'] / 2 + data['spread_line'] / 2)
  data['away_implied_total'] = abs(data['total_line'] / 2 - data['spread_line'] / 2)
  data['implied_posteam_total'] = [


In [214]:
    
    # we only want throws to a receiver, aka plays with air yardage (no running plays, sacks, throwaways etc.)
    throws = data[data['air_yards'].notna()]

    throws = throws[throws['receiver_player_name'].notna()]
    throws = throws[throws['pass_location'].notna()]

    
    df = throws[['receiver_player_name','receiver_player_id','posteam','pass','cp','game_id','complete_pass','inside_10','air_yards','yardline_100','ydstogo','implied_posteam_total','yards_gained','fantasy_points','pass_touchdown','down','pass_location','week','season','home_implied_total','away_implied_total','posteam_type','qb_hit','end_zone_target', 'distance_to_EZ_after_target']]


In [215]:
data[data['posteam']=='NYG']['week'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [216]:
data['play_type'].value_counts()

play_type
pass    10396
run      8017
Name: count, dtype: int64

In [217]:
sample = data[data['week']>5].groupby('posteam').agg({'pass':'mean','total_plays':'sum','pass_oe':'mean','game_id':'nunique'})


sample = data[data['week']>5].groupby('posteam').agg(
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    plays=('total_plays', 'sum'),
    game_id = ('game_id','nunique'))


sample['trailing_total_plays_avg'] = sample['plays']/sample['game_id']
sample['pass_total'] = sample['pass_total']/sample['game_id']

sample = sample.rename(columns={'pass_rate':'trailing_pass_avg','pass_total':'trailing_pass_total','pass_oe':'trailing_pass_oe_avg'})



sample[['trailing_pass_total','trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg']]


Unnamed: 0_level_0,trailing_pass_total,trailing_pass_avg,trailing_pass_oe_avg,trailing_total_plays_avg
posteam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ARI,30.0,0.531915,-4.830742,56.4
ATL,33.6,0.525,-7.856446,64.0
BAL,31.2,0.543554,-4.858697,57.4
BUF,36.6,0.601974,2.243954,60.8
CAR,32.0,0.592593,-6.083002,54.0
CHI,39.25,0.638211,-4.034352,61.5
CIN,40.6,0.674419,8.355236,60.2
CLE,46.25,0.682657,1.784771,67.75
DAL,43.75,0.694444,-2.515532,63.0
DEN,38.2,0.62623,0.066836,61.0


In [218]:
def predict_columns(data, yardage_model, touchdown_model):
    new_predictors = [
        'air_yards', 'yardline_100', 'ydstogo',
        'down', 'pass_location', 'season', 'qb_hit', 'end_zone_target', 'distance_to_EZ_after_target'
    ]
    
    new_X = data[new_predictors]
    new_X = pd.get_dummies(new_X, columns=['pass_location'], drop_first=True)
    
    # Perform predictions
    predictions = {
        'xYards': yardage_model.predict(new_X),
        'xTDs': touchdown_model.predict(new_X),
        'xFPs': (yardage_model.predict(new_X) * 0.1) + (touchdown_model.predict(new_X) * 6) + data['cp']
    }
    
    return pd.DataFrame(predictions)

In [219]:
yardage_model, touchdown_model = load_models()

In [220]:
with open('pass_volume_model.pkl', 'rb') as file:
    pass_volume_model = pickle.load(file)

In [221]:
def df_creator(team,spread,total):
    team_df = sample[sample.index==team]
    team_total = total/2 - spread/2


    team_df['total_line'] = total
    team_df['pos_team_total'] = team_total
    team_df['pos_spread'] = spread

    return team_df






In [222]:
chosen_team = 'MIA'

receiver_name = 'D.Achane'


team_df = df_creator(chosen_team,-2,49)

team_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['total_line'] = total
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['pos_team_total'] = team_total
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team_df['pos_spread'] = spread


Unnamed: 0_level_0,trailing_pass_total,trailing_pass_avg,trailing_pass_oe_avg,plays,game_id,trailing_total_plays_avg,total_line,pos_team_total,pos_spread
posteam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MIA,33.666667,0.531579,-8.981223,190,3,63.333333,49,25.5,-2


In [223]:
data[data['posteam']==chosen_team].groupby(['posteam','week']).agg({'pass':'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,pass
posteam,week,Unnamed: 2_level_1
MIA,1,41
MIA,2,44
MIA,3,39
MIA,4,26
MIA,5,36
MIA,7,31
MIA,8,40
MIA,9,30


In [224]:
data[data['posteam']==chosen_team].groupby(['posteam','week']).agg({'pass':'sum'})['pass'].mean()

35.875

In [225]:
predicted_attempts = pass_volume_model.predict(team_df[['trailing_pass_total','trailing_pass_avg','trailing_pass_oe_avg','trailing_total_plays_avg','total_line','pos_team_total','pos_spread']])[0]

predicted_attempts

37.043674

In [226]:
new_columns_current = predict_columns(df, yardage_model, touchdown_model)
current_szn = pd.concat([df, new_columns_current], axis=1)

In [227]:

xYardsmean = current_szn[current_szn['receiver_player_name']==receiver_name]['xYards'].mean()

xYards_sd = current_szn[current_szn['receiver_player_name']==receiver_name]['xYards'].std()

print(f'xYards mean: {xYardsmean.round(1)}; xYards SD: {xYards_sd.round(1)}')


xYards mean: 5.099999904632568; xYards SD: 1.7000000476837158


In [228]:
team_period = current_szn[(current_szn['posteam']==chosen_team)&(current_szn['week']<5)].groupby('receiver_player_name').agg({'pass':'sum','xYards':'sum','game_id':'nunique','yards_gained':'sum'})

excluded_receiver1 = 'M.Pittman'

excluded_receiver2 = ''



team_targets = team_period[team_period.index!=excluded_receiver1]

team_targets = team_targets[team_targets.index!=excluded_receiver2]['pass'].sum()


team_period['target_share'] = team_period['pass']/team_targets

team_period['xYards_game'] = team_period['xYards']/team_period['game_id']

team_period['yards_game'] = team_period['yards_gained']/team_period['game_id']


team_period.round(2).sort_values('xYards_game',ascending=False)[['game_id','pass','target_share','xYards','xYards_game','yards_game']]



Unnamed: 0_level_0,game_id,pass,target_share,xYards,xYards_game,yards_game
receiver_player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
T.Hill,4,30,0.24,269.920013,67.48,54.25
J.Waddle,4,20,0.16,156.669998,39.17,53.0
G.DuBose,1,3,0.02,33.360001,33.36,13.0
R.Chosen,1,4,0.03,28.530001,28.53,5.0
D.Achane,4,22,0.18,112.580002,28.15,46.75
J.Smith,4,13,0.1,81.160004,20.29,19.5
R.Mostert,1,3,0.02,17.209999,17.21,10.0
E.Ezukanma,1,2,0.02,16.309999,16.31,0.0
T.Conner,2,4,0.03,30.299999,15.15,8.0
D.Eskridge,1,1,0.01,12.15,12.15,30.0


In [229]:
team_period['target_share'].sum()

1.0

In [230]:
rec_target_share = team_period[team_period.index == receiver_name]['target_share'].values[0]

rec_target_share

0.176

In [231]:
rec_df = current_szn[(current_szn['receiver_player_name']==receiver_name)&(current_szn['posteam']==chosen_team)].groupby('week').agg({'pass':'sum','xYards':'sum','yards_gained':'sum'}).round(1)

rec_df

Unnamed: 0_level_0,pass,xYards,yards_gained
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,7,32.799999,76.0
2,7,41.700001,69.0
3,5,22.4,28.0
4,3,15.6,14.0
5,1,4.6,-1.0
7,3,18.5,8.0
8,8,38.200001,50.0
9,8,42.099998,35.0


In [232]:
print(f"Season median: {rec_df['xYards'].median()}; Last four games median: {rec_df.tail(4)['xYards'].median()}")

Season median: 27.599998474121094; Last four games median: 28.350000381469727


In [233]:
def simulate_receiver_game(
    team_pass_attempts: int,
    target_rate: float,
    yards_mean: float,
    yards_std: float,
    num_simulations: int = 1000
) -> Tuple[List[int], List[float]]:
    """
    Simulate receiving statistics for a player based on team passing attempts and player metrics.
    
    Args:
        team_pass_attempts: Number of team pass attempts
        target_rate: Rate at which player is targeted (between 0 and 1)
        yards_mean: Mean yards per target
        yards_std: Standard deviation of yards per target
        num_simulations: Number of games to simulate
        
    Returns:
        Tuple containing:
        - List of targets for each simulation
        - List of receiving yards for each simulation
    """
    
    simulated_targets = []
    simulated_yards = []
    
    for _ in range(num_simulations):
        # Simulate targets using binomial distribution
        targets = np.random.binomial(n=team_pass_attempts, p=target_rate)
        
        # Simulate yards for each target using normal distribution
        if targets > 0:
            yards = np.random.normal(yards_mean, yards_std, targets)
            # Round to 1 decimal place and ensure no negative yards
            yards = np.maximum(0, np.round(yards, 1))
            total_yards = sum(yards)
        else:
            total_yards = 0
            
        simulated_targets.append(targets)
        simulated_yards.append(total_yards)
    
    return simulated_targets, simulated_yards

def analyze_simulation_results(
    targets: List[int],
    yards: List[float]
) -> dict:
    """
    Analyze the results of the simulation.
    
    Args:
        targets: List of simulated target counts
        yards: List of simulated receiving yards
        
    Returns:
        Dictionary containing summary statistics
    """
    return {
        'avg_targets': np.mean(targets),
        'median_targets': np.median(targets),
        'target_share': round(np.mean(targets)/predicted_attempts,3),
        'target_percentiles': np.percentile(targets, [10, 25, 75, 90]),
        'avg_yards': np.mean(yards),
        'median_yards': np.median(yards),
        'yard_percentiles': np.percentile(yards, [10, 25, 75, 90])
    }


In [234]:

# Example usage
if __name__ == "__main__":
    # Example parameters
    team_passes = predicted_attempts 
    player_target_rate = rec_target_share 
    yards_per_target_mean = xYardsmean
    yards_per_target_std = xYards_sd
    
    # Run simulation
    targets, yards = simulate_receiver_game(
        team_pass_attempts=team_passes,
        target_rate=player_target_rate,
        yards_mean=yards_per_target_mean,
        yards_std=yards_per_target_std
    )
    
    # Analyze results
    results = analyze_simulation_results(targets, yards)
    
    print(f"Median Yards: {results['median_yards']:.1f}")


results
    #print(f"Simulation Results:")
    #print(f"Average Targets: {results['avg_targets']:.1f}")
    #print(f"Median Yards: {results['median_yards']:.1f}")
    #print(f"Target Range (10th-90th percentile): {results['target_percentiles'][0]:.1f} - {results['target_percentiles'][3]:.1f}")
    #print(f"Yards Range (10th-90th percentile): {results['yard_percentiles'][0]:.1f} - {results['yard_percentiles'][3]:.1f}")

Median Yards: 33.0


{'avg_targets': 6.462,
 'median_targets': 6.0,
 'target_share': 0.174,
 'target_percentiles': array([3., 5., 8., 9.]),
 'avg_yards': 33.2446,
 'median_yards': 33.05,
 'yard_percentiles': array([17.19, 23.9 , 41.4 , 49.01])}