In [1]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = nfl.import_pbp_data(list(range(2018, 2024)), downcast=False, cache=False, alt_path=None)

2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.


In [3]:
#lastzn = nfl.import_pbp_data(list(range(2018, 2019)), downcast=False, cache=False, alt_path=None)

In [4]:
#lastzn[(lastzn['game_id']=='2018_10_ARI_KC')&(lastzn['posteam']=='ARI')&(lastzn['pass']==1)][['desc','play_type']]

In [5]:
df = df[df['season_type']=='REG']
df.loc[df['pass']==1, 'play_type'] = 'pass'
df.loc[df.rush==1, 'play_type'] = 'run'
df['total_plays'] = df['pass'] + df['rush']
df = df[(df['play_type']=='pass')|(df['play_type']=='run')]

In [6]:
games = df.copy()

In [7]:
# Group by 'game_id' and calculate both the sum and mean for 'pass', and any other aggregations as needed
games_grouped = games.groupby(['game_id','posteam']).agg(
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),

    #posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed


In [8]:
defense_grouped = games.groupby(['game_id','defteam']).agg(
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),
    posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed

In [9]:
defense_grouped

Unnamed: 0,game_id,defteam,pass_total,pass_rate,pass_oe,total_plays,total_line,spread_line,week,season,posteam,home_team
0,2018_01_ATL_PHI,ATL,42.0,0.591549,-1.611162,71.0,44.5,1.0,1,2018,PHI,PHI
1,2018_01_ATL_PHI,PHI,52.0,0.742857,6.015367,70.0,44.5,1.0,1,2018,ATL,PHI
2,2018_01_BUF_BAL,BAL,44.0,0.676923,-5.026491,65.0,39.0,7.5,1,2018,BUF,BAL
3,2018_01_BUF_BAL,BUF,45.0,0.562500,0.668776,80.0,39.0,7.5,1,2018,BAL,BAL
4,2018_01_CHI_GB,CHI,43.0,0.716667,1.566976,60.0,45.0,6.5,1,2018,GB,GB
...,...,...,...,...,...,...,...,...,...,...,...,...
3161,2023_18_PIT_BAL,PIT,38.0,0.678571,-6.086057,56.0,34.0,-3.0,18,2023,BAL,BAL
3162,2023_18_SEA_ARI,ARI,32.0,0.581818,-9.487502,55.0,48.0,-2.5,18,2023,SEA,ARI
3163,2023_18_SEA_ARI,SEA,32.0,0.457143,-14.381830,70.0,48.0,-2.5,18,2023,ARI,ARI
3164,2023_18_TB_CAR,CAR,38.0,0.612903,1.011848,62.0,36.5,-5.0,18,2023,TB,CAR


In [10]:

def update_spread(games):
  # Create a new column 'pos_spread' to store the adjusted spread values
  games['pos_spread'] = np.where(games['posteam'] == games['home_team'], games['spread_line'], games['spread_line'] * -1)
  return games

games_grouped = update_spread(games_grouped.copy())
defenses_grouped = update_spread(defense_grouped.copy())

In [11]:
games_grouped

Unnamed: 0,game_id,posteam,pass_total,pass_rate,pass_oe,total_plays,total_line,spread_line,week,season,home_team,pos_spread
0,2018_01_ATL_PHI,ATL,52.0,0.742857,6.015367,70.0,44.5,1.0,1,2018,PHI,-1.0
1,2018_01_ATL_PHI,PHI,42.0,0.591549,-1.611162,71.0,44.5,1.0,1,2018,PHI,1.0
2,2018_01_BUF_BAL,BAL,45.0,0.562500,0.668776,80.0,39.0,7.5,1,2018,BAL,7.5
3,2018_01_BUF_BAL,BUF,44.0,0.676923,-5.026491,65.0,39.0,7.5,1,2018,BAL,-7.5
4,2018_01_CHI_GB,CHI,46.0,0.657143,7.004935,70.0,45.0,6.5,1,2018,GB,-6.5
...,...,...,...,...,...,...,...,...,...,...,...,...
3161,2023_18_PIT_BAL,PIT,23.0,0.377049,-20.734013,61.0,34.0,-3.0,18,2023,BAL,3.0
3162,2023_18_SEA_ARI,ARI,32.0,0.457143,-14.381830,70.0,48.0,-2.5,18,2023,ARI,-2.5
3163,2023_18_SEA_ARI,SEA,32.0,0.581818,-9.487502,55.0,48.0,-2.5,18,2023,ARI,2.5
3164,2023_18_TB_CAR,CAR,24.0,0.461538,-21.364459,52.0,36.5,-5.0,18,2023,CAR,-5.0


In [12]:
games_grouped['pos_team_total'] = games_grouped['total_line']/2 + games_grouped['pos_spread']/2
defenses_grouped['pos_team_total'] = defenses_grouped['total_line']/2 + defenses_grouped['pos_spread']/2


In [13]:
df = games_grouped.sort_values(['posteam','season','week'])

In [14]:
defense_df = defenses_grouped.sort_values(['defteam','season','week'])

In [15]:
def data_creator(df, trailing_weeks):
# Assuming df is your original dataframe with relevant columns and it's sorted by game date
# Calculate the trailing 4-game average for 'pass', 'pass_oe', and 'total_plays', shifted by one additional game to exclude the current one
    #print(df.head(2))
    df['trailing_pass_avg'] = df['pass_rate'].shift(1).rolling(window=trailing_weeks).mean()
    df['trailing_pass_oe_avg'] = df['pass_oe'].shift(1).rolling(window=trailing_weeks).mean()
    df['trailing_total_plays_avg'] = df['total_plays'].shift(1).rolling(window=trailing_weeks).mean()

# Shift the columns to get the values of the next game for 'pass', 'pos_team_total', and 'total_line'
    df['next_game_pass'] = df['pass_total'].shift(-1)
    df['next_game_pos_team_total'] = df['pos_team_total'].shift(-1)
    df['next_game_total_line'] = df['total_line'].shift(-1)

# Drop rows where we do not have a full 4-game average or the next game's values
    df_trailing_averages = df.dropna(subset=['trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg'])

# Select only the relevant columns in the final dataframe
    all_data = df_trailing_averages[['game_id','trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg', 
                                             'pass_total', 'pos_team_total', 'total_line']]
    return all_data

In [16]:
defense_df

Unnamed: 0,game_id,defteam,pass_total,pass_rate,pass_oe,total_plays,total_line,spread_line,week,season,posteam,home_team,pos_spread,pos_team_total
30,2018_01_WAS_ARI,ARI,39.0,0.500000,-2.195774,78.0,43.5,2.0,1,2018,WAS,ARI,-2.0,20.75
32,2018_02_ARI_LA,ARI,36.0,0.500000,-1.742604,72.0,43.5,13.5,2,2018,LA,LA,13.5,28.50
66,2018_03_CHI_ARI,ARI,45.0,0.608108,-1.142559,74.0,39.0,-6.0,3,2018,CHI,ARI,6.0,22.50
120,2018_04_SEA_ARI,ARI,31.0,0.476923,-14.971658,65.0,40.0,-3.5,4,2018,SEA,ARI,3.5,21.75
126,2018_05_ARI_SF,ARI,64.0,0.646465,-4.681148,99.0,40.5,3.0,5,2018,SF,SF,3.0,21.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3003,2023_13_MIA_WAS,WAS,26.0,0.426230,-11.123617,61.0,50.0,-8.5,13,2023,MIA,WAS,8.5,29.25
3069,2023_15_WAS_LA,WAS,38.0,0.506667,-3.583407,75.0,48.5,6.5,15,2023,LA,LA,6.5,27.50
3101,2023_16_WAS_NYJ,WAS,55.0,0.611111,1.489080,90.0,36.5,3.0,16,2023,NYJ,NYJ,3.0,19.75
3131,2023_17_SF_WAS,WAS,31.0,0.455882,-2.505045,68.0,49.0,-14.0,17,2023,SF,WAS,14.0,31.50


In [17]:
defense_games = data_creator(defense_df,4)

In [18]:

# Assuming df is your original dataframe with relevant columns and it's sorted by game date
# Calculate the trailing 4-game average for 'pass', 'pass_oe', and 'total_plays', shifted by one additional game to exclude the current one
df['trailing_pass_avg'] = df['pass_rate'].shift(1).rolling(window=4).mean()
df['trailing_pass_oe_avg'] = df['pass_oe'].shift(1).rolling(window=4).mean()
df['trailing_total_plays_avg'] = df['total_plays'].shift(1).rolling(window=4).mean()

# Shift the columns to get the values of the next game for 'pass', 'pos_team_total', and 'total_line'
df['next_game_pass'] = df['pass_total'].shift(-1)
df['next_game_pos_team_total'] = df['pos_team_total'].shift(-1)
df['next_game_total_line'] = df['total_line'].shift(-1)

# Drop rows where we do not have a full 4-game average or the next game's values
df_trailing_averages = df.dropna(subset=['trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg'])

# Select only the relevant columns in the final dataframe
all_data = df_trailing_averages[['game_id','trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg', 
                                             'pass_total', 'pos_team_total', 'total_line']]


In [19]:
defense_data = defense_games.rename(columns={'trailing_pass_avg':'def_pass_rate','trailing_pass_oe_avg':'def_pass_oe','trailing_total_plays_avg':'def_total_plays'})[['game_id','def_pass_rate','def_pass_oe','def_total_plays']]

In [20]:
merged_data = all_data.merge(defense_data,on='game_id')

In [21]:

# Prepare the features (X) and target (y)

X = merged_data[['trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg','total_line','pos_team_total','def_pass_rate','def_pass_oe','def_total_plays']]
y = merged_data['pass_total']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Feature importance
importance = best_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for better readability
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

# Print the feature importance
print("Feature Importance:")
print(feature_importance_df)


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Mean Squared Error: 55.40
R^2 Score: 0.36
Feature Importance:
                    Feature  Importance
1      trailing_pass_oe_avg    0.158549
3                total_line    0.155270
4            pos_team_total    0.154103
2  trailing_total_plays_avg    0.140646
0         trailing_pass_avg    0.117155
7           def_total_plays    0.097908
6               def_pass_oe    0.088228
5             def_pass_rate    0.088141


In [24]:
import pickle


with open('pass_volume_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)