In [1]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = nfl.import_pbp_data(list(range(2018, 2024)), downcast=False, cache=False, alt_path=None)

2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.


In [3]:
#lastzn = nfl.import_pbp_data(list(range(2018, 2019)), downcast=False, cache=False, alt_path=None)

In [4]:
#lastzn[(lastzn['game_id']=='2018_10_ARI_KC')&(lastzn['posteam']=='ARI')&(lastzn['pass']==1)][['desc','play_type']]

In [5]:
df = df[df['season_type']=='REG']
df.loc[df['pass']==1, 'play_type'] = 'pass'
df.loc[df.rush==1, 'play_type'] = 'run'
df['total_plays'] = df['pass'] + df['rush']
df = df[df['two_point_attempt']==0]
df = df[(df['play_type']=='pass')|(df['play_type']=='run')]

In [6]:
df.loc[df['receiver_player_name'].isna()==False, 'target']=1

In [7]:
df['target'] = df['target'].fillna(0)

In [8]:
games = df.copy()

In [9]:
# Group by 'game_id' and calculate both the sum and mean for 'pass', and any other aggregations as needed
games_grouped = games.groupby(['game_id','posteam']).agg(
    target_total=('target', 'sum'),

    
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),

    #posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed


In [10]:
defense_grouped = games.groupby(['game_id','defteam']).agg(
    target_total=('target', 'sum'),
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),
    posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed

In [11]:

def update_spread(games):
  # Create a new column 'pos_spread' to store the adjusted spread values
  games['pos_spread'] = np.where(games['posteam'] == games['home_team'], games['spread_line'], games['spread_line'] * -1)
  return games

games_grouped = update_spread(games_grouped.copy())
defenses_grouped = update_spread(defense_grouped.copy())

In [12]:
games_grouped['pos_team_total'] = games_grouped['total_line']/2 + games_grouped['pos_spread']/2
defenses_grouped['pos_team_total'] = defenses_grouped['total_line']/2 + defenses_grouped['pos_spread']/2


In [13]:
df = games_grouped.sort_values(['posteam','season','week'])

In [14]:
defense_df = defenses_grouped.sort_values(['defteam','season','week'])

In [19]:
def data_creator(df, trailing_weeks):
# First, make sure you have a season column. If not, create it from your date
# Assuming you have a 'game_date' column
# df['season'] = pd.to_datetime(df['game_date']).dt.year

# Sort by team, season, and date
    df = df.sort_values(['posteam', 'season', 'week'])

# Group by team and season, then calculate trailing averages within each group
    df['trailing_pass_avg'] = df.groupby(['posteam', 'season'])['pass_rate'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()
    df['trailing_pass_oe_avg'] = df.groupby(['posteam', 'season'])['pass_oe'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()
    df['trailing_total_plays_avg'] = df.groupby(['posteam', 'season'])['total_plays'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()

# For next game values, also group by team and season
    df['next_game_target'] = df.groupby(['posteam', 'season'])['target_total'].shift(-1)
    df['next_game_pos_team_total'] = df.groupby(['posteam', 'season'])['pos_team_total'].shift(-1)
    df['next_game_total_line'] = df.groupby(['posteam', 'season'])['total_line'].shift(-1)

# Select final columns
    all_data = df[['game_id', 'defteam', 'season', 'trailing_pass_avg', 'trailing_pass_oe_avg', 
               'trailing_total_plays_avg', 'target_total', 'pos_team_total', 'total_line']]
    
    return all_data

In [20]:
defense_games = data_creator(defense_df,4)

In [21]:
defense_games.head(6)

Unnamed: 0,game_id,defteam,season,trailing_pass_avg,trailing_pass_oe_avg,trailing_total_plays_avg,target_total,pos_team_total,total_line
31,2018_01_WAS_ARI,WAS,2018,,,,34.0,22.75,43.5
33,2018_02_ARI_LA,LA,2018,,,,27.0,15.0,43.5
67,2018_03_CHI_ARI,CHI,2018,,,,25.0,16.5,39.0
121,2018_04_SEA_ARI,SEA,2018,,,,25.0,18.25,40.0
127,2018_05_ARI_SF,SF,2018,0.655441,-4.927605,51.5,23.0,18.75,40.5
157,2018_06_ARI_MIN,MIN,2018,0.612784,-4.244404,50.5,31.0,16.75,43.5


In [22]:
# First, make sure you have a season column. If not, create it from your date
# Assuming you have a 'game_date' column
# df['season'] = pd.to_datetime(df['game_date']).dt.year

# Sort by team, season, and date
df = df.sort_values(['posteam', 'season', 'week'])

# Group by team and season, then calculate trailing averages within each group
df['trailing_pass_avg'] = df.groupby(['posteam', 'season'])['pass_rate'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_pass_total'] = df.groupby(['posteam', 'season'])['pass_total'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_pass_oe_avg'] = df.groupby(['posteam', 'season'])['pass_oe'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_total_plays_avg'] = df.groupby(['posteam', 'season'])['total_plays'].shift(1).rolling(window=4, min_periods=4).mean()

# For next game values, also group by team and season
df['next_game_pass'] = df.groupby(['posteam', 'season'])['pass_total'].shift(-1)
df['next_game_pos_team_total'] = df.groupby(['posteam', 'season'])['pos_team_total'].shift(-1)
df['next_game_total_line'] = df.groupby(['posteam', 'season'])['total_line'].shift(-1)

# Select final columns
all_data = df[['game_id', 'posteam', 'season','trailing_pass_total', 'trailing_pass_avg', 'trailing_pass_oe_avg', 
               'trailing_total_plays_avg', 'target_total', 'pos_team_total', 'total_line','week','pos_spread']]


In [23]:
all_data['identifier'] = all_data['game_id']+'_pos_'+all_data['posteam']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['identifier'] = all_data['game_id']+'_pos_'+all_data['posteam']


In [24]:
defense_games['identifier'] = defense_games['game_id']+'_pos_'+defense_games['defteam']

In [25]:
defense_data = defense_games.rename(columns={'trailing_pass_avg':'def_pass_rate','trailing_pass_oe_avg':'def_pass_oe','trailing_total_plays_avg':'def_total_plays'})[['identifier','def_pass_rate','def_pass_oe','def_total_plays']]

In [26]:
merged_data = all_data.merge(defense_data,on='identifier')

In [27]:
merged_data.dropna(inplace=True)

In [28]:
merged_data

Unnamed: 0,game_id,posteam,season,trailing_pass_total,trailing_pass_avg,trailing_pass_oe_avg,trailing_total_plays_avg,target_total,pos_team_total,total_line,week,pos_spread,identifier,def_pass_rate,def_pass_oe,def_total_plays
4,2018_05_ARI_SF,ARI,2018,33.50,0.655441,-4.927605,51.50,23.0,18.75,40.5,5,-3.0,2018_05_ARI_SF_pos_ARI,0.626637,2.057152,64.00
5,2018_06_ARI_MIN,ARI,2018,30.75,0.612784,-4.244404,50.50,31.0,16.75,43.5,6,-10.0,2018_06_ARI_MIN_pos_ARI,0.776669,10.646369,67.50
6,2018_07_DEN_ARI,ARI,2018,32.50,0.605600,-3.786745,53.75,36.0,20.75,42.5,7,-1.0,2018_07_DEN_ARI_pos_ARI,0.696229,0.818644,67.25
7,2018_08_SF_ARI,ARI,2018,35.75,0.614842,-2.824292,57.75,35.0,18.75,40.0,8,-2.5,2018_08_SF_ARI_pos_ARI,0.616123,-1.711947,68.00
8,2018_10_ARI_KC,ARI,2018,39.00,0.654265,-0.781184,59.00,37.0,16.50,49.5,10,-16.5,2018_10_ARI_KC_pos_ARI,0.659447,9.904429,61.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3161,2023_13_MIA_WAS,WAS,2023,51.50,0.697874,5.403667,74.25,22.0,20.75,50.0,13,-8.5,2023_13_MIA_WAS_pos_WAS,0.608493,-0.657864,68.50
3162,2023_15_WAS_LA,WAS,2023,45.75,0.663302,0.515918,68.50,34.0,21.00,48.5,15,-6.5,2023_15_WAS_LA_pos_WAS,0.588728,-2.652622,67.50
3163,2023_16_WAS_NYJ,WAS,2023,45.00,0.651007,-2.047875,68.50,32.0,16.75,36.5,16,-3.0,2023_16_WAS_NYJ_pos_WAS,0.726999,3.417612,62.75
3164,2023_17_SF_WAS,WAS,2023,41.25,0.645188,-1.330325,63.50,26.0,17.50,49.0,17,-14.0,2023_17_SF_WAS_pos_WAS,0.605519,1.336591,58.50


In [29]:

# Prepare the features (X) and target (y)

X = all_data[['trailing_pass_total','trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg','total_line','pos_team_total','pos_spread']]
y = all_data['target_total']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Feature importance
importance = best_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for better readability
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

# Print the feature importance
print("Feature Importance:")
print(feature_importance_df)


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 59.57
R^2 Score: 0.09
Feature Importance:
                    Feature  Importance
2      trailing_pass_oe_avg    0.307438
4                total_line    0.166973
0       trailing_pass_total    0.156906
5            pos_team_total    0.122914
6                pos_spread    0.085588
3  trailing_total_plays_avg    0.084580
1         trailing_pass_avg    0.075601


In [None]:
# 'def_pass_rate','def_pass_oe','def_total_plays' didn't seem to add any predictive power

In [30]:
import pickle


with open('pass_volume_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)