In [1]:
import nfl_data_py as nfl
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = nfl.import_pbp_data(list(range(2018, 2024)), downcast=False, cache=False, alt_path=None)

2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.


In [3]:
#lastzn = nfl.import_pbp_data(list(range(2018, 2019)), downcast=False, cache=False, alt_path=None)

In [4]:
#lastzn[(lastzn['game_id']=='2018_10_ARI_KC')&(lastzn['posteam']=='ARI')&(lastzn['pass']==1)][['desc','play_type']]

In [5]:
df = df[df['season_type']=='REG']
df.loc[df['pass']==1, 'play_type'] = 'pass'
df.loc[df.rush==1, 'play_type'] = 'run'
df['total_plays'] = df['pass'] + df['rush']
df = df[(df['play_type']=='pass')|(df['play_type']=='run')]

In [6]:
games = df.copy()

In [7]:
# Group by 'game_id' and calculate both the sum and mean for 'pass', and any other aggregations as needed
games_grouped = games.groupby(['game_id','posteam']).agg(
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),

    #posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed


In [8]:
defense_grouped = games.groupby(['game_id','defteam']).agg(
    pass_total=('pass', 'sum'),
    pass_rate=('pass', 'mean'),
    pass_oe=('pass_oe', 'mean'),
    total_plays=('total_plays', 'sum'),
    total_line=('total_line', 'max'),
    spread_line=('spread_line', 'max'),
    week=('week', 'max'),
    season=('season', 'max'),
    posteam=('posteam', 'max'),
    home_team=('home_team', 'max')).reset_index()  # reset_index() to turn the group labels back into columns if needed

In [9]:
defense_grouped

Unnamed: 0,game_id,defteam,pass_total,pass_rate,pass_oe,total_plays,total_line,spread_line,week,season,posteam,home_team
0,2018_01_ATL_PHI,ATL,42.0,0.591549,-1.611162,71.0,44.5,1.0,1,2018,PHI,PHI
1,2018_01_ATL_PHI,PHI,52.0,0.742857,6.015367,70.0,44.5,1.0,1,2018,ATL,PHI
2,2018_01_BUF_BAL,BAL,44.0,0.676923,-5.026491,65.0,39.0,7.5,1,2018,BUF,BAL
3,2018_01_BUF_BAL,BUF,45.0,0.562500,0.668776,80.0,39.0,7.5,1,2018,BAL,BAL
4,2018_01_CHI_GB,CHI,43.0,0.716667,1.566976,60.0,45.0,6.5,1,2018,GB,GB
...,...,...,...,...,...,...,...,...,...,...,...,...
3161,2023_18_PIT_BAL,PIT,38.0,0.678571,-6.086057,56.0,34.0,-3.0,18,2023,BAL,BAL
3162,2023_18_SEA_ARI,ARI,32.0,0.581818,-9.487502,55.0,48.0,-2.5,18,2023,SEA,ARI
3163,2023_18_SEA_ARI,SEA,32.0,0.457143,-14.381830,70.0,48.0,-2.5,18,2023,ARI,ARI
3164,2023_18_TB_CAR,CAR,38.0,0.612903,1.011848,62.0,36.5,-5.0,18,2023,TB,CAR


In [10]:

def update_spread(games):
  # Create a new column 'pos_spread' to store the adjusted spread values
  games['pos_spread'] = np.where(games['posteam'] == games['home_team'], games['spread_line'], games['spread_line'] * -1)
  return games

games_grouped = update_spread(games_grouped.copy())
defenses_grouped = update_spread(defense_grouped.copy())

In [11]:
games_grouped['pos_team_total'] = games_grouped['total_line']/2 + games_grouped['pos_spread']/2
defenses_grouped['pos_team_total'] = defenses_grouped['total_line']/2 + defenses_grouped['pos_spread']/2


In [12]:
df = games_grouped.sort_values(['posteam','season','week'])

In [13]:
defense_df = defenses_grouped.sort_values(['defteam','season','week'])

In [14]:
def data_creator(df, trailing_weeks):
# First, make sure you have a season column. If not, create it from your date
# Assuming you have a 'game_date' column
# df['season'] = pd.to_datetime(df['game_date']).dt.year

# Sort by team, season, and date
    df = df.sort_values(['posteam', 'season', 'week'])

# Group by team and season, then calculate trailing averages within each group
    df['trailing_pass_avg'] = df.groupby(['posteam', 'season'])['pass_rate'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()
    df['trailing_pass_oe_avg'] = df.groupby(['posteam', 'season'])['pass_oe'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()
    df['trailing_total_plays_avg'] = df.groupby(['posteam', 'season'])['total_plays'].shift(1).rolling(window=trailing_weeks, min_periods=trailing_weeks).mean()

# For next game values, also group by team and season
    df['next_game_pass'] = df.groupby(['posteam', 'season'])['pass_total'].shift(-1)
    df['next_game_pos_team_total'] = df.groupby(['posteam', 'season'])['pos_team_total'].shift(-1)
    df['next_game_total_line'] = df.groupby(['posteam', 'season'])['total_line'].shift(-1)

# Select final columns
    all_data = df[['game_id', 'posteam', 'season', 'trailing_pass_avg', 'trailing_pass_oe_avg', 
               'trailing_total_plays_avg', 'pass_total', 'pos_team_total', 'total_line']]
    
    return all_data

In [15]:
defense_games = data_creator(defense_df,4)

In [16]:
# First, make sure you have a season column. If not, create it from your date
# Assuming you have a 'game_date' column
# df['season'] = pd.to_datetime(df['game_date']).dt.year

# Sort by team, season, and date
df = df.sort_values(['posteam', 'season', 'week'])

# Group by team and season, then calculate trailing averages within each group
df['trailing_pass_avg'] = df.groupby(['posteam', 'season'])['pass_rate'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_pass_total'] = df.groupby(['posteam', 'season'])['pass_total'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_pass_oe_avg'] = df.groupby(['posteam', 'season'])['pass_oe'].shift(1).rolling(window=4, min_periods=4).mean()
df['trailing_total_plays_avg'] = df.groupby(['posteam', 'season'])['total_plays'].shift(1).rolling(window=4, min_periods=4).mean()

# For next game values, also group by team and season
df['next_game_pass'] = df.groupby(['posteam', 'season'])['pass_total'].shift(-1)
df['next_game_pos_team_total'] = df.groupby(['posteam', 'season'])['pos_team_total'].shift(-1)
df['next_game_total_line'] = df.groupby(['posteam', 'season'])['total_line'].shift(-1)

# Select final columns
all_data = df[['game_id', 'posteam', 'season','trailing_pass_total', 'trailing_pass_avg', 'trailing_pass_oe_avg', 
               'trailing_total_plays_avg', 'pass_total', 'pos_team_total', 'total_line','week']]


In [17]:
all_data['identifier'] = all_data['game_id']+'_pos_'+all_data['posteam']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['identifier'] = all_data['game_id']+'_pos_'+all_data['posteam']


In [18]:
defense_games['identifier'] = defense_games['game_id']+'_pos_'+defense_games['posteam']

In [19]:
defense_data = defense_games.rename(columns={'trailing_pass_avg':'def_pass_rate','trailing_pass_oe_avg':'def_pass_oe','trailing_total_plays_avg':'def_total_plays'})[['identifier','def_pass_rate','def_pass_oe','def_total_plays']]

In [20]:
merged_data = all_data.merge(defense_data,on='identifier')

In [33]:
merged_data.dropna(inplace=True)

In [35]:
merged_data

Unnamed: 0,game_id,posteam,season,trailing_pass_total,trailing_pass_avg,trailing_pass_oe_avg,trailing_total_plays_avg,pass_total,pos_team_total,total_line,week,identifier,def_pass_rate,def_pass_oe,def_total_plays
4,2018_05_ARI_SF,ARI,2018,33.75,0.656667,-4.927605,51.75,27.0,18.75,40.5,5,2018_05_ARI_SF_pos_ARI,0.656667,-4.927605,51.75
5,2018_06_ARI_MIN,ARI,2018,30.75,0.612784,-4.244404,50.50,37.0,16.75,43.5,6,2018_06_ARI_MIN_pos_ARI,0.612784,-4.244404,50.50
6,2018_07_DEN_ARI,ARI,2018,32.50,0.605600,-3.786745,53.75,46.0,20.75,42.5,7,2018_07_DEN_ARI_pos_ARI,0.605600,-3.786745,53.75
7,2018_08_SF_ARI,ARI,2018,35.75,0.614842,-2.824292,57.75,47.0,18.75,40.0,8,2018_08_SF_ARI_pos_ARI,0.614842,-2.824292,57.75
8,2018_10_ARI_KC,ARI,2018,39.25,0.655373,-0.781184,59.25,47.0,16.50,49.5,10,2018_10_ARI_KC_pos_ARI,0.655373,-0.781184,59.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3161,2023_13_MIA_WAS,WAS,2023,51.50,0.697874,5.403667,74.25,30.0,20.75,50.0,13,2023_13_MIA_WAS_pos_WAS,0.697874,5.403667,74.25
3162,2023_15_WAS_LA,WAS,2023,46.00,0.665406,0.515918,68.75,44.0,21.00,48.5,15,2023_15_WAS_LA_pos_WAS,0.665406,0.515918,68.75
3163,2023_16_WAS_NYJ,WAS,2023,45.25,0.653111,-2.047875,68.75,40.0,16.75,36.5,16,2023_16_WAS_NYJ_pos_WAS,0.653111,-2.047875,68.75
3164,2023_17_SF_WAS,WAS,2023,41.50,0.647293,-1.330325,63.75,32.0,17.50,49.0,17,2023_17_SF_WAS_pos_WAS,0.647293,-1.330325,63.75


In [36]:

# Prepare the features (X) and target (y)

X = all_data[['trailing_pass_total','trailing_pass_avg', 'trailing_pass_oe_avg', 'trailing_total_plays_avg','total_line','pos_team_total']]
y = all_data['pass_total']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best model from GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Feature importance
importance = best_model.feature_importances_
feature_names = X.columns

# Create a DataFrame for better readability
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

# Print the feature importance
print("Feature Importance:")
print(feature_importance_df)


Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Mean Squared Error: 80.63
R^2 Score: 0.08
Feature Importance:
                    Feature  Importance
2      trailing_pass_oe_avg    0.279000
0       trailing_pass_total    0.201868
4                total_line    0.158166
1         trailing_pass_avg    0.151815
3  trailing_total_plays_avg    0.105354
5            pos_team_total    0.103796


In [23]:
# 'def_pass_rate','def_pass_oe','def_total_plays' didn't seem to add any predictive power

In [24]:
import pickle


with open('pass_volume_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)