In [1]:

# Setup
!pip install pyarrow --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [2]:

# Load past play-by-play data
pbp_2022 = pd.read_parquet("/content/drive/My Drive/NFL_data/play_by_play_2022.parquet")
pbp_2023 = pd.read_parquet("/content/drive/My Drive/NFL_data/play_by_play_2023.parquet")
pbp = pd.concat([pbp_2022, pbp_2023], ignore_index=True)
pbp = pbp[pbp['season_type'] == 'REG']


In [3]:

# Aggregate team-level game stats
pbp = pbp[pbp['game_id'].notna() & pbp['epa'].notna()]

# Offensive EPA per team per game
off = pbp.groupby(['game_id', 'posteam'])['epa'].mean().reset_index()
off.columns = ['game_id', 'team', 'off_epa']

# Defensive EPA per team per game
def_ = pbp.groupby(['game_id', 'defteam'])['epa'].mean().reset_index()
def_.columns = ['game_id', 'team', 'def_epa']

# Merge with winner info
games = pbp.drop_duplicates('game_id')[['game_id', 'home_team', 'away_team', 'home_score', 'away_score', 'season', 'week']]
games = games.dropna()

games['home_win'] = (games['home_score'] > games['away_score']).astype(int)

# Merge team stats
home_stats = off.rename(columns={'team': 'home_team', 'off_epa': 'home_off_epa'}).merge(
    def_.rename(columns={'team': 'away_team', 'def_epa': 'away_def_epa'}),
    on='game_id'
)

away_stats = off.rename(columns={'team': 'away_team', 'off_epa': 'away_off_epa'}).merge(
    def_.rename(columns={'team': 'home_team', 'def_epa': 'home_def_epa'}),
    on='game_id'
)

full_stats = games.merge(home_stats, on=['game_id', 'home_team', 'away_team'], how='inner')
full_stats = full_stats.merge(away_stats, on=['game_id', 'home_team', 'away_team'], how='inner')

full_stats = full_stats.dropna()
full_stats.head()


Unnamed: 0,game_id,home_team,away_team,home_score,away_score,season,week,home_win,home_off_epa,away_def_epa,away_off_epa,home_def_epa
0,2022_01_BAL_NYJ,NYJ,BAL,9,24,2022,1,0,-0.19539,-0.19539,0.016643,0.016643
1,2022_01_BUF_LA,LA,BUF,10,31,2022,1,0,-0.232365,-0.232365,0.137512,0.137512
2,2022_01_CLE_CAR,CAR,CLE,24,26,2022,1,0,0.023642,0.023642,0.081044,0.081044
3,2022_01_DEN_SEA,SEA,DEN,17,16,2022,1,1,0.071675,0.071675,0.057772,0.057772
4,2022_01_GB_MIN,MIN,GB,23,7,2022,1,1,0.081753,0.081753,-0.170877,-0.170877


In [4]:

# Feature set
full_stats['epa_diff'] = full_stats['home_off_epa'] - full_stats['away_def_epa']
full_stats['def_diff'] = full_stats['home_def_epa'] - full_stats['away_off_epa']

features = ['epa_diff', 'def_diff']
X = full_stats[features]
y = full_stats['home_win']


In [5]:

# Train-test split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.56      1.00      0.72        61

    accuracy                           0.56       109
   macro avg       0.28      0.50      0.36       109
weighted avg       0.31      0.56      0.40       109



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:

# Load 2024 pbp for team stats
pbp_2024 = pd.read_parquet("/content/drive/My Drive/NFL_data/play_by_play_2024.parquet")

off_epa = (
    pbp_2024[pbp_2024['posteam'].notna()]
    .groupby('posteam')['epa']
    .mean()
    .reset_index()
    .rename(columns={'posteam': 'team', 'epa': 'off_epa'})
)

def_epa = (
    pbp_2024[pbp_2024['defteam'].notna()]
    .groupby('defteam')['epa']
    .mean()
    .reset_index()
    .rename(columns={'defteam': 'team', 'epa': 'def_epa'})
)

team_stats_2024 = pd.merge(off_epa, def_epa, on='team')
team_stats_2024.head()


Unnamed: 0,team,off_epa,def_epa
0,ARI,0.06277,0.049731
1,ATL,0.0114,0.043905
2,BAL,0.143104,0.016175
3,BUF,0.140825,0.007585
4,CAR,-0.039231,0.121386


In [7]:

# Load 2025 schedule (placeholder)
schedule = pd.DataFrame({
    'week': range(1, 19),
    'home_team': np.random.choice(team_stats_2024['team'], size=18, replace=True),
    'away_team': np.random.choice(team_stats_2024['team'], size=18, replace=True)
})
schedule = schedule[schedule['home_team'] != schedule['away_team']].reset_index(drop=True)


In [8]:

# Join 2024 stats to 2025 schedule
df_2025 = schedule.copy()

df_2025 = df_2025.merge(team_stats_2024.add_prefix('home_'), left_on='home_team', right_on='home_team', how='left')
df_2025 = df_2025.merge(team_stats_2024.add_prefix('away_'), left_on='away_team', right_on='away_team', how='left')

df_2025['epa_diff'] = df_2025['home_off_epa'] - df_2025['away_def_epa']
df_2025['def_diff'] = df_2025['home_def_epa'] - df_2025['away_off_epa']

X_2025 = df_2025[['epa_diff', 'def_diff']]
df_2025['predicted_home_win'] = model.predict(X_2025)

df_2025[['week', 'home_team', 'away_team', 'predicted_home_win']]


Unnamed: 0,week,home_team,away_team,predicted_home_win
0,1,CAR,CLE,1
1,2,SEA,CHI,1
2,3,ATL,LAC,1
3,5,ARI,LA,1
4,6,NE,PHI,1
5,7,ARI,MIN,1
6,8,MIA,NYG,1
7,9,CHI,BUF,1
8,10,TEN,SEA,1
9,11,LAC,PIT,1


In [9]:
import pandas as pd

matchups_path = "/content/drive/My Drive/NFL_data/nfl_week1_2025_matchups.csv"
matchups = pd.read_csv(matchups_path)


In [10]:
team_stats = team_stats_2024  # if defined earlier in notebook


In [11]:
df_2025 = matchups.copy()

df_2025 = df_2025.merge(team_stats.add_prefix("home_"), left_on="home_team", right_on="home_team", how="left")
df_2025 = df_2025.merge(team_stats.add_prefix("away_"), left_on="away_team", right_on="away_team", how="left")


In [12]:
df_2025["epa_diff"] = df_2025["home_off_epa"] - df_2025["away_def_epa"]
df_2025["def_diff"] = df_2025["home_def_epa"] - df_2025["away_off_epa"]

X_2025 = df_2025[["epa_diff", "def_diff"]]


In [13]:
df_2025["predicted_home_win"] = model.predict(X_2025)


In [14]:
df_2025[["week", "home_team", "away_team", "predicted_home_win"]]


Unnamed: 0,week,home_team,away_team,predicted_home_win
0,1,DAL,PHI,1
1,1,KC,LAC,1
2,1,TB,ATL,1
3,1,CIN,CLE,1
4,1,MIA,IND,1
5,1,CAR,JAX,1
6,1,LV,NE,1
7,1,ARI,NO,1
8,1,PIT,NYJ,1
9,1,NYG,WSH,1


In [None]:

# Add home_field advantage and slight noise
df_2025['home_field'] = 1
df_2025['random_noise'] = np.random.normal(0, 0.01, size=len(df_2025))

df_2025["epa_diff"] = df_2025["home_off_epa"] - df_2025["away_def_epa"]
df_2025["def_diff"] = df_2025["home_def_epa"] - df_2025["away_off_epa"]

X_2025 = df_2025[["epa_diff", "def_diff", "home_field", "random_noise"]]
df_2025["predicted_home_win"] = model.predict(X_2025)

df_2025[["week", "home_team", "away_team", "predicted_home_win"]]
    