# Notebook 1: Data Loading and Initial Cleaning

In [2]:
import pandas as pd
import numpy as np
from nfl_data_py import import_pbp_data
import seaborn as sns
import matplotlib.pyplot as plt

# Seasons covered by the dataset: 1999 through 2024 inclusive.
year_list = list(range(1999, 2025))

# The play-by-play data is read from a local CSV snapshot; the direct
# download alternative is kept here for reference:
# df = import_pbp_data(year_list)
df = pd.read_csv('nfl_data_1999_2024.csv', low_memory=False)

# Show full cell contents (e.g. long play descriptions) without truncation.
pd.set_option('display.max_colwidth', None)

## Data Overview

- Source: `nfl_data_py` play-by-play dataset (1999-2024)
- Granularity: One row per play (~1.2 million rows, 393 columns)
- Key fields: `game_id`, `posteam`, `yardline_100`, `qtr`, `down`, `ydstogo`, `game_seconds_remaining`, `score_differential`, `vegas_home_wp`

The dataset's granularity enables real-time win probability modeling, as each row represents a decision point before the ball is snapped.

Note: Games prior to 2003 were excluded due to invalid score data. This filtering ensures training labels are accurate and consistent. Overtime plays were also excluded due to invalid target values.

## Initial Cleaning

In [3]:
# Preview the first five rows of the raw dataframe.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,...,offense_players,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type
0,0,35.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,...,,,,,,,,,,
1,1,60.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,...,,,,,,,,,,
2,2,82.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,...,,,,,,,,,,
3,3,103.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,...,,,,,,,,,,
4,4,126.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,...,,,,,,,,,,


In [4]:
# Remove the leftover index column that was written into the CSV.
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Show the dataframe's dimensions as a (rows, columns) tuple.
df.shape

(1230855, 392)

In [None]:
# Print the per-column summary (column names and data types).
# verbose=True requests the full column-by-column listing instead of a
# truncated summary, which matters for a frame this wide.
df.info(verbose=True)

In [None]:
# Alphabetically sorted column names, handy as a lookup reference.
col_list = sorted(df.columns)
col_list

In [5]:
# Count distinct game ids (a missing id, if any, is counted as its own value).
df['game_id'].nunique(dropna=False)

6988

## Feature Engineering

We reframe key features relative to the home team to provide a consistent baseline across games.

Without this step, possession-based features such as `score_differential` would create inconsistencies for modeling.

In [6]:
# Create home possession flag and home score differential.
# `home_pos` is 1 when the team in possession (`posteam`) is the home team and
# 0 otherwise; rows where `posteam` is missing compare False and therefore get 0.
# This column must be created here: later cells read it, and without it the
# notebook fails on a fresh kernel with a KeyError.
df['home_pos'] = (df['posteam'] == df['home_team']).astype(int)

# `score_differential` is possession-based, so flip its sign whenever the away
# team has the ball to express it from the home team's perspective.
df['home_score_differential'] = np.where(
    df['home_pos'] == 1,
    df['score_differential'],
    -df['score_differential'],
)

In [7]:
# Spot-check the home-perspective differential on a late stretch of one game
# in which possession changes hands.
df.loc[
    df['game_id'] == '2003_02_CAR_TB',
    ['score_differential', 'home_score_differential', 'desc'],
].iloc[-50:-40]

Unnamed: 0,score_differential,home_score_differential,desc
187082,6.0,-6.0,"(2:18) 48-S.Davis left tackle to CAR 48 for 4 yards (99-W.Sapp, 55-D.Brooks)."
187083,,,Timeout #3 by TB at 02:11.
187084,6.0,-6.0,(2:11) 48-S.Davis left end to 50 for 2 yards (20-R.Barber).
187085,6.0,-6.0,"(2:00) 10-T.Sauerbrun punts 40 yards to TB 10, Center-56-J.Kyle. 86-K.Williams ran ob at TB 18 for 8 yards (88-K.Hankton)."
187086,-6.0,-6.0,(1:49) 14-B.Johnson pass incomplete to 19-K.Johnson (27-D.Grant).
187087,-6.0,-6.0,(1:43) 14-B.Johnson pass to 86-K.Williams ran ob at CAR 39 for 43 yards (30-M.Minter).
187088,-6.0,-6.0,(1:35) 14-B.Johnson pass incomplete to 32-M.Pittman (23-R.Howard).
187089,-6.0,-6.0,(1:28) 14-B.Johnson pass incomplete to 87-K.McCardell. RECEIVER RULED OUT OF BOUNDS
187090,-6.0,-6.0,(1:22) 14-B.Johnson pass to 32-M.Pittman pushed ob at CAR 28 for 11 yards (54-W.Witherspoon).
187091,-6.0,-6.0,(1:15) 14-B.Johnson pass to 32-M.Pittman to CAR 18 for 10 yards (54-W.Witherspoon).


In [8]:
# Label every row with the game's winner and a binary home-win flag.
# A positive `result` selects the home team; anything else selects the away team.
df['winner'] = df['home_team'].where(df['result'] > 0, df['away_team'])
df['home_win'] = df['winner'].eq(df['home_team']).astype(int)

We also create a `home_win` label for supervised evaluation. 

While not used in model training, it allows cross-checking model predictions against actual outcomes.

In [9]:
# Express field position from the home team's perspective: keep `yardline_100`
# when the home team has the ball, otherwise mirror it around the 100-yard field.
df['yardline_100_home'] = df['yardline_100'].where(
    df['home_pos'] == 1,
    100 - df['yardline_100'],
)

We standardize field position so it always represents distance to the opponent's end zone for the home team. This avoids possession-based inconsistencies.

In [10]:
# time_weight = fraction of the 3600 seconds of regulation already elapsed.
df['time_weight'] = 1 - df['game_seconds_remaining'].div(3600)

`time_weight` reverses and rescales the game clock: it is the fraction of regulation time (3,600 seconds) that has elapsed. The opening kickoff has a `time_weight` of 0, and the end of regulation has a `time_weight` of 1.

In [11]:
# Pin time_weight to 1 for every overtime play (any quarter beyond the 4th).
df['time_weight'] = df['time_weight'].mask(df['qtr'] > 4, 1)

Overtime is indicated as quarter 5 (6 if double OT). Although NFL overtime rules are not technically sudden death (in which the first team to score wins), it's reasonable to suggest that any play in overtime is just as important as the end of the 4th quarter and should therefore be weighted as such. For this reason, we set any overtime period to a fixed time_weight of 1.

In [12]:
# List the ids of games that ended level (result of 0) and report how many there are.
tied_game_ids = df.loc[df['result'] == 0, 'game_id'].unique()
print(tied_game_ids)
print("Count: ", len(tied_game_ids))

['2002_10_ATL_PIT' '2008_11_PHI_CIN' '2012_10_STL_SF' '2013_12_MIN_GB'
 '2014_06_CAR_CIN' '2016_07_SEA_ARI' '2016_08_WAS_CIN' '2018_01_PIT_CLE'
 '2018_02_MIN_GB' '2019_01_DET_ARI' '2020_03_CIN_PHI' '2021_10_DET_PIT'
 '2022_01_IND_HOU' '2022_13_WAS_NYG']
Count:  14


NFL tie games are incredibly rare, with only 14 occurring across the 26 seasons (1999-2024) in this dataset. We exclude them.

Fun fact: The Bengals have four of the fourteen ties, two of which are against the Eagles.

In [12]:
# Keep only games with a non-zero result, i.e. drop every play from tied games.
df = df.loc[df['result'].ne(0)].copy()

In [13]:
# Drop rows whose NFL play type marks them as something other than a real play.
# Rows with a missing play type are retained.
non_play_types = [
    'COMMENT',
    'END_QUARTER',
    'END_GAME',
    'TIMEOUT',
    'PENALTY',
    'UNSPECIFIED',
]
df = df[~df['play_type_nfl'].isin(non_play_types)]

These rows are not valid game plays and are irrelevant to our model's training.

In [14]:
# Snapshot the filtered, feature-engineered frame under its own name.
df_cleaned = df.copy(deep=True)

# The original frame is kept in memory for now; uncomment to release it.
# del df

In [16]:
# Preview the first five rows of the cleaned dataframe, including the new columns.
df_cleaned.head(n=5)

Unnamed: 0,play_id,game_id,old_game_id,home_team,away_team,season_type,week,posteam,posteam_type,defteam,...,was_pressure,route,defense_man_zone_type,defense_coverage_type,home_pos,home_score_differential,winner,home_win,yardline_100_home,time_weight
0,35.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,ARI,...,,,,,1,,ARI,0,30.0,0.0
1,60.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,ARI,...,,,,,1,0.0,ARI,0,77.0,0.0
2,82.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,ARI,...,,,,,1,0.0,ARI,0,77.0,0.0
3,103.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,ARI,...,,,,,1,0.0,ARI,0,76.0,0.0
4,126.0,1999_01_ARI_PHI,1999091000.0,PHI,ARI,REG,1,PHI,home,ARI,...,,,,,1,0.0,ARI,0,81.0,0.0


In [15]:
# Write the cleaned dataframe to disk, keeping the row index as the first column.
df_cleaned.to_csv(path_or_buf='df_cleaned.csv', index=True)