# Saturday, October 25, 2025

### 1. IMPORT LIBRARIES

In [2]:
import nfl_data_py as nfl
import pandas as pd

### 2. LOAD THE DATASET
Load all REGULAR seasons only

In [3]:
seasons = [2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017,
           2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008,
           2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999]
regular = True

# load play-by-play
dataset = nfl.import_pbp_data(seasons, downcast=True, cache=False)

# keep regular season only
if regular and "season_type" in dataset.columns:
    dataset = dataset[dataset["season_type"] == "REG"]

2025 done.
2024 done.
2023 done.
2022 done.
2021 done.
2020 done.
2019 done.
2018 done.
2017 done.
2016 done.
2015 done.
2014 done.
2013 done.
2012 done.
2011 done.
2010 done.
2009 done.
2008 done.
2007 done.
2006 done.
2005 done.
2004 done.
2003 done.
2002 done.
2001 done.
2000 done.
1999 done.
Downcasting floats.


- Save the dataset for the first time
- All other times just load the dataset

In [None]:
dataset.to_csv('nfl_1999.csv')

  dataset = pd.read_csv('nfl_1999.csv')


### 3. CLEAN THE DATASET
We only want net EPA and net YPP

In [None]:
# keep only scrimmage plays that actually gain/lose yards: runs or passes
# (This implicitly excludes penalties-only, timeouts, spikes, kneels, punts, etc.)
play_mask = (dataset.get("rush", 0) == 1) | (dataset.get("pass", 0) == 1)
dataset = dataset[play_mask].copy()

# safety: make sure required columns exist
required_cols = ["posteam", "defteam", "epa", "yards_gained"]
missing = [c for c in required_cols if c not in dataset.columns]
if missing:
    raise ValueError(f"Missing columns in PBP: {missing}")

# OFFENSE metrics (by possession team)
# epa: expected points added - how good offense is
offense = (
    dataset.groupby("posteam")
       .agg(
           epa_off=("epa", "mean"),
           ypp_off=("yards_gained", "mean"),
           plays_off=("epa", "size"),
       )
       .rename_axis("team")
)

# DEFENSE metrics (what each defense allowed)
# epa_allowed: mean offensive EPA by opponents vs this defense - how weak defense is
# we'll keep it positive-as-bad (higher means defense allowed more).
defense = (
    dataset.groupby("defteam")
       .agg(
           epa_allowed=("epa", "mean"),
           ypp_allowed=("yards_gained", "mean"),
           plays_def=("epa", "size"),
       )
       .rename_axis("team")
)

# combine and compute NET metrics
team_eff = offense.join(defense, how="outer")  # merge on 'team'
team_eff["net_epa_per_play"] = team_eff["epa_off"] - team_eff["epa_allowed"] # off - def
team_eff["net_ypp"] = team_eff["ypp_off"] - team_eff["ypp_allowed"]

# clean display: sort by Net EPA/play (best → worst)
cols = [
    "epa_off","epa_allowed","net_epa_per_play",
    "ypp_off","ypp_allowed","net_ypp",
    "plays_off","plays_def"
]
team_eff_sorted = team_eff[cols].sort_values("net_epa_per_play", ascending=False)

team_eff_sorted.to_csv('NFL[1999]')

In [None]:
team1 = ['ARI','LAC','CHI','IND','SF','DEN','MIN','CAR','JAX','NO','KC','SEA']
team2 = ['DAL','TEN','CIN','PIT','NYG','HOU','DET','GB','LV','LA','BUF','WAS']
for i in range(len(team1)):
    if team_eff.loc[team1[i], 'net_epa_per_play'] > team_eff.loc[team2[i], 'net_epa_per_play']:
        print(team1[i])
    else:
        print(team2[i])

# DEN, LV, BUF

DAL
LAC
CIN
PIT
SF
DEN
MIN
GB
JAX
NO
KC
SEA


### 4. PREP FOR TRAINING
- The model should know EPA, YPP, home & away scores
- The home & away teams are only for our eyes, not the model

In [None]:
import pandas as pd

dataset = pd.read_csv('nfl[1999-2025].csv')

  dataset = pd.read_csv('nfl[1999-2025].csv')


In [None]:
# drop duplicates cuz we gangster
dataset = dataset[['home_team', 'away_team', 'game_seconds_remaining', 'total_home_epa', 'total_away_epa', 'total_home_score', 'total_away_score']].drop_duplicates()
nodup_dataset = dataset.drop(columns=['total_home_score', 'total_away_score'])
nodup_dataset = nodup_dataset.dropna() 

In [None]:
nodup_dataset['home_won'] = dataset['total_home_score'] > dataset['total_away_score']
nodup_dataset.head(1000)

Unnamed: 0,home_team,away_team,game_seconds_remaining,total_home_epa,total_away_epa,home_won
0,NO,ARI,3600.0,0.000000,0.000000,False
1,NO,ARI,3600.0,0.352700,-0.352700,False
2,NO,ARI,3596.0,0.542752,-0.542752,False
3,NO,ARI,3558.0,-0.774588,0.774588,False
4,NO,ARI,3520.0,0.919772,-0.919772,False
...,...,...,...,...,...,...
1004,GB,DET,265.0,22.255856,-22.255856,True
1005,GB,DET,262.0,21.675568,-21.675568,True
1006,GB,DET,262.0,22.181154,-22.181154,True
1007,GB,DET,256.0,23.298624,-23.298624,True


In [None]:
team_map = {
    'NO': 1,
    'BUF': 2,
    'JAX': 3,
    'CLE': 4,
    'PHI': 5,
    'GB': 6,
    'LA': 7,
    'LAC': 8,
    'NE': 9,
    'IND': 10,
    'CHI': 11,
    'WAS': 12,
    'NYJ': 13,
    'SEA': 14,
    'ATL': 15,
    'DEN': 16,
    'MIN': 17,
    'ARI': 18,
    'DET': 19,
    'BAL': 20,
    'CIN': 21,
    'LV': 22,
    'TEN': 23,
    'MIA': 24,
    'DAL': 25,
    'KC': 26,
    'PIT': 27,
    'HOU': 28,
    'SF': 29,
    'CAR': 30,
    'NYG': 31,
    'TB': 32
}

nodup_dataset['home_team'] = nodup_dataset['home_team'].map(team_map)
nodup_dataset['away_team'] = nodup_dataset['away_team'].map(team_map)
nodup_dataset.drop_duplicates()

Unnamed: 0,home_team,away_team,game_seconds_remaining,total_home_epa,total_away_epa,home_won
0,1,18,3600.0,0.000000,0.000000,False
1,1,18,3600.0,0.352700,-0.352700,False
2,1,18,3596.0,0.542752,-0.542752,False
3,1,18,3558.0,-0.774588,0.774588,False
4,1,18,3520.0,0.919772,-0.919772,False
...,...,...,...,...,...,...
1201877,27,23,47.0,-13.826335,13.826335,False
1201878,27,23,40.0,-14.254064,14.254064,False
1201879,27,23,34.0,-15.871923,15.871923,False
1201880,27,23,30.0,-14.322854,14.322854,False


In [None]:
import category_encoders as ce
import pandas as pd

# find columns that have words in them and not numbers
categorical_cols = nodup_dataset.select_dtypes(include=['object', 'category']).columns
# encode the selected columns
encoder = ce.OrdinalEncoder(cols=categorical_cols)
# transform the words in those columns from our dataset to the encoded numbers we got
nodup_dataset_encoded = encoder.fit_transform(nodup_dataset)
nodup_dataset_encoded

Unnamed: 0,home_team,away_team,game_seconds_remaining,total_home_epa,total_away_epa,home_won
0,1,18,3600.0,0.000000,0.000000,False
1,1,18,3600.0,0.352700,-0.352700,False
2,1,18,3596.0,0.542752,-0.542752,False
3,1,18,3558.0,-0.774588,0.774588,False
4,1,18,3520.0,0.919772,-0.919772,False
...,...,...,...,...,...,...
1201877,27,23,47.0,-13.826335,13.826335,False
1201878,27,23,40.0,-14.254064,14.254064,False
1201879,27,23,34.0,-15.871923,15.871923,False
1201880,27,23,30.0,-14.322854,14.322854,False


In [None]:
# set the training and testing variables
from sklearn.model_selection import train_test_split

X = nodup_dataset_encoded.drop(columns=['home_won'])
y = nodup_dataset_encoded['home_won']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% train / 20% test

### 5. AI Model Training


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

print("\nRandom Forest Classifier\n")
classifier = RandomForestClassifier(n_estimators=10, random_state=42)
classifier.fit(X_train, y_train)


Random Forest Classifier



0,1,2
,n_estimators,10
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
y_pred = classifier.predict(X_test)
print("\nAccuracy Score: ", accuracy_score(y_test, y_pred))

# n=10   ran=42   |  acc=93%
# n=100  ran= 42  |  acc=96%


Accuracy Score:  0.9243207867102405


In [None]:
import joblib

# save the model
joblib.dump(classifier, './V1_NFL[1999-2025].joblib')

['./NFL[1999-2025].joblib']

In [None]:
import joblib

# load the model
rf = joblib.load('./V1_NFL[1999-2025].joblib')

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# define number of k-folds
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# perform k-fold cross-validation
cross_val_results = cross_val_score(rf, X_train, y_train, cv=kf)

# evaluation metrics
print("Cross-Validation Results (Accuracy): ")
for i, result in enumerate(cross_val_results, 1):
    print(f" Fold {i}: {result * 100:.2f}%")

print(f"Mean Accuracy: {cross_val_results.mean() * 100:.2f}%")

Cross-Validation Results (Accuracy): 
 Fold 1: 92.15%
 Fold 2: 92.07%
 Fold 3: 92.07%
 Fold 4: 92.15%
 Fold 5: 92.27%
Mean Accuracy: 92.14%


In [None]:
new_data = {
    'home_team': 27, 'away_team': 10,
    'game_seconds_remaining': 3600.0,
    'total_home_epa': 0.07146893362022438, 'total_away_epa': 0.04494070986871855,
}

new_X = pd.DataFrame(data=new_data, index=[0])

categorical_cols = new_X.select_dtypes(include=['object', 'category']).columns
encoder = ce.OrdinalEncoder(cols=categorical_cols)
new_X = encoder.fit_transform(new_X)
new_X

Unnamed: 0,home_team,away_team,game_seconds_remaining,total_home_epa,total_away_epa
0,27,10,3600.0,0.071469,0.044941


In [None]:
# increment the time for each loop
# keep track of all the True and False outputs
# find the average of those outputs and that is our accurate prediction

preds = []
i = 0.0
while i <= 3596.0:
    preds.append(rf.predict(new_X))
    i += 1.0
    new_X.update(pd.DataFrame(['game_seconds_remaining'], [i]))

home_won = 0
home_lost = 0
for i in range(len(preds)):
    if preds[i] == True:
        home_won += 1
    else:
        home_lost += 1
if home_won > home_lost:
    print("Home will win!")
else:
    print("Home will lose...")

# WEEK 9            #1 VER. #2 VER. #3 VER. #4 VER. #5 VER.
# NFL GAMES         AWAY W  AVRG W  NO DATE TEAMMAP TOTALS
# ---------------   ------  ------  ------- ------- ------
# BAL @ MIA = BAL   *               !       !
# CHI @ CIN = CHI   *               !       !
# MIN @ DET = MIN   *               !       !       *
# CAR @ GB  = CAR   *               !       *       *
# LAC @ TEN = LAC   *               !       !       *
# ATL @ NE  =  NE   !       !       *       *       !
# SF  @ NYG = NYG   !       *               *       !
# IND @ PIT = PIT   !       *               *       
# DEN @ HOU = DEN   *       !               !
# JAX @ LV  = JAX   *       !       
# NO  @ LA  = LA    !       !       
# KC  @ BUF = BUF   !       !       
# SEA @ WAS = SEA   *       !       
# ARI @ DAL = ARI   *       *       
# ---------------   -       -       -
# 14 GAMES TOTAL!   9                  

Home will lose...
