In [1]:
!ls

archive.zip  sample_data


In [2]:
!unzip -o archive.zip

Archive:  archive.zip
  inflating: NFL Play by Play 2009-2016 (v3).csv  


In [3]:
!ls


 archive.zip  'NFL Play by Play 2009-2016 (v3).csv'   sample_data


In [4]:
import pandas as pd
import glob

csv_files = glob.glob("*.csv")
print("CSV files found:", csv_files)

df = pd.read_csv(csv_files[0], low_memory=False)
df.head()

CSV files found: ['NFL Play by Play 2009-2016 (v3).csv']


Unnamed: 0,Date,GameID,Drive,qtr,down,time,TimeUnder,TimeSecs,PlayTimeDiff,SideofField,...,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2009-09-10,2009091000,1,1,,15:00,15,3600.0,0.0,TEN,...,,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,,,2009
1,2009-09-10,2009091000,1,1,1.0,14:53,15,3593.0,7.0,PIT,...,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009
2,2009-09-10,2009091000,1,1,2.0,14:16,15,3556.0,37.0,PIT,...,,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,,,2009
3,2009-09-10,2009091000,1,1,3.0,13:35,14,3515.0,41.0,PIT,...,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009
4,2009-09-10,2009091000,1,1,4.0,13:27,14,3507.0,8.0,PIT,...,,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,,,2009


In [6]:
import pandas as pd

# Keep only the columns we need
tmp = df[["GameID", "HomeTeam", "AwayTeam", "Season", "PosTeamScore", "DefTeamScore"]].copy()

# Convert scores to numeric (sometimes they come in as strings)
tmp["PosTeamScore"] = pd.to_numeric(tmp["PosTeamScore"], errors="coerce")
tmp["DefTeamScore"] = pd.to_numeric(tmp["DefTeamScore"], errors="coerce")

# In play-by-play, the final score will be the max score reached in that game
final_scores = tmp.groupby("GameID")[["PosTeamScore", "DefTeamScore"]].max().reset_index()

# Get one row per game for teams/season
games_meta = tmp[["GameID", "HomeTeam", "AwayTeam", "Season"]].drop_duplicates(subset=["GameID"])

# Merge meta + final scores
games = games_meta.merge(final_scores, on="GameID", how="left")

games.head()


Unnamed: 0,GameID,HomeTeam,AwayTeam,Season,PosTeamScore,DefTeamScore
0,2009091000,PIT,TEN,2009,10.0,10.0
1,2009091304,CLE,MIN,2009,34.0,34.0
2,2009091307,NO,DET,2009,45.0,45.0
3,2009091308,TB,DAL,2009,34.0,34.0
4,2009091305,HOU,NYJ,2009,24.0,24.0


In [7]:
# Take the last play row per game (end-of-game snapshot)
last_play = df.sort_values(["GameID", "qtr", "TimeSecs"]).groupby("GameID").tail(1)[
    ["GameID", "posteam", "PosTeamScore", "DefTeamScore", "HomeTeam", "AwayTeam", "Season"]
].copy()

last_play["PosTeamScore"] = pd.to_numeric(last_play["PosTeamScore"], errors="coerce")
last_play["DefTeamScore"] = pd.to_numeric(last_play["DefTeamScore"], errors="coerce")

def compute_home_away_scores(row):
    # if the last possession team is HomeTeam:
    if row["posteam"] == row["HomeTeam"]:
        home = row["PosTeamScore"]
        away = row["DefTeamScore"]
    # if last possession team is AwayTeam:
    elif row["posteam"] == row["AwayTeam"]:
        away = row["PosTeamScore"]
        home = row["DefTeamScore"]
    else:
        # rare edge case (missing/blank posteam). fallback to NaN.
        home, away = (pd.NA, pd.NA)
    return pd.Series({"home_score": home, "away_score": away})

scores = last_play.apply(compute_home_away_scores, axis=1)

games = last_play[["GameID", "HomeTeam", "AwayTeam", "Season"]].reset_index(drop=True).join(scores)
games.head()


Unnamed: 0,GameID,HomeTeam,AwayTeam,Season,home_score,away_score
0,2009091000,PIT,TEN,2009,,
1,2009091300,ATL,MIA,2009,,
2,2009091301,BAL,KC,2009,,
3,2009091302,CAR,PHI,2009,,
4,2009091303,CIN,DEN,2009,,


In [8]:
games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)
games[["HomeTeam","AwayTeam","home_score","away_score","home_win"]].head()


Unnamed: 0,HomeTeam,AwayTeam,home_score,away_score,home_win
0,PIT,TEN,,,0
1,ATL,MIA,,,0
2,BAL,KC,,,0
3,CAR,PHI,,,0
4,CIN,DEN,,,0


In [9]:
games[["home_score","away_score"]].isna().mean()


Unnamed: 0,0
home_score,0.994141
away_score,0.994141


In [10]:
import pandas as pd

# Keep only scoring-related columns
score_df = df[
    ["GameID", "HomeTeam", "AwayTeam", "posteam", "PosTeamScore", "DefTeamScore", "Season"]
].copy()

score_df["PosTeamScore"] = pd.to_numeric(score_df["PosTeamScore"], errors="coerce")
score_df["DefTeamScore"] = pd.to_numeric(score_df["DefTeamScore"], errors="coerce")

# Scores when team is on offense
off_scores = score_df.groupby(["GameID", "posteam"])["PosTeamScore"].max().reset_index()
off_scores.columns = ["GameID", "Team", "Score"]

# Scores when team is on defense
def_scores = score_df.groupby(["GameID", "posteam"])["DefTeamScore"].max().reset_index()
def_scores.columns = ["GameID", "Opponent", "Score"]

team_scores = pd.concat([off_scores, def_scores], ignore_index=True)
team_scores = team_scores.dropna(subset=["Team"])
team_scores = team_scores.groupby(["GameID", "Team"])["Score"].max().reset_index()

team_scores.head()


Unnamed: 0,GameID,Team,Score
0,2009091000,PIT,10.0
1,2009091000,TEN,10.0
2,2009091300,ATL,19.0
3,2009091300,MIA,6.0
4,2009091301,BAL,37.0


In [11]:
games_meta = df[["GameID", "HomeTeam", "AwayTeam", "Season"]].drop_duplicates("GameID")

games = games_meta.merge(
    team_scores, left_on=["GameID", "HomeTeam"], right_on=["GameID", "Team"], how="left"
).rename(columns={"Score": "home_score"}).drop(columns="Team")

games = games.merge(
    team_scores, left_on=["GameID", "AwayTeam"], right_on=["GameID", "Team"], how="left"
).rename(columns={"Score": "away_score"}).drop(columns="Team")

games.head()


Unnamed: 0,GameID,HomeTeam,AwayTeam,Season,home_score,away_score
0,2009091000,PIT,TEN,2009,10.0,10.0
1,2009091304,CLE,MIN,2009,18.0,34.0
2,2009091307,NO,DET,2009,45.0,26.0
3,2009091308,TB,DAL,2009,20.0,34.0
4,2009091305,HOU,NYJ,2009,6.0,24.0


In [12]:
games[["home_score", "away_score"]].isna().mean()


Unnamed: 0,0
home_score,0.0
away_score,0.0


In [13]:
games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)
games[["HomeTeam", "AwayTeam", "home_score", "away_score", "home_win"]].head()


Unnamed: 0,HomeTeam,AwayTeam,home_score,away_score,home_win
0,PIT,TEN,10.0,10.0,0
1,CLE,MIN,18.0,34.0,0
2,NO,DET,45.0,26.0,1
3,TB,DAL,20.0,34.0,0
4,HOU,NYJ,6.0,24.0,0


In [14]:
games = games.sort_values(["Season", "GameID"]).reset_index(drop=True)


In [15]:
# Home team history
games["home_pts_for_hist"] = (
    games.groupby(["Season", "HomeTeam"])["home_score"]
    .transform(lambda s: s.shift(1).expanding().mean())
)

games["home_pts_against_hist"] = (
    games.groupby(["Season", "HomeTeam"])["away_score"]
    .transform(lambda s: s.shift(1).expanding().mean())
)

# Away team history
games["away_pts_for_hist"] = (
    games.groupby(["Season", "AwayTeam"])["away_score"]
    .transform(lambda s: s.shift(1).expanding().mean())
)

games["away_pts_against_hist"] = (
    games.groupby(["Season", "AwayTeam"])["home_score"]
    .transform(lambda s: s.shift(1).expanding().mean())
)


In [16]:
games["home_pts_for_hist"] = games["home_pts_for_hist"].fillna(
    games.groupby("Season")["home_score"].transform("mean")
)

games["home_pts_against_hist"] = games["home_pts_against_hist"].fillna(
    games.groupby("Season")["away_score"].transform("mean")
)

games["away_pts_for_hist"] = games["away_pts_for_hist"].fillna(
    games.groupby("Season")["away_score"].transform("mean")
)

games["away_pts_against_hist"] = games["away_pts_against_hist"].fillna(
    games.groupby("Season")["home_score"].transform("mean")
)

games[
    ["HomeTeam","AwayTeam","home_pts_for_hist","away_pts_for_hist"]
].head()


Unnamed: 0,HomeTeam,AwayTeam,home_pts_for_hist,away_pts_for_hist
0,PIT,TEN,21.765625,19.902344
1,ATL,MIA,21.765625,19.902344
2,BAL,KC,21.765625,19.902344
3,CAR,PHI,21.765625,19.902344
4,CIN,DEN,21.765625,19.902344


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

features = [
    "home_pts_for_hist",
    "home_pts_against_hist",
    "away_pts_for_hist",
    "away_pts_against_hist"
]

X = games[features]
y = games["home_win"]


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [19]:
model = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


In [20]:
preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.5560975609756098
              precision    recall  f1-score   support

           0       0.53      0.57      0.55       193
           1       0.59      0.54      0.56       217

    accuracy                           0.56       410
   macro avg       0.56      0.56      0.56       410
weighted avg       0.56      0.56      0.56       410



In [21]:
games["home_point_diff"] = games["home_pts_for_hist"] - games["home_pts_against_hist"]
games["away_point_diff"] = games["away_pts_for_hist"] - games["away_pts_against_hist"]


In [22]:
features = [
    "home_pts_for_hist",
    "home_pts_against_hist",
    "home_point_diff",
    "away_pts_for_hist",
    "away_pts_against_hist",
    "away_point_diff"
]


In [24]:
games["net_diff"] = games["pts_for_diff"] + games["pts_against_diff"]


In [25]:
features = [
    "home_pts_for_hist",
    "home_pts_against_hist",
    "away_pts_for_hist",
    "away_pts_against_hist",
    "pts_for_diff",
    "pts_against_diff",
    "net_diff"
]

X = games[features]
y = games["home_win"]


In [26]:
games["pts_for_diff"] = games["home_pts_for_hist"] - games["away_pts_for_hist"]
games["pts_against_diff"] = games["away_pts_against_hist"] - games["home_pts_against_hist"]
games["net_diff"] = games["pts_for_diff"] + games["pts_against_diff"]


In [27]:
features = [
    "home_pts_for_hist",
    "home_pts_against_hist",
    "away_pts_for_hist",
    "away_pts_against_hist",
    "pts_for_diff",
    "pts_against_diff",
    "net_diff"
]

X = games[features]
y = games["home_win"]


In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=8,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


In [30]:
from sklearn.metrics import accuracy_score, classification_report

preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.5585365853658537
              precision    recall  f1-score   support

           0       0.53      0.61      0.57       193
           1       0.60      0.51      0.55       217

    accuracy                           0.56       410
   macro avg       0.56      0.56      0.56       410
weighted avg       0.56      0.56      0.56       410



In [31]:
games = games.sort_values(["Season", "GameID"]).reset_index(drop=True)

games["home_pts_for_5"] = (
    games.groupby(["Season", "HomeTeam"])["home_score"]
    .transform(lambda s: s.shift(1).rolling(5, min_periods=1).mean())
)

games["home_pts_against_5"] = (
    games.groupby(["Season", "HomeTeam"])["away_score"]
    .transform(lambda s: s.shift(1).rolling(5, min_periods=1).mean())
)

games["away_pts_for_5"] = (
    games.groupby(["Season", "AwayTeam"])["away_score"]
    .transform(lambda s: s.shift(1).rolling(5, min_periods=1).mean())
)

games["away_pts_against_5"] = (
    games.groupby(["Season", "AwayTeam"])["home_score"]
    .transform(lambda s: s.shift(1).rolling(5, min_periods=1).mean())
)


In [32]:
games["pts_for_diff_5"] = games["home_pts_for_5"] - games["away_pts_for_5"]
games["pts_against_diff_5"] = games["away_pts_against_5"] - games["home_pts_against_5"]
games["net_diff_5"] = games["pts_for_diff_5"] + games["pts_against_diff_5"]


In [33]:
features = [
    "home_pts_for_5",
    "home_pts_against_5",
    "away_pts_for_5",
    "away_pts_against_5",
    "pts_for_diff_5",
    "pts_against_diff_5",
    "net_diff_5"
]

X = games[features]
y = games["home_win"]


In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [35]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


In [36]:
from sklearn.metrics import accuracy_score, classification_report

preds = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.5195121951219512
              precision    recall  f1-score   support

           0       0.49      0.53      0.51       193
           1       0.55      0.51      0.53       217

    accuracy                           0.52       410
   macro avg       0.52      0.52      0.52       410
weighted avg       0.52      0.52      0.52       410



In [37]:
features = [
    # season-long strength
    "home_pts_for_hist",
    "home_pts_against_hist",
    "away_pts_for_hist",
    "away_pts_against_hist",

    # recent form
    "home_pts_for_5",
    "home_pts_against_5",
    "away_pts_for_5",
    "away_pts_against_5",

    # matchup signals
    "pts_for_diff",
    "pts_against_diff",
    "net_diff",
    "net_diff_5"
]

X = games[features]
y = games["home_win"]


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [40]:
X.isna().mean().sort_values(ascending=False).head(20)


Unnamed: 0,0
net_diff_5,0.144531
home_pts_for_5,0.125488
home_pts_against_5,0.125488
away_pts_for_5,0.125
away_pts_against_5,0.125
away_pts_for_hist,0.0
home_pts_against_hist,0.0
home_pts_for_hist,0.0
away_pts_against_hist,0.0
pts_for_diff,0.0


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# (Rebuild X,y just in case)
features = [
    "home_pts_for_hist","home_pts_against_hist","away_pts_for_hist","away_pts_against_hist",
    "home_pts_for_5","home_pts_against_5","away_pts_for_5","away_pts_against_5",
    "pts_for_diff","pts_against_diff","net_diff","net_diff_5"
]
X = games[features]
y = games["home_win"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),  # fills NaNs using column median
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

pipe.fit(X_train, y_train)

preds = pipe.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.5536585365853659
              precision    recall  f1-score   support

           0       0.53      0.54      0.53       193
           1       0.58      0.56      0.57       217

    accuracy                           0.55       410
   macro avg       0.55      0.55      0.55       410
weighted avg       0.55      0.55      0.55       410



In [42]:
# Make sure games are in chronological-ish order
games = games.sort_values(["Season", "GameID"]).reset_index(drop=True)

# Home team's historical home win rate (before current game)
games["home_homewinrate_hist"] = (
    games.groupby(["Season", "HomeTeam"])["home_win"]
    .transform(lambda s: s.shift(1).expanding().mean())
)

# Away team's historical away win rate (before current game)
# (away win = home_win == 0)
games["away_awaywinrate_hist"] = (
    games.groupby(["Season", "AwayTeam"])["home_win"]
    .transform(lambda s: (1 - s).shift(1).expanding().mean())
)

# Fill early games with season averages (so no NaNs)
games["home_homewinrate_hist"] = games["home_homewinrate_hist"].fillna(
    games.groupby("Season")["home_win"].transform("mean")
)
games["away_awaywinrate_hist"] = games["away_awaywinrate_hist"].fillna(
    1 - games.groupby("Season")["home_win"].transform("mean")
)

games[["HomeTeam","AwayTeam","home_homewinrate_hist","away_awaywinrate_hist"]].head()


Unnamed: 0,HomeTeam,AwayTeam,home_homewinrate_hist,away_awaywinrate_hist
0,PIT,TEN,0.53125,0.46875
1,ATL,MIA,0.53125,0.46875
2,BAL,KC,0.53125,0.46875
3,CAR,PHI,0.53125,0.46875
4,CIN,DEN,0.53125,0.46875


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

features = [
    "home_pts_for_hist","home_pts_against_hist","away_pts_for_hist","away_pts_against_hist",
    "home_pts_for_5","home_pts_against_5","away_pts_for_5","away_pts_against_5",
    "pts_for_diff","pts_against_diff","net_diff","net_diff_5",
    "home_homewinrate_hist","away_awaywinrate_hist"
]

X = games[features]
y = games["home_win"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))


Accuracy: 0.5682926829268292
              precision    recall  f1-score   support

           0       0.54      0.57      0.55       193
           1       0.60      0.57      0.58       217

    accuracy                           0.57       410
   macro avg       0.57      0.57      0.57       410
weighted avg       0.57      0.57      0.57       410



In [44]:
from sklearn.metrics import roc_auc_score

probs = pipe.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, probs))


ROC-AUC: 0.5958191065160813
