# Build Dataset

> conda env create -f environment.yml

Games

In [64]:
# === Cell: ESPN NFL results -> results_games_{YEAR}.csv, results_teamweek_{YEAR}.csv ===
import json, time, datetime as dt, re
from pathlib import Path
import pandas as pd
import requests

YEAR  = 2025
WEEKS = list(range(1, 19))
DATA_DIR = Path("../data"); DATA_DIR.mkdir(parents=True, exist_ok=True)

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/120 Safari/537.36")
})

# Reuse the same city-style keys you've used everywhere else
TEAM_KEY = {
    # NFC
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco","49ers":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay","Buccaneers":"Tampa Bay","Bucs":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington",
    # AFC
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver",
    "Houston Texans":"Houston","Houston":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville","Jaguars":"Jacksonville","Jags":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas","Raiders":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami",
    "New England Patriots":"New England","New England":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets","Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee",
}

def norm(s: str) -> str:
    return re.sub(r"\s+"," ",str(s or "")).strip()

def to_key(s: str) -> str:
    s = norm(s).replace("N.Y.","NY").replace("L.A.","LA")
    return TEAM_KEY.get(s, s)

def fetch_week_json(year: int, week: int, cache: bool=True) -> dict:
    cache_path = DATA_DIR / f"espn_scoreboard_year{year}_week{week}.json"
    if cache and cache_path.exists():
        return json.loads(cache_path.read_text(encoding="utf-8", errors="ignore"))
    url = "https://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard"
    params = {"year": year, "week": week, "seasontype": 2}
    r = SESSION.get(url, params=params, timeout=25); r.raise_for_status()
    data = r.json()
    if cache:
        cache_path.write_text(json.dumps(data), encoding="utf-8")
    return data

def parse_week_results(js: dict, week: int) -> pd.DataFrame:
    rows = []
    for ev in js.get("events", []):
        comps = ev.get("competitions", [])
        if not comps: 
            continue
        comp = comps[0]
        status = comp.get("status", {}) or ev.get("status", {})
        st_type = (status.get("type") or {}).get("name")
        completed = (status.get("type") or {}).get("completed", False)

        # competitors: two entries, with 'homeAway'
        teams = comp.get("competitors", [])
        if len(teams) != 2:
            continue

        # Map home/away
        home = next((t for t in teams if t.get("homeAway")=="home"), None)
        away = next((t for t in teams if t.get("homeAway")=="away"), None)
        if not home or not away:
            continue

        def pick_name(tobj):
            # Try location (e.g., "Kansas City"), fallback to displayName
            loc = (tobj.get("team") or {}).get("location") or ""
            disp = (tobj.get("team") or {}).get("displayName") or ""
            name = loc or disp
            return to_key(name)

        home_key = pick_name(home)
        away_key = pick_name(away)

        def to_int(x):
            try: return int(x)
            except: return None

        home_score = to_int(home.get("score"))
        away_score = to_int(away.get("score"))

        winner_team_id = None
        for t in teams:
            if t.get("winner") is True:
                winner_team_id = (t.get("team") or {}).get("id")
                break

        winner_key = None
        if winner_team_id:
            for t in teams:
                if (t.get("team") or {}).get("id") == winner_team_id:
                    winner_key = pick_name(t)

        game_id = comp.get("id") or ev.get("id")
        start = ev.get("date") or comp.get("date")
        notes = ";".join([n.get("headline","") for n in comp.get("notes", [])]) if comp.get("notes") else ""

        rows.append({
            "week": week,
            "game_id": game_id,
            "start_time": start,
            "status_type": st_type,
            "completed": bool(completed),
            "home_team": home_key, "away_team": away_key,
            "home_score": home_score, "away_score": away_score,
            "winner": winner_key,
            "notes": notes,
        })
    return pd.DataFrame(rows)

# Build all weeks
weekly = []
for w in WEEKS:
    js = fetch_week_json(YEAR, w, cache=True)
    dfw = parse_week_results(js, w)
    if not dfw.empty:
        weekly.append(dfw)
    time.sleep(0.4)

if weekly:
    results_games = (pd.concat(weekly, ignore_index=True)
                       .drop_duplicates(subset=["week","home_team","away_team"])
                       .reset_index(drop=True))
else:
    results_games = pd.DataFrame(columns=[
        "week","game_id","start_time","status_type","completed",
        "home_team","away_team","home_score","away_score","winner","notes"
    ])

# Expand to team-week long
home_rows = (results_games
    .rename(columns={"home_team":"team","away_team":"opponent"})
    .assign(home_away="H",
            team_score=lambda d: d["home_score"],
            opp_score=lambda d: d["away_score"],
            win=lambda d: (d["completed"]) & (d["winner"]==d["team"]),
            loss=lambda d: (d["completed"]) & (d["winner"].notna()) & (d["winner"]!=d["team"]),
            margin=lambda d: (d["team_score"].fillna(0) - d["opp_score"].fillna(0)))
    [["week","team","opponent","home_away","team_score","opp_score","margin",
      "win","loss","completed","game_id","start_time","notes"]])

away_rows = (results_games
    .rename(columns={"away_team":"team","home_team":"opponent"})
    .assign(home_away="A",
            team_score=lambda d: d["away_score"],
            opp_score=lambda d: d["home_score"],
            win=lambda d: (d["completed"]) & (d["winner"]==d["team"]),
            loss=lambda d: (d["completed"]) & (d["winner"].notna()) & (d["winner"]!=d["team"]),
            margin=lambda d: (d["team_score"].fillna(0) - d["opp_score"].fillna(0)))
    [["week","team","opponent","home_away","team_score","opp_score","margin",
      "win","loss","completed","game_id","start_time","notes"]])

results_teamweek = pd.concat([home_rows, away_rows], ignore_index=True)\
                     .sort_values(["week","team"]).reset_index(drop=True)

# Save
p_games = DATA_DIR / f"results_games_{YEAR}.csv"
p_teamw = DATA_DIR / f"results_teamweek_{YEAR}.csv"
results_games.to_csv(p_games, index=False)
results_teamweek.to_csv(p_teamw, index=False)

print(f"Saved game results -> {p_games}  ({len(results_games)} games)")
print(f"Saved team-week results -> {p_teamw}  ({len(results_teamweek)} rows)")
display(results_games.head(10))
display(results_teamweek.head(10))

Saved game results -> ../data/results_games_2025.csv  (272 games)
Saved team-week results -> ../data/results_teamweek_2025.csv  (544 rows)


Unnamed: 0,week,game_id,start_time,status_type,completed,home_team,away_team,home_score,away_score,winner,notes
0,1,401772510,2025-09-05T00:20Z,STATUS_FINAL,True,Philadelphia,Dallas,24,20,Philadelphia,
1,1,401772714,2025-09-06T00:00Z,STATUS_FINAL,True,Los Angeles,Kansas City,27,21,Los Angeles,NFL São Paulo Game
2,1,401772830,2025-09-07T17:00Z,STATUS_FINAL,True,Atlanta,Tampa Bay,20,23,Tampa Bay,
3,1,401772829,2025-09-07T17:00Z,STATUS_FINAL,True,Cleveland,Cincinnati,16,17,Cincinnati,
4,1,401772719,2025-09-07T17:00Z,STATUS_FINAL,True,Indianapolis,Miami,33,8,Indianapolis,
5,1,401772720,2025-09-07T17:00Z,STATUS_FINAL,True,New England,Las Vegas,13,20,Las Vegas,
6,1,401772718,2025-09-07T17:00Z,STATUS_FINAL,True,New Orleans,Arizona,13,20,Arizona,
7,1,401772721,2025-09-07T17:00Z,STATUS_FINAL,True,New York,Pittsburgh,32,34,Pittsburgh,
8,1,401772827,2025-09-07T17:00Z,STATUS_FINAL,True,Washington,New York,21,6,Washington,
9,1,401772828,2025-09-07T17:00Z,STATUS_FINAL,True,Jacksonville,Carolina,26,10,Jacksonville,


Unnamed: 0,week,team,opponent,home_away,team_score,opp_score,margin,win,loss,completed,game_id,start_time,notes
0,1,Arizona,New Orleans,A,20,13,7,True,False,True,401772718,2025-09-07T17:00Z,
1,1,Atlanta,Tampa Bay,H,20,23,-3,False,True,True,401772830,2025-09-07T17:00Z,
2,1,Baltimore,Buffalo,A,40,41,-1,False,True,True,401772918,2025-09-08T00:20Z,
3,1,Buffalo,Baltimore,H,41,40,1,True,False,True,401772918,2025-09-08T00:20Z,
4,1,Carolina,Jacksonville,A,10,26,-16,False,True,True,401772828,2025-09-07T17:00Z,
5,1,Chicago,Minnesota,H,24,27,-3,False,True,True,401772810,2025-09-09T00:15Z,
6,1,Cincinnati,Cleveland,A,17,16,1,True,False,True,401772829,2025-09-07T17:00Z,
7,1,Cleveland,Cincinnati,H,16,17,-1,False,True,True,401772829,2025-09-07T17:00Z,
8,1,Dallas,Philadelphia,A,20,24,-4,False,True,True,401772510,2025-09-05T00:20Z,
9,1,Denver,Tennessee,H,20,12,8,True,False,True,401772832,2025-09-07T20:05Z,


In [29]:
# === ESPN NFL schedule scraper (games only) ===
import re, time
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup

YEAR  = 2025
WEEKS = list(range(1, 19))
SLEEP = 0.6
DATA_DIR = Path("../data"); DATA_DIR.mkdir(parents=True, exist_ok=True)

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
})

def normalize(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip()

def fetch_week_html(year: int, week: int, cache: bool = True) -> str:
    cache_path = DATA_DIR / f"espn_week{week}_year{year}.html"
    if cache and cache_path.exists():
        return cache_path.read_text(encoding="utf-8", errors="ignore")
    url = f"https://www.espn.com/nfl/schedule/_/week/{week}/year/{year}/seasontype/2"
    r = SESSION.get(url, timeout=20); r.raise_for_status()
    html = r.text
    if cache:
        cache_path.write_text(html, encoding="utf-8")
    return html

def _parse_table(df: pd.DataFrame, week: int) -> pd.DataFrame:
    d = df.copy()
    d.columns = [str(c).strip().lower() for c in d.columns]
    if "matchup" not in d.columns:
        return pd.DataFrame(columns=["week","away_team","home_team","game_label"])

    rows = []
    # Layout A: two columns
    if "matchup.1" in d.columns:
        tmp = d[["matchup","matchup.1"]].dropna()
        for _, r in tmp.iterrows():
            away = normalize(r["matchup"])
            home = normalize(r["matchup.1"].replace("@","").replace("vs.",""))
            if away and home:
                rows.append({"week": week,
                             "away_team": away,
                             "home_team": home,
                             "game_label": f"{away} @ {home}"})
    else:
        # Layout B: "Away @ Home"
        keep = d["matchup"].astype(str).str.contains(r"\s@\s", regex=True, na=False)
        tmp = d.loc[keep, ["matchup"]]
        for _, r in tmp.iterrows():
            m = re.findall(r"^(.*?)\s*@\s*(.*?)\s*$", str(r["matchup"]))
            if m:
                away, home = [normalize(x) for x in m[0]]
                rows.append({"week": week,
                             "away_team": away,
                             "home_team": home,
                             "game_label": f"{away} @ {home}"})
    return pd.DataFrame(rows)

def parse_week_games(html: str, week: int) -> pd.DataFrame:
    soup = BeautifulSoup(html, "lxml")
    blocks = []
    for tbl in soup.select("table"):
        try:
            df = pd.read_html(str(tbl))[0]
        except ValueError:
            continue
        g = _parse_table(df, week)
        if not g.empty:
            blocks.append(g)
    if not blocks:
        return pd.DataFrame(columns=["week","away_team","home_team","game_label"])
    return pd.concat(blocks, ignore_index=True).drop_duplicates().reset_index(drop=True)

def to_team_week_long(games_week: pd.DataFrame) -> pd.DataFrame:
    home_rows = games_week.rename(columns={"home_team":"team", "away_team":"opponent"}) \
                          .assign(home_away="H")[["week","team","opponent","home_away","game_label"]]
    away_rows = games_week.rename(columns={"away_team":"team", "home_team":"opponent"}) \
                          .assign(home_away="A")[["week","team","opponent","home_away","game_label"]]
    return pd.concat([home_rows, away_rows], ignore_index=True)

def build_games_only(year: int, weeks: list[int]):
    all_games = []
    for w in weeks:
        html = fetch_week_html(year, w, cache=True)
        gw = parse_week_games(html, w)
        if not gw.empty:
            all_games.append(gw)
        time.sleep(SLEEP)
    if not all_games:
        return pd.DataFrame(), pd.DataFrame()
    games_df = pd.concat(all_games, ignore_index=True).drop_duplicates().reset_index(drop=True)
    teamweek_df = to_team_week_long(games_df).sort_values(["week","team"]).reset_index(drop=True)
    return games_df, teamweek_df

# --- run ---
games_df, teamweek_df = build_games_only(YEAR, WEEKS)

print(f"Games: {len(games_df)}  (expected ~272)")
print(f"Team-week rows: {len(teamweek_df)}  (expected ~544)")
display(games_df.head(15))
display(teamweek_df.head(15))

games_df.to_csv(DATA_DIR / f"games_{YEAR}.csv", index=False)
teamweek_df.to_csv(DATA_DIR / f"schedule_{YEAR}.csv", index=False)


  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.

Games: 272  (expected ~272)
Team-week rows: 544  (expected ~544)


Unnamed: 0,week,away_team,home_team,game_label
0,1,Dallas,Philadelphia,Dallas @ Philadelphia
1,1,Kansas City,Los Angeles,Kansas City @ Los Angeles
2,1,Tampa Bay,Atlanta,Tampa Bay @ Atlanta
3,1,Cincinnati,Cleveland,Cincinnati @ Cleveland
4,1,Miami,Indianapolis,Miami @ Indianapolis
5,1,Las Vegas,New England,Las Vegas @ New England
6,1,Arizona,New Orleans,Arizona @ New Orleans
7,1,Pittsburgh,New York,Pittsburgh @ New York
8,1,New York,Washington,New York @ Washington
9,1,Carolina,Jacksonville,Carolina @ Jacksonville


Unnamed: 0,week,team,opponent,home_away,game_label
0,1,Arizona,New Orleans,A,Arizona @ New Orleans
1,1,Atlanta,Tampa Bay,H,Tampa Bay @ Atlanta
2,1,Baltimore,Buffalo,A,Baltimore @ Buffalo
3,1,Buffalo,Baltimore,H,Baltimore @ Buffalo
4,1,Carolina,Jacksonville,A,Carolina @ Jacksonville
5,1,Chicago,Minnesota,H,Minnesota @ Chicago
6,1,Cincinnati,Cleveland,A,Cincinnati @ Cleveland
7,1,Cleveland,Cincinnati,H,Cincinnati @ Cleveland
8,1,Dallas,Philadelphia,A,Dallas @ Philadelphia
9,1,Denver,Tennessee,H,Tennessee @ Denver


### Odds

In [52]:
# === Cell 1: Fetch moneylines via The Odds API and save as odds_long_{YEAR}.csv ===
import os, re, datetime as dt
from pathlib import Path
import pandas as pd
import requests
import numpy as np

YEAR = 2025
DATA_DIR = Path("../data"); DATA_DIR.mkdir(parents=True, exist_ok=True)

# Use your key directly or move it to an env var later
API_KEY = os.getenv("THE_ODDS_API_KEY") or "a25d13b6734a800828e661ebb91a17ca"

def clean(s: str) -> str:
    return re.sub(r"\s+", " ", str(s or "")).strip()

# map book/team strings -> ESPN city-style keys (extend if you see mismatches)
TEAM_KEY = {
    # NFC
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants","Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington",
    # AFC
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver",
    "Houston Texans":"Houston","Houston":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas","Raiders":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami",
    "New England Patriots":"New England","New England":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets","Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee",
}
def to_key(x: str) -> str:
    x = clean(x).replace("N.Y.","NY").replace("L.A.","LA")
    return TEAM_KEY.get(x, x)

def nfl_week_window(year: int, week: int) -> tuple[dt.datetime, dt.datetime]:
    base = dt.datetime(year, 9, 1, tzinfo=dt.timezone.utc)
    while base.weekday() != 3:  # Thu
        base += dt.timedelta(days=1)
    start = base + dt.timedelta(days=(week-1)*7 - 1)  # Wed before TNF
    end   = start + dt.timedelta(days=8)
    return start, end

def assign_week_from_time(t_iso: str, year: int) -> int | None:
    try:
        t = dt.datetime.fromisoformat(t_iso.replace("Z","+00:00"))
        if t.tzinfo is None: t = t.replace(tzinfo=dt.timezone.utc)
    except Exception:
        return None
    for w in range(1, 19):
        s, e = nfl_week_window(year, w)
        if s <= t <= e:
            return w
    return None

# fetch odds
url = "https://api.the-odds-api.com/v4/sports/americanfootball_nfl/odds"
params = {"apiKey": API_KEY, "regions": "us", "markets": "h2h", "oddsFormat": "american"}
r = requests.get(url, params=params, timeout=25); r.raise_for_status()
data = r.json()

rows = []
for ev in data:
    home = to_key(ev.get("home_team","")); away = to_key(ev.get("away_team",""))
    commence = ev.get("commence_time")
    week = assign_week_from_time(commence, YEAR)
    if not week: 
        continue

    # prefer DraftKings if available, else first book
    books = ev.get("bookmakers", [])
    book_by_key = {bk.get("key"): bk for bk in books}
    bk = book_by_key.get("draftkings") or (books[0] if books else None)
    if not bk: 
        continue
    h2h = next((m for m in bk.get("markets", []) if m.get("key")=="h2h"), None)
    if not h2h or len(h2h.get("outcomes", [])) != 2:
        continue

    # map outcomes by name
    ml_map = { to_key(o.get("name","")): o.get("price") for o in h2h["outcomes"] }
    home_ml = ml_map.get(home); away_ml = ml_map.get(away)

    # fallback if the names didn't match ordering
    if home_ml is None or away_ml is None:
        prices = [o.get("price") for o in h2h["outcomes"]]
        if len(prices) == 2:
            away_ml, home_ml = prices[0], prices[1]

    rows += [
        {"week": week, "team": home, "opponent": away, "home_away": "H", "ml": home_ml,
         "book": bk.get("key"), "commence_time": commence},
        {"week": week, "team": away, "opponent": home, "home_away": "A", "ml": away_ml,
         "book": bk.get("key"), "commence_time": commence},
    ]

odds_long = pd.DataFrame(rows)
out_path = DATA_DIR / f"odds_long_{YEAR}.csv"
odds_long.to_csv(out_path, index=False)
print(f"Saved odds -> {out_path}  ({len(odds_long)} team-week rows)")
display(odds_long.head(12))

Saved odds -> ../data/odds_long_2025.csv  (58 team-week rows)


Unnamed: 0,week,team,opponent,home_away,ml,book,commence_time
0,4,Pittsburgh,Minnesota,H,120,draftkings,2025-09-28T13:31:00Z
1,4,Minnesota,Pittsburgh,A,-142,draftkings,2025-09-28T13:31:00Z
2,4,Detroit,Cleveland,H,-535,draftkings,2025-09-28T17:00:00Z
3,4,Cleveland,Detroit,A,400,draftkings,2025-09-28T17:00:00Z
4,4,NY Giants,LA Chargers,H,220,draftkings,2025-09-28T17:00:00Z
5,4,LA Chargers,NY Giants,A,-270,draftkings,2025-09-28T17:00:00Z
6,4,Atlanta,Washington,H,105,draftkings,2025-09-28T17:01:00Z
7,4,Washington,Atlanta,A,-125,draftkings,2025-09-28T17:01:00Z
8,4,Buffalo,New Orleans,H,-1450,draftkings,2025-09-28T17:01:00Z
9,4,New Orleans,Buffalo,A,850,draftkings,2025-09-28T17:01:00Z


### nans

In [53]:
# total NaNs in the whole DF
odds_long.isna().sum().sum()

# NaNs per column
odds_long.isna().sum()

# percentage of NaNs per column
odds_long.isna().mean() * 100

week             0.0
team             0.0
opponent         0.0
home_away        0.0
ml               0.0
book             0.0
commence_time    0.0
dtype: float64

### Rank

###### CBS

In [63]:
# === Robust CBS Power Rankings -> cbs_rank_{YEAR}.csv ===
import re, sys
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path

YEAR = 2025
DATA = Path("../data"); DATA.mkdir(parents=True, exist_ok=True)
CBS_URL  = "https://www.cbssports.com/nfl/news/nfl-week-4-power-rankings-buccaneers-on-the-move/"
CBS_HTML = DATA / "cbs_power.html"   # optional local copy
OUT_PATH = DATA / f"cbs_rank_{YEAR}.csv"

print(f"[CBS] DATA dir: {DATA.resolve()}")
print(f"[CBS] Will write: {OUT_PATH.resolve()}")

TEAM_KEY = {
    # NFC
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota","Vikings":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans","Saints":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants","Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia","Eagles":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco","49ers":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle","Seahawks":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay","Buccaneers":"Tampa Bay","Bucs":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington","Commanders":"Washington",
    # AFC
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore","Ravens":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo","Bills":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati","Bengals":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland","Browns":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver","Broncos":"Denver",
    "Houston Texans":"Houston","Houston":"Houston","Texans":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis","Colts":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville","Jaguars":"Jacksonville","Jags":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City","Chiefs":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas","Raiders":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami","Dolphins":"Miami",
    "New England Patriots":"New England","New England":"New England","Patriots":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets","Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh","Steelers":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee","Titans":"Tennessee",
}
norm = lambda s: re.sub(r"\s+"," ",str(s)).strip()
to_key = lambda s: TEAM_KEY.get(norm(s), norm(s))

# ---------- 1) Load HTML with a session (less bot-blocking) ----------
if CBS_HTML.exists() and CBS_HTML.stat().st_size > 0:
    html = CBS_HTML.read_text(encoding="utf-8", errors="ignore")
    print(f"[CBS] Using local HTML: {CBS_HTML.resolve()}")
else:
    sess = requests.Session()
    sess.headers.update({
        "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120 Safari/537.36"),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.google.com/",
    })
    r = sess.get(CBS_URL, timeout=25)
    r.raise_for_status()
    html = r.text
    # Optional: cache once to stabilize layout across runs
    try:
        CBS_HTML.write_text(html, encoding="utf-8")
        print(f"[CBS] Cached HTML -> {CBS_HTML.resolve()}")
    except Exception as e:
        print(f"[CBS] Warning: could not cache HTML: {e}", file=sys.stderr)

soup = BeautifulSoup(html, "lxml")

# ---------- 2) Try multiple parse strategies ----------
rows = []

# Strategy A: parse explicit tables with Rank + Team columns
for tbl in soup.select("table"):
    try:
        df_tbl = pd.read_html(str(tbl))[0]
    except ValueError:
        continue
    cols = [str(c).strip().lower() for c in df_tbl.columns]
    if any(c in ("rk","rank","#","ranking") for c in cols) and any("team" in c for c in cols):
        df_tbl.columns = [str(c).strip() for c in df_tbl.columns]
        rk_col = next(c for c in df_tbl.columns if c.lower() in ("rk","rank","#","ranking"))
        tm_col = next(c for c in df_tbl.columns if "team" in c.lower())
        for _, r_ in df_tbl.iterrows():
            rk = pd.to_numeric(r_.get(rk_col), errors="coerce")
            tm = norm(r_.get(tm_col))
            if pd.notna(rk) and tm:
                rows.append((int(rk), tm))
        if len(rows) >= 20:
            break

# Strategy B: scan short text nodes like "1 Team", "#1 Team", "1. Team"
def harvest_from_text_nodes(soup):
    out = []
    # Short text nodes first to avoid paragraph noise
    for node in soup.find_all(string=True):
        txt = norm(node)
        if not txt or len(txt) > 100:
            continue
        m = re.match(r"^#?\s*(\d{1,2})[.)]?\s+([A-Za-z .'-]+)$", txt)
        if m:
            out.append((int(m.group(1)), m.group(2)))
    return out

if len(rows) < 20:
    rows += harvest_from_text_nodes(soup)

# Strategy C: scan headings/strong/b for "1 Team" patterns
if len(rows) < 20:
    for tag in soup.find_all(re.compile(r"h[1-6]|strong|b|p|span|div")):
        txt = norm(tag.get_text(" "))
        m = re.match(r"^#?\s*(\d{1,2})[.)]?\s+([A-Za-z .'-]+)$", txt)
        if m:
            rows.append((int(m.group(1)), m.group(2)))

# ---------- 3) Clean & normalize ----------
if not rows:
    print("[CBS] ERROR: Parser found 0 rows. Save the page to ../data/cbs_power.html and rerun.", file=sys.stderr)
    # Still write an empty CSV so the pipeline doesn't break
    empty = pd.DataFrame(columns=["team_key","cbs_rank","team_raw"])
    empty.to_csv(OUT_PATH, index=False)
    print(f"[CBS] Wrote EMPTY CSV -> {OUT_PATH.resolve()}  (0 teams)")
else:
    cbs = (pd.DataFrame(rows, columns=["cbs_rank","team_raw"])
             .assign(cbs_rank=lambda d: pd.to_numeric(d["cbs_rank"], errors="coerce"))
             .dropna(subset=["cbs_rank"]))
    # strip junk and map keys
    cbs["team_raw"] = cbs["team_raw"].str.replace(r"\b\d+-\d+(?:-\d+)?\b", "", regex=True).str.strip()
    cbs["team_key"] = cbs["team_raw"].map(to_key)

    # keep one row per team (best rank) and filter to known NFL teams
    cbs = (cbs.sort_values("cbs_rank")
              .drop_duplicates("team_key", keep="first"))
    cbs = cbs[cbs["team_key"].isin(TEAM_KEY.values())]
    cbs["cbs_rank"] = cbs["cbs_rank"].astype(int)
    cbs = cbs[["team_key","cbs_rank","team_raw"]].sort_values("cbs_rank")

    # Sanity log
    n = len(cbs)
    if n < 28:
        print(f"[CBS] WARNING: Only {n} mapped teams parsed. Layout may have changed.", file=sys.stderr)
    elif n > 32:
        print(f"[CBS] WARNING: Parsed {n} teams (expected ~32). Duplicate/noise likely.", file=sys.stderr)

    # ---------- 4) Save (guaranteed) ----------
    cbs.to_csv(OUT_PATH, index=False)
    print(f"[CBS] Saved -> {OUT_PATH.resolve()}  ({n} teams)")
    display(cbs.head(12))

[CBS] DATA dir: /Users/santiagovillasenor/Library/CloudStorage/Dropbox-HeuristicsFinansoft/Jaime Villasenor/Personal/Santi/ITAM/9_Semestre/survivor/data
[CBS] Will write: /Users/santiagovillasenor/Library/CloudStorage/Dropbox-HeuristicsFinansoft/Jaime Villasenor/Personal/Santi/ITAM/9_Semestre/survivor/data/cbs_rank_2025.csv
[CBS] Cached HTML -> /Users/santiagovillasenor/Library/CloudStorage/Dropbox-HeuristicsFinansoft/Jaime Villasenor/Personal/Santi/ITAM/9_Semestre/survivor/data/cbs_power.html
[CBS] Saved -> /Users/santiagovillasenor/Library/CloudStorage/Dropbox-HeuristicsFinansoft/Jaime Villasenor/Personal/Santi/ITAM/9_Semestre/survivor/data/cbs_rank_2025.csv  (25 teams)


  df_tbl = pd.read_html(str(tbl))[0]


Unnamed: 0,team_key,cbs_rank,team_raw
0,Philadelphia,1,Eagles
1,Buffalo,2,Bills
2,Tampa Bay,3,Buccaneers
3,LA Chargers,4,Chargers
6,Washington,7,Commanders
7,San Francisco,8,49ers
8,Indianapolis,9,Colts
9,LA Rams,10,Rams
10,Baltimore,11,Ravens
11,Kansas City,12,Chiefs


In [60]:
# total NaNs in the whole DF
cbs.isna().sum().sum()

# NaNs per column
cbs.isna().sum()

# percentage of NaNs per column
cbs.isna().mean() * 100

team_key    0.0
cbs_rank    0.0
dtype: float64

###### FOX

In [55]:
# === FOX Sports Power Rankings -> fox_rank_{YEAR}.csv ===
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path

YEAR = 2025
DATA = Path("../data"); DATA.mkdir(parents=True, exist_ok=True)

FOX_URL  = "https://www.foxsports.com/stories/nfl/2025-nfl-power-rankings-week-4-how-many-teams-actually-great"
FOX_HTML = DATA / "fox_power.html"            # optional: save page once and it will use this
OUT_PATH = DATA / f"fox_rank_{YEAR}.csv"

TEAM_KEY = {
    # NFC
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants","Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia","Eagles":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco","49ers":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle","Seahawks":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay","Buccaneers":"Tampa Bay","Bucs":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington","Commanders":"Washington",
    # AFC
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore","Ravens":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo","Bills":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati","Bengals":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland","Browns":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver","Broncos":"Denver",
    "Houston Texans":"Houston","Houston":"Houston","Texans":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis","Colts":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville","Jaguars":"Jacksonville","Jags":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City","Chiefs":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas","Raiders":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami","Dolphins":"Miami",
    "New England Patriots":"New England","New England":"New England","Patriots":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets","Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh","Steelers":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee","Titans":"Tennessee",
}
norm = lambda s: re.sub(r"\s+"," ",str(s)).strip()
to_key = lambda s: TEAM_KEY.get(norm(s), norm(s))

# --- load HTML (prefer local saved page if present) ---
if FOX_HTML.exists():
    html = FOX_HTML.read_text(encoding="utf-8", errors="ignore")
else:
    r = requests.get(FOX_URL, timeout=25)
    r.raise_for_status()
    html = r.text

soup = BeautifulSoup(html, "lxml")

# --- parse ranks ---
rows = []

# Collect short text nodes that start with "#<rank>"
for node in soup.find_all(string=True):
    txt = norm(node)
    if not txt or len(txt) > 100:  # skip long blobs
        continue
    if re.match(r"^#\s*\d{1,2}\b", txt):
        m = re.match(r"^#\s*(\d{1,2})\s+([A-Za-z .'-]+)", txt)
        if m:
            rows.append((int(m.group(1)), m.group(2)))

# Fallback: headings like "1. Team" or "1 Team"
if len(rows) < 20:
    for tag in soup.find_all(re.compile("h[1-6]|strong|b|p|span|div")):
        txt = norm(tag.get_text(" "))
        m = re.match(r"^#?\s*(\d{1,2})[.)]?\s+([A-Za-z .'-]+)$", txt)
        if m:
            rows.append((int(m.group(1)), m.group(2)))

if not rows:
    raise RuntimeError("FOX parse produced 0 rows. If needed, save the page to ../data/fox_power.html and rerun.")

# --- clean & save ---
fox = pd.DataFrame(rows, columns=["fox_rank","team_raw"]).dropna()
fox["fox_rank"] = pd.to_numeric(fox["fox_rank"], errors="coerce")
fox = fox.dropna(subset=["fox_rank"])
fox["fox_rank"] = fox["fox_rank"].astype(int)

fox["team_key"] = fox["team_raw"].map(to_key)
fox = fox.sort_values("fox_rank").drop_duplicates("team_key", keep="first")

# keep only mapped NFL teams
fox = fox[fox["team_key"].isin(TEAM_KEY.values())]
fox = fox[["team_key","fox_rank","team_raw"]].sort_values("fox_rank")

fox.to_csv(OUT_PATH, index=False)
print(f"Saved FOX ranks -> {OUT_PATH}  ({len(fox)} teams)")
display(fox.head(12))

Saved FOX ranks -> ../data/fox_rank_2025.csv  (32 teams)


Unnamed: 0,team_key,fox_rank,team_raw
0,Philadelphia,1,Philadelphia Eagles
1,Buffalo,2,Buffalo Bills
2,LA Chargers,3,Los Angeles Chargers
3,Detroit,4,Detroit Lions
4,Kansas City,5,Kansas City Chiefs
5,Tampa Bay,6,Tampa Bay Buccaneers
6,Baltimore,7,Baltimore Ravens
7,Green Bay,8,Green Bay Packers
8,Washington,9,Washington Commanders
9,LA Rams,10,Los Angeles Rams


In [61]:
# total NaNs in the whole DF
fox.isna().sum().sum()

# NaNs per column
fox.isna().sum()

# percentage of NaNs per column
fox.isna().mean() * 100


team_key    0.0
fox_rank    0.0
dtype: float64

###### NFL 

In [56]:
# === NFL.com Power Rankings -> nfl_rank_{YEAR}.csv (standalone) ===
import re
import pandas as pd# === NFL.com Power Rankings -> nfl_rank_{YEAR}.csv ===
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path

YEAR = 2025
DATA = Path("../data"); DATA.mkdir(parents=True, exist_ok=True)

NFL_URL  = "https://www.nfl.com/news/nfl-power-rankings-week-4-2025-nfl-season"
NFL_HTML = DATA / "nfl_power.html"              # optional: save page once and it will use this
OUT_PATH = DATA / f"nfl_rank_{YEAR}.csv"

TEAM_KEY = {
    # NFC
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco","49ers":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay","Buccaneers":"Tampa Bay","Bucs":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington",
    # AFC
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver",
    "Houston Texans":"Houston","Houston":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville","Jaguars":"Jacksonville","Jags":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami",
    "New England Patriots":"New England","New England":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee",
}
norm = lambda s: re.sub(r"\s+"," ",str(s)).strip()
to_key = lambda s: TEAM_KEY.get(norm(s), norm(s))

# --- load HTML (prefer local saved page if present) ---
if NFL_HTML.exists():
    html = NFL_HTML.read_text(encoding="utf-8", errors="ignore")
else:
    sess = requests.Session()
    sess.headers.update({
        "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120 Safari/537.36"),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.google.com/",
    })
    r = sess.get(NFL_URL, timeout=25)
    r.raise_for_status()
    html = r.text

soup = BeautifulSoup(html, "lxml")

# --- parse ranks ---
# NFL.com card layout often has "Rank" then the number, then the Team name a few nodes later.
lines = [norm(x) for x in soup.get_text("\n").splitlines()]
lines = [ln for ln in lines if ln]

rows = []
i = 0
while i < len(lines):
    if lines[i].lower() == "rank":
        # find next pure rank number
        j = i + 1
        while j < len(lines) and not re.match(r"^\d{1,2}$", lines[j]):
            j += 1
        if j < len(lines):
            rk = int(lines[j])
            # next plausible team name (skip records like 3-0)
            k = j + 1
            team = None
            while k < len(lines):
                cand = lines[k]
                if re.match(r"^\d{1,2}$", cand):
                    k += 1; continue
                if re.search(r"\b\d+-\d+(?:-\d+)?\b", cand):
                    k += 1; continue
                team = cand
                break
            if team:
                team = re.sub(r"\s*\(.*?\)\s*$", "", team)
                rows.append((rk, team))
                i = k
                continue
    i += 1

# Fallback: also match lines like "1. Team" or "#1 Team"
if len(rows) < 20:
    for ln in lines:
        m = re.match(r"^#?\s*(\d{1,2})[.)]?\s+([A-Za-z .'-]+)$", ln)
        if m:
            rows.append((int(m.group(1)), m.group(2)))

if not rows:
    raise RuntimeError("NFL.com parser found 0 rows. If needed, save the page to ../data/nfl_power.html and rerun.")

# --- clean & save ---
df = pd.DataFrame(rows, columns=["nfl_rank","team_raw"]).dropna()
df["nfl_rank"] = pd.to_numeric(df["nfl_rank"], errors="coerce")
df = df.dropna(subset=["nfl_rank"])
df["nfl_rank"] = df["nfl_rank"].astype(int)

# Remove record fragments & normalize names
df["team_raw"] = df["team_raw"].str.replace(r"\b\d+-\d+(?:-\d+)?\b", "", regex=True).str.strip()
df["team_key"] = df["team_raw"].map(to_key)

# One row per team (lowest rank wins), only mapped NFL clubs
df = df.sort_values("nfl_rank").drop_duplicates("team_key", keep="first")
df = df[df["team_key"].isin(TEAM_KEY.values())]
df = df[["team_key","nfl_rank","team_raw"]].sort_values("nfl_rank")

df.to_csv(OUT_PATH, index=False)
print(f"Saved NFL.com ranks -> {OUT_PATH}  ({len(df)} teams)")
display(df.head(12))
import requests
from bs4 import BeautifulSoup
from pathlib import Path

YEAR = 2025
DATA = Path("../data"); DATA.mkdir(parents=True, exist_ok=True)

NFL_URL  = "https://www.nfl.com/news/nfl-power-rankings-week-4-2025-nfl-season"
NFL_HTML = DATA / "nfl_power.html"   # optional: save page once to avoid layout changes/anti-bot
OUT_PATH = DATA / f"nfl_rank_{YEAR}.csv"

# Map names to your schedule's city-style keys (same mapping you’ve used)
TEAM_KEY = {
    "Arizona Cardinals":"Arizona","Arizona":"Arizona",
    "Atlanta Falcons":"Atlanta","Atlanta":"Atlanta",
    "Carolina Panthers":"Carolina","Carolina":"Carolina",
    "Chicago Bears":"Chicago","Chicago":"Chicago",
    "Dallas Cowboys":"Dallas","Dallas":"Dallas",
    "Detroit Lions":"Detroit","Detroit":"Detroit",
    "Green Bay Packers":"Green Bay","Green Bay":"Green Bay",
    "Los Angeles Rams":"LA Rams","L.A. Rams":"LA Rams","LA Rams":"LA Rams","Rams":"LA Rams",
    "Minnesota Vikings":"Minnesota","Minnesota":"Minnesota",
    "New Orleans Saints":"New Orleans","New Orleans":"New Orleans",
    "New York Giants":"NY Giants","NY Giants":"NY Giants",
    "Philadelphia Eagles":"Philadelphia","Philadelphia":"Philadelphia",
    "San Francisco 49ers":"San Francisco","San Francisco":"San Francisco","49ers":"San Francisco",
    "Seattle Seahawks":"Seattle","Seattle":"Seattle",
    "Tampa Bay Buccaneers":"Tampa Bay","Tampa Bay":"Tampa Bay","Buccaneers":"Tampa Bay","Bucs":"Tampa Bay",
    "Washington Commanders":"Washington","Washington":"Washington",
    "Baltimore Ravens":"Baltimore","Baltimore":"Baltimore",
    "Buffalo Bills":"Buffalo","Buffalo":"Buffalo",
    "Cincinnati Bengals":"Cincinnati","Cincinnati":"Cincinnati",
    "Cleveland Browns":"Cleveland","Cleveland":"Cleveland",
    "Denver Broncos":"Denver","Denver":"Denver",
    "Houston Texans":"Houston","Houston":"Houston",
    "Indianapolis Colts":"Indianapolis","Indianapolis":"Indianapolis",
    "Jacksonville Jaguars":"Jacksonville","Jacksonville":"Jacksonville","Jaguars":"Jacksonville","Jags":"Jacksonville",
    "Kansas City Chiefs":"Kansas City","Kansas City":"Kansas City",
    "Las Vegas Raiders":"Las Vegas","Las Vegas":"Las Vegas",
    "Los Angeles Chargers":"LA Chargers","L.A. Chargers":"LA Chargers","LA Chargers":"LA Chargers","Chargers":"LA Chargers",
    "Miami Dolphins":"Miami","Miami":"Miami",
    "New England Patriots":"New England","New England":"New England",
    "New York Jets":"NY Jets","NY Jets":"NY Jets",
    "Pittsburgh Steelers":"Pittsburgh","Pittsburgh":"Pittsburgh",
    "Tennessee Titans":"Tennessee","Tennessee":"Tennessee",
}
norm = lambda s: re.sub(r"\s+"," ",str(s)).strip()
to_key = lambda s: TEAM_KEY.get(norm(s), norm(s))

# 1) Load HTML (prefer local saved copy if present)
if NFL_HTML.exists():
    html = NFL_HTML.read_text(encoding="utf-8", errors="ignore")
else:
    sess = requests.Session()
    sess.headers.update({
        "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120 Safari/537.36"),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "https://www.google.com/",
    })
    r = sess.get(NFL_URL, timeout=25)
    r.raise_for_status()
    html = r.text

soup = BeautifulSoup(html, "lxml")

# 2) Parse: NFL.com cards usually show blocks like:
#    "Rank" / "1" / "<Team Name>" / "<record>"
# We'll scan text lines and use a sliding window: when we see "Rank", next non-empty is number, then team.
lines = [norm(x) for x in soup.get_text("\n").splitlines()]
lines = [ln for ln in lines if ln]  # drop empties

rows = []
i = 0
while i < len(lines):
    if lines[i].lower() == "rank":
        # find next numeric line
        j = i + 1
        while j < len(lines) and not re.match(r"^\d{1,2}$", lines[j]):  # pure rank number
            j += 1
        if j < len(lines):
            rk = int(lines[j])
            # next non-empty, non-number line should be team
            k = j + 1
            # skip record lines like "3-0" or "2-1"
            team = None
            while k < len(lines):
                cand = lines[k]
                if re.match(r"^\d{1,2}$", cand):  # another number; unlikely but skip
                    k += 1; continue
                if re.search(r"\b\d+-\d+(?:-\d+)?\b", cand):  # records like 3-0 or 10-6-1
                    k += 1; continue
                team = cand
                break
            if team:
                # clean possible tail like "Rank" or odd labels
                team = re.sub(r"\s*\(.*?\)\s*$", "", team)
                rows.append((rk, team))
                # advance a bit to avoid duplicates
                i = k
                continue
    i += 1

# Fallback: also consider headings like "#1 Team" or "1. Team"
if len(rows) < 20:
    for ln in lines:
        m = re.match(r"^#?\s*(\d{1,2})[.)]?\s+([A-Za-z .'-]+)$", ln)
        if m:
            rows.append((int(m.group(1)), m.group(2)))

if not rows:
    raise RuntimeError("NFL.com parser found 0 rows. If needed, save the page to ../data/nfl_power.html and rerun.")

# 3) Clean & normalize
df = pd.DataFrame(rows, columns=["nfl_rank","team_raw"]).dropna()
df["nfl_rank"] = pd.to_numeric(df["nfl_rank"], errors="coerce")
df = df.dropna(subset=["nfl_rank"])
df["nfl_rank"] = df["nfl_rank"].astype(int)

# Normalize team names
# Strip trailing records that slipped in, and map to keys
df["team_raw"] = df["team_raw"].str.replace(r"\b\d+-\d+(?:-\d+)?\b", "", regex=True).str.strip()
df["team_key"] = df["team_raw"].map(to_key)

# Keep one row per team -> best (lowest) rank if duplicates
df = df.sort_values("nfl_rank").drop_duplicates("team_key", keep="first")

# Filter to known teams and keep final columns
df = df[df["team_key"].isin(TEAM_KEY.values())]
df = df[["team_key","nfl_rank","team_raw"]].sort_values("nfl_rank")

# 4) Save
df.to_csv(OUT_PATH, index=False)
print(f"Saved NFL.com ranks -> {OUT_PATH}  ({len(df)} teams)")
display(df.head(12))

Saved NFL.com ranks -> ../data/nfl_rank_2025.csv  (32 teams)


Unnamed: 0,team_key,nfl_rank,team_raw
0,Philadelphia,1,Philadelphia Eagles
1,Buffalo,2,Buffalo Bills
2,Detroit,3,Detroit Lions
3,LA Chargers,4,Los Angeles Chargers
4,Tampa Bay,5,Tampa Bay Buccaneers
5,Green Bay,6,Green Bay Packers
6,Baltimore,7,Baltimore Ravens
7,Washington,8,Washington Commanders
8,Indianapolis,9,Indianapolis Colts
9,LA Rams,10,Los Angeles Rams


Saved NFL.com ranks -> ../data/nfl_rank_2025.csv  (32 teams)


Unnamed: 0,team_key,nfl_rank,team_raw
0,Philadelphia,1,Philadelphia Eagles
1,Buffalo,2,Buffalo Bills
2,Detroit,3,Detroit Lions
3,LA Chargers,4,Los Angeles Chargers
4,Tampa Bay,5,Tampa Bay Buccaneers
5,Green Bay,6,Green Bay Packers
6,Baltimore,7,Baltimore Ravens
7,Washington,8,Washington Commanders
8,Indianapolis,9,Indianapolis Colts
9,LA Rams,10,Los Angeles Rams


### Merge