In [3]:
#WebScraping
import argparse
import sys
import time
import re
import io
from typing import Optional
from typing import Optional, Dict, List, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Browser-like HTTP request headers. fetch_html() passes the whole mapping to
# requests.get(), and try_requests_then_playwright() reuses the "User-Agent"
# value for its Playwright browser context.
HEADERS = {
    # Desktop Chrome 118 on Windows 10 user-agent string.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    # Prefer HTML/XHTML; fetch_html() only accepts "text/html" responses.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
}

def build_url(year: int) -> str:
    """Return the Spotrac MLB injured-list URL (player view) for ``year``."""
    template = "https://www.spotrac.com/mlb/injured/_/year/{year}/view/player"
    return template.format(year=year)

def fetch_html(url: str, max_retries: int = 3, backoff: float = 2.0) -> Optional[str]:
    """Download ``url`` and return its HTML text, or ``None`` on failure.

    Args:
        url: Page to request (sent with the module-level ``HEADERS``).
        max_retries: Maximum number of GET attempts. ``0`` means no request
            is made and ``None`` is returned.
        backoff: Base delay in seconds; the wait before the next attempt is
            ``backoff * attempt`` (linear back-off).

    Returns:
        The response body when the server answers 200 with a ``text/html``
        content type, otherwise ``None``.

    Retry policy:
        * Network errors (``requests.RequestException``) and the status
          codes 403, 429 and 503 are retried.
        * Any other response (including a 200 that is not HTML) gives up
          immediately.
        * No delay is added after the final attempt, so callers fall through
          to their fallback without a pointless wait.
    """
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
        except requests.RequestException:
            # Transient network problem: fall through to the back-off below.
            pass
        else:
            content_type = resp.headers.get("Content-Type", "")
            if resp.status_code == 200 and "text/html" in content_type:
                return resp.text
            if resp.status_code not in (403, 429, 503):
                # Not a throttling/blocking status: retrying will not help.
                return None

        # Only wait when another attempt will actually follow.
        if attempt < max_retries:
            time.sleep(backoff * attempt)
    return None

def parse_table_with_pandas(html: str) -> Optional[pd.DataFrame]:
    """Extract the widest HTML table from ``html`` using ``pandas.read_html``.

    Returns ``None`` when pandas finds no table (it signals that with
    ``ValueError``). When several tables tie for the most columns, the first
    one in document order is returned.
    """
    # Wrap in StringIO to avoid the literal-HTML FutureWarning.
    buffer = io.StringIO(html)
    try:
        candidates = pd.read_html(buffer)
    except ValueError:
        return None

    if not candidates:
        return None

    widest = candidates[0]
    for frame in candidates[1:]:
        # Strict comparison keeps the earliest table on ties.
        if frame.shape[1] > widest.shape[1]:
            widest = frame
    return widest

def parse_table_with_bs4(html: str) -> Optional[pd.DataFrame]:
    """Build a DataFrame from the first ``<table>`` in ``html`` via BeautifulSoup.

    Column names come from the ``<th>`` cells of ``<thead>`` when present,
    otherwise from every cell of the table's first ``<tr>``. Data rows are all
    ``<tr>`` elements that contain at least one ``<td>``. The header list is
    padded with ``col_N`` names or truncated so that it matches the widest
    data row.

    Returns ``None`` when there is no table or the table has no data rows.
    """
    table = BeautifulSoup(html, "lxml").find("table")
    if not table:
        return None

    # --- column names -----------------------------------------------------
    head = table.find("thead")
    if head:
        header_cells = head.find_all("th")
    else:
        lead_row = table.find("tr")
        header_cells = lead_row.find_all(["th", "td"]) if lead_row else []
    headers = [cell.get_text(strip=True) for cell in header_cells]

    # --- data rows: every <tr> that has at least one <td> ------------------
    cell_groups = (tr.find_all("td") for tr in table.find_all("tr"))
    rows = [
        [td.get_text(" ", strip=True) for td in cells]
        for cells in cell_groups
        if cells
    ]
    if not rows:
        return None

    # --- reconcile header count with the widest row -------------------------
    width = max(map(len, rows))
    known = len(headers)
    if known < width:
        headers.extend(f"col_{i+1}" for i in range(known, width))
    elif known > width:
        headers = headers[:width]

    return pd.DataFrame(rows, columns=headers)

def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a tidied copy of a freshly scraped table.

    Steps, in order:
      1. Column labels are converted to single-line strings: tuple labels
         (multi-row headers) are joined with spaces, non-string labels such
         as the integer positions pandas assigns to header-less tables are
         stringified, and every run of whitespace is collapsed to one space.
      2. Columns whose values are all missing are dropped.
      3. Empty strings are treated as missing and rows that are then entirely
         missing are dropped.

    The input frame is not modified.
    """
    def _label(col) -> str:
        parts = col if isinstance(col, tuple) else (col,)
        # split()/join strips the ends and collapses newlines and space runs.
        return " ".join(" ".join(str(part) for part in parts).split())

    df = df.copy()
    df.columns = [_label(c) for c in df.columns]
    df = df.dropna(axis=1, how="all")
    df = df.replace("", pd.NA).dropna(how="all")
    return df

def try_requests_then_playwright(url: str) -> pd.DataFrame:
    """Return the cleaned data table found at ``url``.

    The page is first fetched with ``fetch_html``; if that yields HTML with a
    parseable, non-empty table, the cleaned table is returned. Otherwise the
    page is rendered in headless Chromium through Playwright (an optional
    dependency) and parsed again.

    Raises:
        RuntimeError: if the plain-HTTP path fails and Playwright is not
            installed, or if no table can be parsed from the rendered page.
    """
    def first_usable_table(markup):
        # Try the pandas parser first, then BeautifulSoup; None if both fail.
        for parse in (parse_table_with_pandas, parse_table_with_bs4):
            frame = parse(markup)
            if isinstance(frame, pd.DataFrame) and not frame.empty:
                return clean_df(frame)
        return None

    static_html = fetch_html(url)
    if static_html:
        result = first_usable_table(static_html)
        if result is not None:
            return result

    # Fallback to Playwright (optional dependency), imported lazily so the
    # plain-HTTP path works without it.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError(
            "Requests parsing failed and Playwright is not installed.\n"
            "Install with:\n  pip install playwright\n  playwright install chromium"
        ) from e

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_context(user_agent=HEADERS["User-Agent"]).new_page()
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        # Block until a <table> is present in the rendered DOM.
        page.wait_for_selector("table", timeout=20000)
        content = page.content()
        browser.close()

    result = first_usable_table(content)
    if result is None:
        raise RuntimeError("Could not locate a data table on the page after rendering.")
    return result

# ---------------- Normalizer ----------------

def _pick(df, *cands):
    """Return the first column label of ``df`` that contains a candidate.

    Matching is a case-insensitive substring test against ``str(label)``.
    Candidates are tried in the order given, so an earlier candidate wins
    over a later one regardless of column position. Returns ``None`` when
    nothing matches.
    """
    lowered = [str(label).lower() for label in df.columns]
    for want in cands:
        for label, low in zip(df.columns, lowered):
            if want in low:
                return label
    return None


def _parse_reason(reason: pd.Series) -> pd.DataFrame:
    """Split reason strings into ``il_type``, ``injury`` and start/end dates.

    Expected shape: ``"<list type> - <injury>: <m/d/yy>-<m/d/yy>"`` where
    everything after the list type is optional. A strict pattern that knows
    the usual list names ("... IL", "... List", "Suspension", "Restricted")
    is tried first; rows it cannot match fall back to a loose pattern.
    """
    tail = (
        r"(?:\s*-\s*(?P<inj>[^:]+?))?"
        r"(?:\s*:\s*(?P<start>\d{1,2}/\d{1,2}/\d{2}))?"
        r"(?:-(?P<end>\d{1,2}/\d{1,2}/\d{2}))?$"
    )
    strict_pat = r"^(?P<il>[^:]+?IL|[^:]+?List|Suspension|Restricted(?: List)?)" + tail
    loose_pat = r"^(?P<il>[^:]+?)" + tail

    strict = reason.str.extract(strict_pat, flags=re.IGNORECASE)
    loose = reason.str.extract(loose_pat, flags=re.IGNORECASE)

    # "il" is mandatory in the strict pattern, so it is present exactly when
    # the strict pattern matched. The fallback is chosen per ROW: taking it
    # per column would mix fields from two different parses (the loose
    # pattern splits "10-Day IL" into il="10", injury="Day IL").
    matched = strict["il"].notna()

    parsed = pd.DataFrame(index=reason.index)
    # No astype(str) here: it would turn missing values into the text "nan".
    parsed["il_type"] = strict["il"].where(matched, loose["il"]).str.strip()
    parsed["injury"] = strict["inj"].where(matched, loose["inj"]).str.strip()
    for part in ("start", "end"):
        raw_dates = strict[part].where(matched, loose[part])
        parsed[part + "_date"] = pd.to_datetime(
            raw_dates, format="%m/%d/%y", errors="coerce"
        )
    return parsed


def normalize_spotrac_injured_df(raw: pd.DataFrame) -> pd.DataFrame:
    """Convert a scraped Spotrac injured-list table into typed columns.

    Source columns are located by case-insensitive substring match on their
    labels ("rank", "player", "pos", "team", "reason", "days", and "cash" or
    "total"). Only the player and reason columns are required.

    Args:
        raw: Table as scraped; it is not modified.

    Returns:
        A new DataFrame containing, in this order and only when available:
        ``rank`` (Int64), ``player``, ``pos``, ``team``, ``il_type``,
        ``injury``, ``start_date``, ``end_date`` (datetime64), ``days_missed``
        (Int64), ``cash_total`` (numeric), ``cash_per_day`` (rounded to two
        decimals; missing when ``days_missed`` is zero or missing) and
        ``reason_raw`` (whitespace-normalised original reason text).
        Unparseable pieces of the reason are left missing.

    Raises:
        ValueError: if no player column or no reason column can be found.
    """
    df = raw

    col_rank = _pick(df, "rank")
    col_player = _pick(df, "player")
    col_pos = _pick(df, "pos")
    col_team = _pick(df, "team")
    col_reason = _pick(df, "reason")
    col_days = _pick(df, "days")
    col_cash = _pick(df, "cash", "total")

    if col_player is None or col_reason is None:
        raise ValueError(f"Missing required columns. Found: {list(df.columns)}")

    out = pd.DataFrame(index=df.index)
    if col_rank is not None:
        out["rank"] = pd.to_numeric(df[col_rank], errors="coerce").astype("Int64")
    out["player"] = df[col_player].astype(str).str.strip()
    if col_pos is not None:
        out["pos"] = df[col_pos].astype(str).str.strip()
    if col_team is not None:
        out["team"] = df[col_team].astype(str).str.strip()

    reason = df[col_reason].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
    out["reason_raw"] = reason

    parsed = _parse_reason(reason)
    for name in parsed.columns:
        out[name] = parsed[name]

    if col_days is not None:
        # Keep digits only so values such as "186 days" still parse.
        out["days_missed"] = pd.to_numeric(
            df[col_days].astype(str).str.replace(r"[^\d]", "", regex=True),
            errors="coerce",
        ).astype("Int64")

    if col_cash is not None:
        out["cash_total"] = (
            df[col_cash].astype(str)
            .str.replace(r"[$,]", "", regex=True)
            .str.extract(r"([\d.]+)")[0]
            .pipe(pd.to_numeric, errors="coerce")
        )

    if "cash_total" in out and "days_missed" in out:
        days = out["days_missed"].astype(float)
        # Zero days would yield inf (or NaN for 0/0); report it as missing.
        days = days.where(days > 0)
        out["cash_per_day"] = (out["cash_total"] / days).round(2)

    cols = ["rank", "player", "pos", "team",
            "il_type", "injury", "start_date", "end_date",
            "days_missed", "cash_total", "cash_per_day", "reason_raw"]
    return out[[c for c in cols if c in out.columns]]

# ---------------- Multi-year API (no autosave) ----------------

def scrape_spotrac_years(years: List[int], sleep_sec: float = 1.0) -> Tuple[Dict[int, pd.DataFrame], pd.DataFrame]:
    """
    Scrape and normalize the Spotrac MLB Injured List for each year given.

    Years are processed in order; after each one the function pauses for
    ``sleep_sec`` seconds (skipped when ``sleep_sec`` is falsy).

    Returns a pair:
      - per-year mapping ``{year: normalized DataFrame}`` (each frame carries
        a ``year`` column; a repeated year keeps the last scrape)
      - all scraped frames concatenated with a fresh index, or an empty
        DataFrame when ``years`` is empty
    Nothing is written to disk.
    """
    per_year: Dict[int, pd.DataFrame] = {}
    collected = []
    for season in years:
        scraped = try_requests_then_playwright(build_url(season))
        tagged = normalize_spotrac_injured_df(scraped).assign(year=season)
        per_year[season] = tagged
        collected.append(tagged)
        if sleep_sec:
            time.sleep(sleep_sec)  # be polite to the site

    if not collected:
        return per_year, pd.DataFrame()
    return per_year, pd.concat(collected, ignore_index=True)


In [5]:
# Seasons to scrape; each entry triggers one page fetch in scrape_spotrac_years().
years = [2025]
# tables: {year: normalized DataFrame}; combined: all years in one DataFrame.
tables, combined = scrape_spotrac_years(years)

# Inspect
# NOTE(review): the dict below is a bare expression whose value is not stored;
# a notebook only echoes the last expression of a cell, so these shapes are
# presumably never shown. Wrap it in print() to see them.
{y: df.shape for y, df in tables.items()}
combined.head()


Unnamed: 0,rank,player,pos,team,il_type,injury,start_date,end_date,days_missed,cash_total,cash_per_day,reason_raw,year
0,1,Anthony Rendon,3B,LAA,60-Day IL,Hip,2025-03-27,2025-09-28,186,37999986,204301.0,60-Day IL - Hip: 3/27/25-9/28/25,2025
1,2,Gerrit Cole,SP,NYY,60-Day IL,Elbow Tommy John,2025-03-27,2025-09-28,186,35999928,193548.0,60-Day IL - Elbow Tommy John: 3/27/25-9/28/25,2025
2,3,Kris Bryant,1B,COL,60-Day IL,Back,2025-04-13,2025-09-28,169,23623665,139785.0,60-Day IL - Back: 4/13/25-9/28/25,2025
3,4,Jordan Montgomery,SP,ARI,60-Day IL,Elbow Tommy John,2025-03-27,2025-09-28,186,22500048,120968.0,60-Day IL - Elbow Tommy John: 3/27/25-9/28/25,2025
4,5,Joe Musgrove,SP,SD,60-Day IL,Elbow Tommy John,2025-03-27,2025-09-28,186,20000022,107527.0,60-Day IL - Elbow Tommy John: 3/27/25-9/28/25,2025


In [None]:
# Persist the combined multi-year table as CSV in the working directory
# (index=False leaves the positional row index out of the file).
combined.to_csv("mlb_injured_multi_year.csv", index=False)
# or per year:
# NOTE(review): DataFrame.to_parquet needs a parquet engine (pyarrow or
# fastparquet) to be installed; confirm one is available.
for y, df in tables.items():
    df.to_parquet(f"mlb_injured_{y}.parquet", index=False)