In [1]:
#WebScraping
import argparse
import sys
import time
import re
import io
from typing import Optional
from typing import Optional, Dict, List, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Connection": "keep-alive",
}

def build_url(year: int) -> str:
    return f"https://www.spotrac.com/mlb/injured/_/year/{year}/view/player"

def fetch_html(url: str, max_retries: int = 3, backoff: float = 2.0) -> Optional[str]:
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=20)
            if resp.status_code == 200 and "text/html" in resp.headers.get("Content-Type", ""):
                return resp.text
            if resp.status_code in (403, 429, 503):
                time.sleep(backoff * attempt)
                continue
            break
        except requests.RequestException:
            time.sleep(backoff * attempt)
    return None

def parse_table_with_pandas(html: str) -> Optional[pd.DataFrame]:
    try:
        # Use StringIO to avoid the literal-HTML FutureWarning
        tables = pd.read_html(io.StringIO(html))
        if not tables:
            return None
        return max(tables, key=lambda t: t.shape[1])  # pick widest table
    except ValueError:
        return None

def parse_table_with_bs4(html: str) -> Optional[pd.DataFrame]:
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table")
    if not table:
        return None

    thead = table.find("thead")
    if thead:
        headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    else:
        first_row = table.find("tr")
        headers = [th.get_text(strip=True) for th in first_row.find_all(["th", "td"])] if first_row else []

    rows = []
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")
        if not tds:
            continue
        rows.append([td.get_text(" ", strip=True) for td in tds])

    if not rows:
        return None

    max_len = max(len(r) for r in rows)
    if len(headers) != max_len:
        if len(headers) < max_len:
            headers += [f"col_{i+1}" for i in range(len(headers), max_len)]
        else:
            headers = headers[:max_len]

    return pd.DataFrame(rows, columns=headers)

def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    
    """
    df = df.copy()
    df.columns = [c.strip().replace("\n", " ").replace("  ", " ") for c in df.columns]
    df = df.dropna(axis=1, how="all")
    df = df.replace("", pd.NA).dropna(how="all")
    return df

def try_requests_then_playwright(url: str) -> pd.DataFrame:
    """
    
    """
    html = fetch_html(url)
    if html:
        for parser in (parse_table_with_pandas, parse_table_with_bs4):
            df = parser(html)
            if isinstance(df, pd.DataFrame) and not df.empty:
                return clean_df(df)

    # Fallback to Playwright (optional dependency)
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError(
            "Requests parsing failed and Playwright is not installed.\n"
            "Install with:\n  pip install playwright\n  playwright install chromium"
        ) from e

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        context = browser.new_context(user_agent=HEADERS["User-Agent"])
        page = context.new_page()
        page.goto(url, wait_until="domcontentloaded", timeout=45000)
        page.wait_for_selector("table", timeout=20000)
        content = page.content()
        browser.close()

    for parser in (parse_table_with_pandas, parse_table_with_bs4):
        df = parser(content)
        if isinstance(df, pd.DataFrame) and not df.empty:
            return clean_df(df)

    raise RuntimeError("Could not locate a data table on the page after rendering.")

# ---------------- Normalizer ----------------

def _pick(df, *cands):
    """
    
    """
    cand_lc = [c.lower() for c in df.columns]
    for want in cands:
        for i, c in enumerate(cand_lc):
            if want in c:
                return df.columns[i]
    return None

def normalize_spotrac_injured_df(raw: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize Spotrac 'Injured List' table into a tidy, episode-level dataset.
    - Each injury episode becomes its own row (including multiple date ranges per line).
    """
    df = raw.copy()

    col_rank   = _pick(df, "rank")
    col_player = _pick(df, "player")
    col_pos    = _pick(df, "pos")
    col_team   = _pick(df, "team")
    col_reason = _pick(df, "reason")
    col_days   = _pick(df, "days")        # table's days (often season-total)
    col_cash   = _pick(df, "cash", "total")

    if any(c is None for c in [col_player, col_reason]):
        raise ValueError(f"Missing required columns. Found: {list(df.columns)}")

    # Canonicalize text columns
    def _clean(s: pd.Series) -> pd.Series:
        return s.astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

    if col_rank:
        df[col_rank] = pd.to_numeric(df[col_rank], errors="coerce").astype("Int64")
    df[col_player] = _clean(df[col_player])
    if col_pos:    df[col_pos] = _clean(df[col_pos])
    if col_team:   df[col_team] = _clean(df[col_team])

    # Keep original reason with possible newlines preserved
    reason_raw = df[col_reason].astype(str)

    # Regex to capture line-level "IL type - injury : start-end"
    # We allow multiple such patterns per cell.
    rx_line = re.compile(
        r"""
        (?P<il>[^:\n,]+?(?:IL|List|Suspension|Restricted(?:\s+List)?))   # IL type
        (?:\s*-\s*(?P<inj>[^:\n,]+?))?                                   # optional injury text
        \s*:\s*
        (?P<dates>.+?)                                                   # one or more date ranges until line/entry end
        (?=$|\n)                                                         # stop at end or newline
        """,
        re.IGNORECASE | re.VERBOSE
    )

    # Within a line, find all (start-end) date pairs (comma-separated allowed)
    rx_range = re.compile(
        r'(\d{1,2}/\d{1,2}/\d{2})\s*-\s*(\d{1,2}/\d{1,2}/\d{2})'
    )

    records = []

    for _, row in df.iterrows():
        base = {
            "rank":   row[col_rank] if col_rank else pd.NA,
            "player": row[col_player],
            "pos":    row[col_pos] if col_pos else pd.NA,
            "team":   row[col_team] if col_team else pd.NA,
        }
        text = str(row[col_reason]).strip()

        # Split by explicit newlines first (from preserved <br>s); also fall back to scanning entire string.
        candidates = [t.strip() for t in re.split(r'\n+', text) if t.strip()]
        if not candidates:
            candidates = [text]

        matched_any = False
        for cand in candidates:
            # Try to find one or more IL entries inside this candidate
            for m in rx_line.finditer(cand):
                matched_any = True
                il_type = (m.group("il") or "").strip()
                injury  = (m.group("inj") or "").strip()
                datestr = (m.group("dates") or "").strip()

                # Multiple date ranges in one line → explode
                ranges = rx_range.findall(datestr) or [(None, None)]
                for start_s, end_s in ranges:
                    rec = dict(base)
                    rec["il_type"]    = il_type
                    rec["injury"]     = injury
                    rec["reason_raw"] = cand

                    # Dates
                    rec["start_date"] = pd.to_datetime(start_s, format="%m/%d/%y", errors="coerce") if start_s else pd.NaT
                    rec["end_date"]   = pd.to_datetime(end_s,   format="%m/%d/%y", errors="coerce") if end_s else pd.NaT

                    # Cash & days: keep totals off per-episode to avoid implying allocation
                    # If you prefer to carry them through, uncomment below lines.
                    # if col_days:
                    #     rec["days_missed"] = pd.to_numeric(
                    #         str(row[col_days]).replace(",", ""),
                    #         errors="coerce"
                    #     ).astype("Int64")
                    # if col_cash:
                    #     cash = (
                    #         str(row[col_cash])
                    #         .replace("$", "")
                    #         .replace(",", "")
                    #     )
                    #     rec["cash_total"] = pd.to_numeric(cash, errors="coerce")

                    records.append(rec)

        # If we failed to match with rx_line at all, do a more permissive fallback:
        if not matched_any:
            # Try your original (rx1/rx2) single-extract logic as a last resort
            rx1 = re.compile(
                r"^(?P<il>[^:]+?IL|[^:]+?List|Suspension|Restricted(?: List)?)"
                r"(?:\s*-\s*(?P<inj>[^:]+?))?"
                r"(?:\s*:\s*(?P<start>\d{1,2}/\d{1,2}/\d{2}))?"
                r"(?:-(?P<end>\d{1,2}/\d{1,2}/\d{2}))?$",
                flags=re.IGNORECASE
            )
            rx2 = re.compile(
                r"^(?P<il>[^:]+?)"
                r"(?:\s*-\s*(?P<inj>[^:]+?))?"
                r"(?:\s*:\s*(?P<start>\d{1,2}/\d{1,2}/\d{2}))?"
                r"(?:-(?P<end>\d{1,2}/\d{1,2}/\d{2}))?$",
                flags=re.IGNORECASE
            )
            ext = rx1.search(text) or rx2.search(text)
            rec = dict(base)
            rec["il_type"]    = (ext.group("il") if ext else "").strip()
            rec["injury"]     = (ext.group("inj") if ext else "").strip()
            rec["reason_raw"] = text
            rec["start_date"] = pd.to_datetime(ext.group("start"), format="%m/%d/%y", errors="coerce") if ext and ext.group("start") else pd.NaT
            rec["end_date"]   = pd.to_datetime(ext.group("end"),   format="%m/%d/%y", errors="coerce") if ext and ext.group("end") else pd.NaT
            records.append(rec)

    out = pd.DataFrame.from_records(records)

    # Optional numeric fields (commented in the loop) — if you want them, you can compute per-episode days:
    # out["days_missed"] = (out["end_date"] - out["start_date"]).dt.days.add(1)  # inclusive, if desired

    # Order columns
    want = ["rank", "player", "pos", "team",
            "il_type", "injury", "start_date", "end_date", "reason_raw"]
    return out[[c for c in want if c in out.columns]]


# ---------------- Multi-year API (no autosave) ----------------

def scrape_spotrac_years(years: List[int], sleep_sec: float = 1.0) -> Tuple[Dict[int, pd.DataFrame], pd.DataFrame]:
    """
    Scrape and normalize Spotrac MLB Injured List for multiple years.
    Returns:
      - tables: {year: cleaned DataFrame}
      - combined: single DataFrame with a 'year' column
    Does NOT write to disk.
    """
    tables: Dict[int, pd.DataFrame] = {}
    frames = []
    for yr in years:
        url = build_url(yr)
        raw = try_requests_then_playwright(url)
        clean = normalize_spotrac_injured_df(raw)
        clean = clean.assign(year=yr)
        tables[yr] = clean
        frames.append(clean)
        if sleep_sec:
            time.sleep(sleep_sec)  # be polite to the site
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return tables, combined


In [9]:
# years = [2015, 2020, 2021, 2022, 2023, 2024, 2025]

tables, combined = scrape_spotrac_years([i for i in range(2018, 2026)])

# Inspect
{y: df.shape for y, df in tables.items()}
combined

Unnamed: 0,rank,player,pos,team,il_type,injury,start_date,end_date,reason_raw,year
0,1,Miguel Cabrera,DH,DET,10-Day IL,Hamstring,2018-05-04,2018-06-01,10-Day IL - Hamstring: 5/4/18-6/1/18 10-Day I...,2018
1,1,Miguel Cabrera,DH,DET,10-Day IL,Hamstring,2018-06-13,2018-10-01,10-Day IL - Hamstring: 5/4/18-6/1/18 10-Day I...,2018
2,2,Jacoby Ellsbury,CF,NYY,60-Day IL,Oblique,2018-03-29,2018-10-01,60-Day IL - Oblique: 3/29/18-10/1/18,2018
3,3,Yoenis Céspedes,LF,NYM,10-Day IL,Hip,2018-05-16,2018-07-20,10-Day IL - Hip: 5/16/18-7/20/18 60-Day IL - ...,2018
4,3,Yoenis Céspedes,LF,NYM,10-Day IL,Hip,2018-07-24,2018-10-01,10-Day IL - Hip: 5/16/18-7/20/18 60-Day IL - ...,2018
...,...,...,...,...,...,...,...,...,...,...
6605,628,Cade Horton,SP,CHC,15-Day IL,Ribs,2025-09-25,2025-09-28,15-Day IL - Ribs: 9/25/25-9/28/25,2025
6606,629,Everson Pereira,OF,TB,10-Day IL,Back,2025-09-26,2025-09-28,10-Day IL - Back: 9/26/25-9/28/25,2025
6607,630,Brett Baty,2B,NYM,10-Day IL,Oblique,2025-09-27,2025-09-28,10-Day IL - Oblique: 9/27/25-9/28/25,2025
6608,631,Nick Frasso,SP,LAD,60-Day IL,Undisclosed,2025-09-27,2025-09-28,60-Day IL - Undisclosed: 9/27/25-9/28/25,2025


In [10]:
combined.to_csv("mlb_injuries.csv", index=False)
# # or per year:
# for y, df in tables.items():
#     df.to_parquet(f"mlb_injured_{y}.parquet", index=False)