# In [1]:
# src/ingest_scrape.py
import json
from datetime import datetime, timezone
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Directory (relative to the working directory) that save_df_with_meta writes
# the CSV files and their .meta.json sidecars into.
OUT_DIR = Path("data/raw")

def timestamp() -> str:
    """Return the current UTC time as a compact ``YYYYMMDD-HHMM`` string.

    The value is used as a filename suffix, so it contains only digits and a
    single hyphen. Resolution is one minute: two calls within the same minute
    return the same string.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # formats to exactly the same text with this pattern.
    return datetime.now(timezone.utc).strftime("%Y%m%d-%H%M")

def save_df_with_meta(df: pd.DataFrame, prefix: str, meta: dict) -> str:
    """Write *df* as a CSV plus a JSON metadata sidecar under ``OUT_DIR``.

    Both files share the stem ``<prefix>_<timestamp>``; the data goes to
    ``<stem>.csv`` (without the index column) and *meta* goes to
    ``<stem>.meta.json``. ``OUT_DIR`` is created if it does not exist.

    Args:
        df: Table to persist.
        prefix: Leading part of the output file names.
        meta: JSON-serialisable description of the data.

    Returns:
        The CSV file name (not the full path).
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Build the stem once and append the suffixes. Deriving the sidecar name
    # with str.replace(".csv", ...) would rewrite every ".csv" occurrence and
    # corrupt the name whenever the prefix itself contains ".csv".
    stem = f"{prefix}_{timestamp()}"
    fname = f"{stem}.csv"
    metaname = f"{stem}.meta.json"

    df.to_csv(OUT_DIR / fname, index=False)
    # Explicit encoding so the sidecar does not depend on the platform locale.
    with open(OUT_DIR / metaname, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    return fname

def _parse_table_manually(html: str, table_index: int) -> pd.DataFrame:
    """Build a DataFrame from the ``table_index``-th ``<table>`` using BeautifulSoup.

    The first non-empty row becomes the header; every cell is reduced to its
    stripped text.

    Raises:
        IndexError: The document has no ``<table>`` at ``table_index``.
        ValueError: The selected table contains no cells at all.
    """
    candidates = BeautifulSoup(html, "lxml").find_all("table")
    try:
        table = candidates[table_index]
    except IndexError:
        raise IndexError(
            f"table_index {table_index} is out of range: "
            f"document contains {len(candidates)} <table> element(s)"
        ) from None

    rows = []
    for tr in table.find_all("tr"):
        cells = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
        if cells:
            rows.append(cells)
    if not rows:
        raise ValueError(f"table {table_index} contains no rows")

    header, *body = rows
    return pd.DataFrame(body, columns=header)


def scrape_wikipedia_table(url: str, table_index: int = 0):
    """Download *url*, extract one HTML table, and save it with metadata.

    ``pandas.read_html`` is tried first. If it finds no tables, or fewer than
    ``table_index + 1``, the page is parsed manually with BeautifulSoup
    instead. The result is written through ``save_df_with_meta`` with the
    prefix ``"scrape_wikipedia_table"``.

    Args:
        url: Page to fetch.
        table_index: Position of the wanted table in the page.

    Returns:
        A ``(df, fname)`` tuple: the extracted DataFrame and the CSV file name
        returned by ``save_df_with_meta``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        IndexError: Neither parser found a table at ``table_index``.
        ValueError: The manually parsed table is empty.
    """
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    html = resp.text

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated since pandas 2.1.
        tables = pd.read_html(StringIO(html))
    except ValueError:
        # read_html raises ValueError when it finds no tables; treat that as
        # "nothing found" so the manual fallback below actually gets a chance.
        tables = []

    if len(tables) > table_index:
        df = tables[table_index]
    else:
        # The BeautifulSoup parse only happens when it is really needed.
        df = _parse_table_manually(html, table_index)

    meta = {
        "source": "wikipedia",
        "url": url,
        "table_index": table_index,
        # Naive UTC ISO string, same format the deprecated utcnow() produced.
        "fetched_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
    }
    fname = save_df_with_meta(df, "scrape_wikipedia_table", meta)
    return df, fname
