In [1]:
# src/validate.py
import pandas as pd
import numpy as np

def validate_df(df: pd.DataFrame, required_cols: list = None, dtype_map: dict = None, na_thresh: float = 0.2) -> dict:
    """Run lightweight sanity checks on ``df`` and report any problems found.

    ``df`` is never modified: the dtype checks are trial conversions whose
    results are discarded.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to validate.
    required_cols : list, optional
        Column names that must be present in ``df``.
    dtype_map : dict, optional
        Maps column name -> expected type label (a string). Labels starting
        with ``"datetime"`` are checked with ``pd.to_datetime``; ``'float'``
        and ``'int'`` are checked with ``Series.astype``. Any other label is
        accepted without a check. Columns absent from ``df`` are skipped
        (use ``required_cols`` to report them).
    na_thresh : float, optional
        Columns whose fraction of NA values is strictly greater than this
        are reported. Pass ``None`` to disable the check.

    Returns
    -------
    dict
        Empty when every check passes. Otherwise may contain:

        * ``'missing_cols'`` - list of required columns not found
        * ``'dtype_errors'`` - ``{column: error message}`` for failed checks
        * ``'high_na_fraction'`` - ``{column: NA fraction}`` above threshold
        * ``'duplicates_in_date'`` - count of repeated values in ``'date'``
        * ``'date_not_monotonic'`` - ``True`` if ``'date'`` is not increasing
    """
    msgs = {}

    if required_cols:
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            msgs['missing_cols'] = missing

    if dtype_map:
        for col, expected in dtype_map.items():
            if col not in df.columns:
                continue
            try:
                series = df[col]
                if expected.startswith("datetime"):
                    pd.to_datetime(series, errors='raise')
                elif expected == 'float':
                    series.astype(float)
                elif expected == 'int':
                    as_int = series.astype(int)
                    # astype(int) silently truncates fractional floats
                    # (1.5 -> 1), so a successful cast alone does not prove
                    # the column holds integers; compare against the source.
                    if pd.api.types.is_float_dtype(series) and not (as_int == series).all():
                        raise ValueError(
                            "column contains non-integer values; casting to int would truncate them"
                        )
                # any other label is not strictly checked
            except Exception as e:
                # Record the failure and keep validating the remaining columns.
                msgs.setdefault('dtype_errors', {})[col] = str(e)

    if na_thresh is not None:
        na_frac = df.isna().mean()
        bad = na_frac[na_frac > na_thresh].to_dict()
        if bad:
            msgs['high_na_fraction'] = {k: float(v) for k, v in bad.items()}

    # Sanity checks that only apply when a 'date' column exists.
    if 'date' in df.columns:
        n_dupes = int(df['date'].duplicated().sum())
        if n_dupes:
            msgs['duplicates_in_date'] = n_dupes
        try:
            if not df['date'].is_monotonic_increasing:
                msgs['date_not_monotonic'] = True
        except Exception:
            # Deliberately best-effort: if the ordering test itself fails,
            # skip it rather than abort the whole validation.
            pass

    return msgs


In [6]:
import pandas as pd

# Pull every HTML table from the Wikipedia S&P 500 page (needs network access).
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(url)
print(f"Found {len(tables)} tables")

df_scrape = tables[0]   # first table
print(df_scrape.head())
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df_scrape.info()


Found 2 tables
    Symbol             Security             GICS Sector  \
0      MMM                   3M             Industrials   
1      AOS          A. O. Smith             Industrials   
2      ABT  Abbott Laboratories             Health Care   
3     ABBV               AbbVie             Health Care   
4      ACN            Accenture  Information Technology   
..     ...                  ...                     ...   
498    XYL           Xylem Inc.             Industrials   
499    YUM          Yum! Brands  Consumer Discretionary   
500   ZBRA   Zebra Technologies  Information Technology   
501    ZBH        Zimmer Biomet             Health Care   
502    ZTS               Zoetis             Health Care   

                                GICS Sub-Industry    Headquarters Location  \
0                        Industrial Conglomerates    Saint Paul, Minnesota   
1                               Building Products     Milwaukee, Wisconsin   
2                           Health Care Eq

In [7]:
from datetime import datetime
import os
from pathlib import Path  # required by the mkdir call below; otherwise NameError on a fresh kernel

# Make sure the output directory exists (no-op if it is already there).
Path("data/raw").mkdir(parents=True, exist_ok=True)

# Minute-resolution stamp shared by both files; re-running within the same
# minute writes to the same paths and overwrites the earlier files.
ts = datetime.now().strftime("%Y%m%d-%H%M")
api_file = f"data/raw/api_alphavantage_AAPL_{ts}.csv"
scrape_file = f"data/raw/scrape_wikipedia_sp500_{ts}.csv"

# NOTE(review): df_api is expected to come from an earlier cell that is not
# visible here - confirm it is defined before this cell runs.
df_api.to_csv(api_file, index=False)
df_scrape.to_csv(scrape_file, index=False)

print("Saved:", api_file, scrape_file)


Missing: timestamp
Missing: close
NA counts:
 {    0
dtype: int64
Shape: (2, 1)


In [10]:
from pathlib import Path


In [11]:
from datetime import datetime
import os

# Build both output paths from a single minute-resolution timestamp so the
# API and scrape snapshots written by this cell share the same stamp.
ts = datetime.now().strftime("%Y%m%d-%H%M")
api_file = f"data/raw/api_alphavantage_AAPL_{ts}.csv"
scrape_file = f"data/raw/scrape_wikipedia_sp500_{ts}.csv"

# Create the target directory (and any missing parents) before writing.
_raw_dir = Path("data/raw")
_raw_dir.mkdir(parents=True, exist_ok=True)

# Write each frame to its CSV without the index column.
for _frame, _target in ((df_api, api_file), (df_scrape, scrape_file)):
    _frame.to_csv(_target, index=False)

print("Saved:", api_file, scrape_file)


Saved: data/raw/api_alphavantage_AAPL_20250821-0333.csv data/raw/scrape_wikipedia_sp500_20250821-0333.csv
