In [1]:
from pathlib import Path
import os
from dotenv import load_dotenv

# import the four helpers
import sys
sys.path.append("src")
from ingest import load_api_data, scrape_table, validate_data, save_csv

# Destination folder for the raw files this notebook saves.
RAW = Path("data") / "raw"

# Load variables from the local .env file, then report only whether the key is
# set to a non-empty value -- the key itself is never printed.
load_dotenv()
print("ALPHAVANTAGE_API_KEY present:", bool(os.environ.get("ALPHAVANTAGE_API_KEY")))


ALPHAVANTAGE_API_KEY present: True


In [2]:
# Fetch data for one ticker through the project's `load_api_data` helper,
# validate it, and save the raw pull under RAW.
# The symbol is defined once so the request and the saved file name cannot
# drift apart.
TICKER = "GOOG"

df_api = load_api_data(TICKER)
# Minimum schema checked here; add "Adj Close" to the list if later steps rely
# on it (TODO confirm the API always returns that column).
validate_data(df_api, required_cols=["Date", "Close"])
# Kept as the last expression so the value returned by save_csv is shown as
# the cell's output.
save_csv(df_api, prefix=f"api_yfinance_{TICKER}", out_dir=RAW)

Shape: (5, 7)
NA counts:
 Date         0
Adj Close    0
Close        0
High         0
Low          0
Open         0
Volume       0
dtype: int64
Saved data/raw/api_yfinance_GOOG_20250827-0320.csv


'data/raw/api_yfinance_GOOG_20250827-0320.csv'

In [3]:
# Scrape source. `url` stays a notebook-level name because the summary cell
# prints it.
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# CSS selector for a simple table on a page where scraping is permitted.
table_selector = "table.wikitable"

df_scrape = scrape_table(url, css=table_selector)
# Require both a numeric and a text column in the scraped frame.
validate_data(df_scrape, need_text=True, need_numeric=True)
# Kept as the last expression so the value returned by save_csv is shown as
# the cell's output.
save_csv(df_scrape, out_dir=RAW, prefix="scrape_wikipedia_sp500")

Shape: (504, 8)
NA counts:
 Symbol                   0
Security                 0
GICS Sector              0
GICS Sub-Industry        0
Headquarters Location    0
Date added               0
CIK                      0
Founded                  0
dtype: int64
Saved data/raw/scrape_wikipedia_sp500_20250827-0320.csv


  df = pd.read_html(str(node))[0]


'data/raw/scrape_wikipedia_sp500_20250827-0320.csv'

In [4]:
# Run summary: data sources, validation performed, and known risks.
# The ticker named below must match the symbol requested in the API cell
# (GOOG); it previously said AAPL, which misreported the data's provenance.
print("Sources and params:")
print("- API: yfinance download for GOOG, period=5d, interval=1d")
# `url` is the page address defined in the scrape cell.
print("- Scrape:", url, "selector: table.wikitable")
print("\nValidation:")
print("- API: required columns present, types parsed to datetime and float, non-empty")
print("- Scrape: at least one numeric and one text column, non-empty")
print("\nReproducibility and secrets:")
print("- Saved raw files with timestamp in data/raw")
print("- Used .env locally and did not commit it")
print("\nAssumptions and risks:")
print("- API fields and HTML structure can change")
print("- Network failures can happen, rerun cell to retry")
print("- Respect site terms for scraping")

Sources and params:
- API: yfinance download for AAPL, period=5d, interval=1d
- Scrape: https://en.wikipedia.org/wiki/List_of_S%26P_500_companies selector: table.wikitable

Validation:
- API: required columns present, types parsed to datetime and float, non-empty
- Scrape: at least one numeric and one text column, non-empty

Reproducibility and secrets:
- Saved raw files with timestamp in data/raw
- Used .env locally and did not commit it

Assumptions and risks:
- API fields and HTML structure can change
- Network failures can happen, rerun cell to retry
- Respect site terms for scraping
