In [28]:
import os, json, time, datetime as dt, csv, pathlib
from typing import Dict, List
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import yfinance as yf
from bs4 import BeautifulSoup
DATA_RAW = pathlib.Path("data/raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

load_dotenv()
ALPHA_KEY = os.getenv("ALPHAVANTAGE_API_KEY")
print("Loaded ALPHAVANTAGE_API_KEY?", bool(ALPHA_KEY))

Loaded ALPHAVANTAGE_API_KEY? False


In [29]:
def safe_stamp():
    return dt.datetime.now().strftime("%Y%m%d-%H%M%S")

def safe_filename(prefix: str, meta: Dict[str, str]) -> str:
    mid = "_".join([f"{k}-{str(v).replace(' ', '-')[:20]}" for k, v in meta.items()])
    return f"{prefix}_{mid}_{safe_stamp()}.csv"

def validate_df(df: pd.DataFrame, required_cols: List[str], dtypes_map: Dict[str, str]) -> Dict[str, str]:
    msgs = {}
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        msgs['missing_cols'] = f"Missing columns: {missing}"
    for col, dtype in dtypes_map.items():
        if col in df.columns:
            try:
                if dtype == 'datetime64[ns]':
                    pd.to_datetime(df[col])
                elif dtype == 'float':
                    pd.to_numeric(df[col])
            except Exception as e:
                msgs[f'dtype_{col}'] = f"Failed to coerce {col} to {dtype}: {e}"
    na_counts = df.isna().sum().sum()
    msgs['na_total'] = f"Total NA values: {na_counts}"
    return msgs

In [27]:
SYMBOL = "AAPL"
df_api = yf.download(SYMBOL, period="6mo", interval="1d").reset_index()[['Date','Open']]
df_api.columns = ['Date','Open']
df_api = df_api.sort_values('Date').reset_index(drop=True)


fname = safe_filename(prefix="api", meta={"source": "yfinance", "symbol": SYMBOL})
out_path = DATA_RAW / fname
df_api.to_csv(out_path, index=False)
print(df_api)
print("Saved:", out_path)

  df_api = yf.download(SYMBOL, period="6mo", interval="1d").reset_index()[['Date','Open']]
[*********************100%***********************]  1 of 1 completed

          Date        Open
0   2025-02-21  245.349441
1   2025-02-24  244.331928
2   2025-02-25  247.394445
3   2025-02-26  243.733409
4   2025-02-27  238.825414
..         ...         ...
120 2025-08-14  234.059998
121 2025-08-15  234.000000
122 2025-08-18  231.699997
123 2025-08-19  231.279999
124 2025-08-20  229.979996

[125 rows x 2 columns]
Saved: data\raw\api_source-yfinance_symbol-AAPL_20250821-024210.csv





In [31]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Parse the first table
table = soup.find("table", {"id": "constituents"})
rows = table.find_all("tr")

data = []
for row in rows[1:]:
    cols = row.find_all(["td", "th"])
    cols = [c.get_text(strip=True) for c in cols]
    if cols:
        data.append(cols)

# Build DataFrame
headers = [h.get_text(strip=True) for h in rows[0].find_all("th")]
df_scrape = pd.DataFrame(data, columns=headers)

# Validation (e.g., Symbol = text, CIK = numeric)
df_scrape["CIK"] = pd.to_numeric(df_scrape["CIK"], errors="coerce")
assert df_scrape["Symbol"].dtype == object
print("Scraped shape:", df_scrape.shape)

# Save raw data
df_scrape.to_csv("data/raw/sp500_companies.csv", index=False)
print(df_scrape)

Scraped shape: (503, 8)
    Symbol             Security              GICSSector  \
0      MMM                   3M             Industrials   
1      AOS          A. O. Smith             Industrials   
2      ABT  Abbott Laboratories             Health Care   
3     ABBV               AbbVie             Health Care   
4      ACN            Accenture  Information Technology   
..     ...                  ...                     ...   
498    XYL           Xylem Inc.             Industrials   
499    YUM          Yum! Brands  Consumer Discretionary   
500   ZBRA   Zebra Technologies  Information Technology   
501    ZBH        Zimmer Biomet             Health Care   
502    ZTS               Zoetis             Health Care   

                                GICS Sub-Industry    Headquarters Location  \
0                        Industrial Conglomerates    Saint Paul, Minnesota   
1                               Building Products     Milwaukee, Wisconsin   
2                           Healt