In [23]:
import os, json, time, datetime as dt, csv, pathlib
from typing import Dict, List

import pandas as pd
import requests
import yfinance as yf
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load variables from a local .env file first so the override below can see them.
load_dotenv()

# Directory that receives the raw CSV extracts.
# A hard-coded absolute path only works on one machine, so resolve it instead:
#   1. DATA_DIR_RAW from the environment / .env, when set and non-empty;
#   2. otherwise <parent of the working directory>/data/raw, the same
#      project-relative rule this notebook uses for RAW_DIR further down.
# NOTE(review): the fallback assumes the kernel's working directory is a direct
# subfolder of the project root -- confirm for your launch setup.
DATA_RAW = pathlib.Path(
    os.getenv("DATA_DIR_RAW") or pathlib.Path().resolve().parent / "data" / "raw"
)
DATA_RAW.mkdir(parents=True, exist_ok=True)

True

Functions

In [24]:
def safe_stamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    stamp_format = "%Y%m%d-%H%M%S"
    current = dt.datetime.now()
    return current.strftime(stamp_format)


def safe_filename(prefix: str, meta: Dict[str, str]) -> str:
    """Build a timestamped CSV file name from a prefix and metadata pairs.

    The result has the form ``{prefix}_{key}-{value}_..._{stamp}.csv`` where
    ``stamp`` is whatever ``safe_stamp()`` returns at call time.

    Args:
        prefix: Leading label; inserted verbatim.
        meta: Metadata to embed. Each pair is rendered as ``key-value`` in the
            mapping's iteration order. Keys and values are converted with
            ``str`` and sanitized; values are additionally cut to 20 characters.

    Returns:
        The file name only (no directory component).
    """
    def _clean(text) -> str:
        # Keep letters and digits of any script plus "-", "_" and ".".
        # Everything else (spaces, "/", "\\", ":", "*", "?", quotes, ...) becomes
        # "-" so a metadata value can neither add a directory level nor produce
        # a name the file system rejects. Previously only spaces were replaced.
        return "".join(ch if ch.isalnum() or ch in "-_." else "-" for ch in str(text))

    # Sanitize first, then truncate -- same order as the original space replacement.
    mid = "_".join(f"{_clean(k)}-{_clean(v)[:20]}" for k, v in meta.items())
    return f"{prefix}_{mid}_{safe_stamp()}.csv"


def fetch_stock_data(ticker: str, start: str, end: str) -> pd.DataFrame:
    """Download price bars for one ticker through ``yfinance``.

    Args:
        ticker: Symbol handed to ``yf.download``.
        start: Start of the date range, passed through to ``yf.download``.
        end: End of the date range, passed through to ``yf.download``.

    Returns:
        A DataFrame restricted to the columns ``Date``, ``Open``, ``High``,
        ``Low``, ``Close``, ``Adj Close`` and ``Volume``, in that order.

    Raises:
        KeyError: If any of those columns is absent from the downloaded frame.
    """
    import yfinance as yf

    # Request unadjusted prices explicitly, as the notebook's ingestion cell does:
    # the selection below needs a separate 'Adj Close' column and must not depend
    # on the library's default for auto_adjust.
    df = yf.download(ticker, start=start, end=end, auto_adjust=False)

    # NOTE(review): some yfinance releases appear to return a two-level
    # (price, ticker) column header even for a single symbol -- confirm against
    # the installed version. When the trailing level carries just one value it
    # adds no information, so drop it to hand callers plain column names.
    cols = df.columns
    if (
        isinstance(cols, pd.MultiIndex)
        and cols.nlevels == 2
        and cols.get_level_values(-1).nunique() == 1
    ):
        df.columns = cols.droplevel(-1)

    # Move the date index into an ordinary 'Date' column.
    df = df.reset_index()
    return df[['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]


def validate_df(df: pd.DataFrame, required_cols: List[str], dtypes_map: Dict[str, str]) -> Dict[str, str]:
    """Run basic schema checks on ``df`` and report the findings as messages.

    Nothing is modified: every conversion is attempted on the column and the
    converted result is discarded.

    Args:
        df: Frame to inspect.
        required_cols: Column names that must be present.
        dtypes_map: Column name -> dtype the column should be convertible to.
            Names starting with ``'datetime'`` are checked with
            ``pd.to_datetime``, ``'float'`` with ``pd.to_numeric``, and any
            other dtype with ``Series.astype`` (previously such dtypes were
            skipped without any check). Columns absent from ``df`` are skipped
            here; they are only reported if listed in ``required_cols``.

    Returns:
        A dict of messages:
          * ``'missing_cols'`` -- present only when required columns are absent;
          * ``'dtype_<col>'`` -- one entry per column whose conversion raised;
          * ``'na_total'`` -- always present, total count of NA cells in ``df``.
    """
    msgs: Dict[str, str] = {}

    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        msgs['missing_cols'] = f"Missing columns: {missing}"

    for col, dtype in dtypes_map.items():
        if col not in df.columns:
            continue
        try:
            if str(dtype).startswith('datetime'):
                pd.to_datetime(df[col])
            elif dtype == 'float':
                pd.to_numeric(df[col])
            else:
                # Generic fallback so every requested dtype is actually verified.
                df[col].astype(dtype)
        except Exception as e:
            # Deliberately broad: any conversion failure is a validation finding,
            # reported to the caller rather than raised.
            msgs[f'dtype_{col}'] = f"Failed to coerce {col} to {dtype}: {e}"

    na_counts = df.isna().sum().sum()
    msgs['na_total'] = f"Total NA values: {na_counts}"
    return msgs

API Ingestion

In [25]:
SYMBOL = "MSFT"

# Pull six months of daily bars. yfinance is imported once, together with the
# notebook's other imports, instead of in the middle of the notebook.
# auto_adjust=False is set explicitly because a separate 'Adj Close' column is
# selected below.
df_api = yf.download(SYMBOL, period="6mo", interval="1d", auto_adjust=False).reset_index()

# Keep the seven expected columns in a fixed order, then give them snake_case
# names. Assigning a flat list (rather than using rename) relies on that order
# and replaces whatever column index the download produced.
df_api = df_api[['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']]
df_api.columns = ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']

# Oldest row first, with a fresh 0..n-1 index.
df_api = df_api.sort_values('date').reset_index(drop=True)

# Stop here if nothing came back, so an empty extract is never written to disk.
# NOTE(review): yfinance appears to report a failed download as an empty frame
# rather than an exception -- confirm for the installed version.
assert not df_api.empty, f"yfinance returned no rows for {SYMBOL}"

[*********************100%***********************]  1 of 1 completed


In [26]:

# Write the ingested frame into the raw-data directory under a timestamped name
# that records where the data came from.
api_meta = {"source": "yfinance", "symbol": SYMBOL}
fname = safe_filename(prefix="api", meta=api_meta)
out_path = DATA_RAW.joinpath(fname)
df_api.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: d:\文心远\研究生\5040-Bootcamp\project\data\raw\api_source-yfinance_symbol-MSFT_20250821-215314.csv


In [27]:
# Check the ingested frame: both columns must exist, 'date' must parse as a
# datetime and 'adj_close' as a number. The report always carries an 'na_total'
# entry, so a clean run prints a dict with only that key.
msgs = validate_df(
    df_api,
    required_cols=['date', 'adj_close'],
    dtypes_map={'date': 'datetime64[ns]', 'adj_close': 'float'},
)
print(msgs)

# Project root is taken to be the parent of the kernel's working directory.
PROJECT_ROOT = pathlib.Path().resolve().parent

RAW_DIR = PROJECT_ROOT / "data" / "raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Use the notebook's timestamp helper rather than repeating its format string,
# so every file written by this notebook is stamped the same way.
ts = safe_stamp()

# NOTE(review): the recorded outputs show this path is identical to the one
# printed by the earlier "Saved:" cell (same folder, same name, same second),
# so this write appears to duplicate that file -- confirm both saves are needed.
csv_path = RAW_DIR / f"api_source-yfinance_symbol-{SYMBOL}_{ts}.csv"
df_api.to_csv(csv_path, index=False)
print("Saved Stage4 CSV:", csv_path)

{'na_total': 'Total NA values: 0'}
Saved Stage4 CSV: D:\文心远\研究生\5040-Bootcamp\project\data\raw\api_source-yfinance_symbol-MSFT_20250821-215314.csv
