In [15]:

import os, pathlib, datetime as dt
import pandas as pd
from dotenv import load_dotenv
from typing import Union

In [16]:
# Load environment variables from a .env file, if one is present.
load_dotenv()

# Storage roots. The second argument to os.getenv is the fallback used when
# the variable is not set.
# NOTE(review): both fallbacks are absolute paths on one specific machine;
# set DATA_DIR_RAW / DATA_DIR_PROCESSED to run this notebook elsewhere.
RAW_DIR = pathlib.Path(
    os.getenv("DATA_DIR_RAW", "d:/文心远/研究生/5040-Bootcamp/project/data/raw")
)
PROC_DIR = pathlib.Path(
    os.getenv("DATA_DIR_PROCESSED", "d:/文心远/研究生/5040-Bootcamp/project/data/processed")
)

# Create both folders (and any missing ancestors) so later cells can write into them.
for _data_dir in (RAW_DIR, PROC_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the reader can see where files will land.
for _label, _data_dir in (("RAW_DIR:", RAW_DIR), ("PROC_DIR:", PROC_DIR)):
    print(_label, _data_dir.resolve())

RAW_DIR: D:\文心远\研究生\5040-Bootcamp\project\data\raw
PROC_DIR: D:\文心远\研究生\5040-Bootcamp\project\data\processed


In [17]:
# Load the Stage 4 CSV snapshot from RAW_DIR, parsing 'date' as a datetime column.
csv_path = RAW_DIR / "api_source-yfinance_symbol-MSFT_20250820-200354.csv"
df = pd.read_csv(csv_path, parse_dates=['date'])
print("Loaded Stage 4 CSV:")
print(df.head())
# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

Loaded Stage 4 CSV:
        date   adj_close
0 2025-02-21  407.461945
1 2025-02-24  403.259674
2 2025-02-25  397.170837
3 2025-02-26  398.997498
4 2025-02-27  391.810699
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       125 non-null    datetime64[ns]
 1   adj_close  125 non-null    float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 2.1 KB
None


In [18]:
def ensure_dir(path: pathlib.Path):
    """Create the directory that will contain ``path``.

    Only ``path.parent`` is created, together with any missing ancestors;
    ``path`` itself is not created. An already existing directory is not
    an error.

    Args:
        path: File location whose parent directory must exist.
    """
    containing_dir = path.parent
    containing_dir.mkdir(parents=True, exist_ok=True)

def detect_format(path: Union[str, pathlib.Path]):
    """Map a file path to a storage format name based on its ending.

    The comparison is case-insensitive and is done on the full string form
    of ``path``.

    Args:
        path: File path as a string or ``pathlib.Path``.

    Returns:
        ``'csv'`` for paths ending in ``.csv``; ``'parquet'`` for paths
        ending in ``.parquet``, ``.pq`` or ``.parq``.

    Raises:
        ValueError: If the path ends in none of the recognised extensions.
    """
    lowered = str(path).lower()
    known_formats = (
        ('csv', ('.csv',)),
        ('parquet', ('.parquet', '.pq', '.parq')),
    )
    for format_name, extensions in known_formats:
        # str.endswith accepts a tuple and matches any of its members.
        if lowered.endswith(extensions):
            return format_name
    raise ValueError('Unsupported format for: ' + str(path))

def write_df(df: pd.DataFrame, path: Union[str, pathlib.Path]):
    """Write ``df`` to ``path`` in the format implied by the file extension.

    The parent directory is created first if it does not exist. CSV files
    are written without the index; Parquet files use pandas' default
    ``to_parquet`` settings.

    Args:
        df: Frame to persist.
        path: Destination file; its extension selects the format
            (see ``detect_format``).

    Returns:
        The destination as a ``pathlib.Path``.

    Raises:
        ValueError: If the extension is not a supported format.
        RuntimeError: If no Parquet engine (pyarrow / fastparquet) can be
            imported. Any other write failure propagates unchanged.
    """
    path = pathlib.Path(path)
    ensure_dir(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        df.to_csv(path, index=False)
    elif fmt == 'parquet':
        try:
            df.to_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when no Parquet engine can be found.
            # Only that case is translated; catching every Exception here
            # would mislabel unrelated failures (permissions, unsupported
            # dtypes, full disk) as a missing engine.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e
    return path

def read_df(path: Union[str, pathlib.Path]):
    """Read a DataFrame from ``path`` in the format implied by its extension.

    For CSV files the header is inspected first; if it contains a ``date``
    column, that column is parsed as datetimes, otherwise the file is read
    with pandas' defaults.

    Args:
        path: Source file; its extension selects the format
            (see ``detect_format``).

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is not a supported format.
        RuntimeError: If no Parquet engine (pyarrow / fastparquet) can be
            imported. Any other read failure (e.g. a missing file)
            propagates unchanged.
    """
    path = pathlib.Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        # nrows=0 reads only the header row, which is enough to learn the column names.
        header = pd.read_csv(path, nrows=0).columns
        if 'date' in header:
            return pd.read_csv(path, parse_dates=['date'])
        return pd.read_csv(path)
    elif fmt == 'parquet':
        try:
            return pd.read_parquet(path)
        except ImportError as e:
            # pandas raises ImportError when no Parquet engine can be found.
            # Only that case is translated; catching every Exception here
            # would report a missing or corrupt file as a missing engine.
            raise RuntimeError('Parquet engine not available. Install pyarrow or fastparquet.') from e


Save to CSV (raw)

In [19]:
# Collect the matching Stage 4 CSV files in RAW_DIR.
# NOTE(review): the pattern contains no wildcard, so at most one file can match.
csv_files = [*RAW_DIR.glob("api_source-yfinance_symbol-MSFT_20250820-200354.csv")]
if len(csv_files) == 0:
    raise FileNotFoundError("No Stage 4 CSV found in RAW_DIR")

# The greatest path in sort order is the one that sorts last.
csv_path = max(csv_files)
df_api = pd.read_csv(csv_path, parse_dates=['date'])
print("Loaded Stage 4 CSV:", csv_path)
print(df_api.head())

Loaded Stage 4 CSV: d:\文心远\研究生\5040-Bootcamp\project\data\raw\api_source-yfinance_symbol-MSFT_20250820-200354.csv
        date   adj_close
0 2025-02-21  407.461945
1 2025-02-24  403.259674
2 2025-02-25  397.170837
3 2025-02-26  398.997498
4 2025-02-27  391.810699


In [20]:
def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string."""
    # A datetime's format spec is passed to strftime, so this is the same
    # as calling .strftime('%Y%m%d-%H%M%S') on the current time.
    return f"{dt.datetime.now():%Y%m%d-%H%M%S}"
    
# Save the API frame under a fresh timestamped name; write_df returns the
# path it wrote to, which is kept for the reload step.
csv_path = write_df(df_api, RAW_DIR / f"MSFT_raw_{ts()}.csv")
print("Saved CSV →", csv_path)


Saved CSV → d:\文心远\研究生\5040-Bootcamp\project\data\raw\MSFT_raw_20250820-205444.csv


Save to Parquet (processed)

In [21]:
parq_path = PROC_DIR / f"prices_{ts()}.parquet"
try:
    # Use the notebook's write_df helper instead of calling to_parquet
    # directly, so the CSV and Parquet saves share one code path.
    write_df(df, parq_path)
    print("Saved Parquet →", parq_path)
except RuntimeError as e:
    # write_df raises RuntimeError when the Parquet write cannot be done;
    # keep the demo best-effort and report the failure instead of aborting.
    print("Parquet save failed (engine missing?). Skipping Parquet demo.")
    # Show the underlying error when one is chained, since it carries the details.
    print("Error:", e.__cause__ or e)

Saved Parquet → d:\文心远\研究生\5040-Bootcamp\project\data\processed\prices_20250820-205446.parquet


Reload & Validate

In [22]:
def validate_loaded(original: pd.DataFrame, reloaded: pd.DataFrame, cols=('date','adj_close')):
    """Run basic sanity checks on a frame that was reloaded from disk.

    Args:
        original: Frame as it was before saving.
        reloaded: Frame read back from storage.
        cols: Column names that must be present in ``reloaded``.

    Returns:
        A dict of named boolean checks:
        ``shape_equal`` - both frames have the same shape;
        ``cols_present`` - every name in ``cols`` is a column of ``reloaded``;
        ``adj_close_is_numeric`` - only when ``reloaded`` has ``adj_close``;
        ``date_is_datetime`` - only when ``reloaded`` has ``date``.
    """
    columns = reloaded.columns
    missing = [name for name in cols if name not in columns]
    report = {
        'shape_equal': original.shape == reloaded.shape,
        'cols_present': not missing,
    }

    # (column, result key, dtype predicate) - evaluated only when the column exists.
    dtype_checks = (
        ('adj_close', 'adj_close_is_numeric', pd.api.types.is_numeric_dtype),
        ('date', 'date_is_datetime', pd.api.types.is_datetime64_any_dtype),
    )
    for column, key, predicate in dtype_checks:
        if column in columns:
            report[key] = predicate(reloaded[column])
    return report

# Reload both files through the notebook's read_df helper (it parses the
# 'date' column for CSV files) rather than calling pandas readers directly.
df_csv = read_df(csv_path)
# Compare against df_api, the frame that was written to this CSV.
print('CSV validation:', validate_loaded(df_api, df_csv))

df_parq = read_df(parq_path)
# Compare against df, the frame that was written to the Parquet file.
print('Parquet validation:', validate_loaded(df, df_parq))

CSV validation: {'shape_equal': True, 'cols_present': True, 'adj_close_is_numeric': True, 'date_is_datetime': True}
Parquet validation: {'shape_equal': True, 'cols_present': True, 'adj_close_is_numeric': True, 'date_is_datetime': True}
