In [10]:
from dotenv import load_dotenv
import os
from pathlib import Path
from datetime import datetime
import pandas as pd

# Load environment variables via python-dotenv before they are read below.
load_dotenv(override=True)

# Data locations come from the environment, with relative-path defaults.
RAW = Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
PROC = Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))

# Create both directories (and any missing parents) so they exist before
# anything is written; exist_ok keeps the cell safe to re-run.
for _data_dir in (RAW, PROC):
    _data_dir.mkdir(parents=True, exist_ok=True)

(RAW, PROC)  # echo the resolved paths as the cell output

(PosixPath('data/raw'), PosixPath('data/processed'))

In [11]:
# Three-row sample frame with one datetime, one float and one string column,
# used to exercise the storage round-trip in the following cells.
df = pd.DataFrame(
    dict(
        Date=pd.to_datetime(["2025-01-01", "2025-01-02", "2025-01-03"]),
        Value=[1.0, 2.5, 3.3],
        Category=list("ABA"),
    )
)
df

Unnamed: 0,Date,Value,Category
0,2025-01-01,1.0,A
1,2025-01-02,2.5,B
2,2025-01-03,3.3,A


In [12]:
from src.storage import write_df

# Minute-resolution timestamp shared by both output file names.
ts = f"{datetime.now():%Y%m%d-%H%M}"
csv_path = RAW / f"sample_{ts}.csv"
parq_path = PROC / f"sample_{ts}.parquet"

# The CSV write is unconditional.
write_df(df, csv_path)

# The Parquet write is best-effort: a RuntimeError is printed, not re-raised
# (if pyarrow is missing, you'll see the message here), and the outcome is
# recorded in wrote_parquet for the validation cell.
try:
    write_df(df, parq_path)
except RuntimeError as err:
    print(err)
    wrote_parquet = False
else:
    wrote_parquet = True

(csv_path, parq_path, wrote_parquet)

(PosixPath('data/raw/sample_20250827-1354.csv'),
 PosixPath('data/processed/sample_20250827-1354.parquet'),
 True)

In [13]:
from src.storage import read_df

def validate_reload(orig: pd.DataFrame, csv_p: Path, parq_p: Path | None = None):
    # CSV round-trip
    df_csv = read_df(csv_p, parse_dates=["Date"])
    assert df_csv.shape == orig.shape, "CSV shape mismatch"
    assert pd.api.types.is_datetime64_any_dtype(df_csv["Date"]), "CSV Date not datetime"
    assert pd.api.types.is_float_dtype(df_csv["Value"]), "CSV Value not float"
    assert pd.api.types.is_object_dtype(df_csv["Category"]), "CSV Category not object/str"

    # Parquet round-trip (if written)
    if parq_p and parq_p.exists():
        df_parq = read_df(parq_p)
        assert df_parq.shape == orig.shape, "Parquet shape mismatch"
        assert pd.api.types.is_datetime64_any_dtype(df_parq["Date"]), "Parquet Date not datetime"
        assert pd.api.types.is_float_dtype(df_parq["Value"]), "Parquet Value not float"
        assert pd.api.types.is_object_dtype(df_parq["Category"]), "Parquet Category not object/str"

    print("Validation passed")

# Pass the Parquet path only when the write cell reported success;
# otherwise validate the CSV alone.
validate_reload(
    df,
    csv_path,
    parq_path if wrote_parquet else None,
)

Validation passed
