In [1]:
pip install pandas python-dotenv pyarrow


Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl.metadata (91 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pyarrow
  Downloading pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.2-cp313-cp313-macosx_10_13_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp313-cp313-macosx_10_13_x86_64.whl (11.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hUsing cached python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Downloading pyarrow-21.0.0-cp313-cp313-macosx_12_0_x86_64.whl (32.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import pandas as pd
from dotenv import load_dotenv
from io import StringIO

# Let python-dotenv populate the environment before the lookups below.
load_dotenv()

# Data locations: taken from the environment when set, otherwise relative
# defaults are used.
DATA_DIR_RAW = os.environ.get("DATA_DIR_RAW", "./data/raw")
DATA_DIR_PROCESSED = os.environ.get("DATA_DIR_PROCESSED", "./data/processed")

# Three-row sample frame (integer id, float value, string label) used for the
# write/read round trip further down.
df = pd.DataFrame(
    [
        (1, 10.5, "A"),
        (2, 20.3, "B"),
        (3, 30.7, "C"),
    ],
    columns=["id", "value", "label"],
)

def write_df(df, path):
    """Write ``df`` to ``path``, picking the format from the file extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. The index is not written.
    path : str or os.PathLike
        Destination whose extension is ``csv`` or ``parquet``
        (case-insensitive). Missing parent directories are created.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``parquet``. This is raised
        before anything is created on disk.

    Notes
    -----
    An ``ImportError`` raised by ``DataFrame.to_parquet`` is reported with a
    printed message instead of propagating; no file is written in that case.
    """
    path = os.fspath(path)

    # splitext only inspects the final path component, so dots in directory
    # names cannot be mistaken for an extension.
    ext = os.path.splitext(path)[1].lstrip(".").lower()
    if ext not in ("csv", "parquet"):
        raise ValueError("Unsupported file type.")

    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    if ext == "csv":
        df.to_csv(path, index=False)
    else:
        try:
            df.to_parquet(path, index=False)
        except ImportError:
            print("Parquet engine missing.")

def read_df(path):
    """Load a DataFrame from ``path``, picking the reader from the extension.

    Parameters
    ----------
    path : str or os.PathLike
        File whose extension is ``csv`` or ``parquet`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. ``None`` is returned when ``pandas.read_parquet``
        raises ``ImportError``; a message is printed in that case.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``parquet``.
    """
    path = os.fspath(path)

    # splitext only inspects the final path component, so dots in directory
    # names cannot be mistaken for an extension.
    ext = os.path.splitext(path)[1].lstrip(".").lower()

    if ext == "csv":
        return pd.read_csv(path)
    if ext == "parquet":
        try:
            return pd.read_parquet(path)
        except ImportError:
            print("Parquet engine missing. Install pyarrow or fastparquet.")
            return None
    raise ValueError("Unsupported file type.")

def validate(df1, df2, cols):
    """Print whether two frames agree in shape and in the dtypes of ``cols``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Shapes are compared on the whole frames.
    cols : list
        Column labels whose dtypes are compared; must be present in both
        frames.

    Returns
    -------
    None
        Both results are reported with ``print`` only.
    """
    same_shape = df1.shape == df2.shape
    print("Shape matches:", same_shape)

    # Both dtype Series are indexed by ``cols``, so the comparison is
    # element-wise per column.
    left_types = df1[cols].dtypes
    right_types = df2[cols].dtypes
    dtypes_match = all(left_types == right_types)
    print("Dtypes match for columns", cols, ":", dtypes_match)

# Build the target paths with os.path.join so a trailing separator in the
# configured directories does not produce a doubled "/".
csv_path = os.path.join(DATA_DIR_RAW, "sample.csv")
parquet_path = os.path.join(DATA_DIR_PROCESSED, "sample.parquet")

# Persist the sample frame in both formats.
write_df(df, csv_path)
write_df(df, parquet_path)

# Read both files back.
df_csv = read_df(csv_path)
df_parquet = read_df(parquet_path)

# read_df returns None when no parquet engine is installed; skip the
# comparison in that case instead of failing inside validate().
if df_csv is None or df_parquet is None:
    print("Skipping validation: a file could not be read back.")
else:
    validate(df_csv, df_parquet, cols=["id", "value"])


Shape matches: True
Dtypes match for columns ['id', 'value'] : True
