In [1]:
pip install pandas python-dotenv pyarrow


Collecting pyarrow
  Downloading pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl.metadata (3.3 kB)
Downloading pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl (32.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.7/32.7 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-21.0.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pandas as pd
from dotenv import load_dotenv
from io import StringIO

# Bring in variables via python-dotenv, then resolve the two data directories.
# Each directory falls back to a relative default when its variable is unset.
load_dotenv()
DATA_DIR_RAW = os.environ.get("DATA_DIR_RAW", "data/raw")
DATA_DIR_PROCESSED = os.environ.get("DATA_DIR_PROCESSED", "data/processed")

# Three-row demo frame: an integer key, a float measurement and a string tag.
df = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        value=[10.5, 20.3, 30.7],
        label=["A", "B", "C"],
    )
)

def write_df(df, path):
    """Write ``df`` to ``path`` as CSV or Parquet, chosen by the file extension.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist. The index is not written.
    path : str or os.PathLike
        Destination whose extension is ``csv`` or ``parquet`` (case-insensitive).
        Missing parent directories are created.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``parquet``.
    ImportError
        If writing Parquet fails because no Parquet engine is installed.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as plain strings
    ext = os.path.splitext(path)[1].lstrip(".").lower()
    if ext not in ("csv", "parquet"):
        # Reject before touching the filesystem so a bad path leaves no stray directories.
        raise ValueError("Unsupported file type.")

    parent = os.path.dirname(path)
    if parent:
        # A bare filename has an empty dirname, and os.makedirs("") raises.
        os.makedirs(parent, exist_ok=True)

    if ext == "csv":
        df.to_csv(path, index=False)
        return

    try:
        df.to_parquet(path, index=False)
    except ImportError as exc:
        # Fail loudly: printing and returning would leave the caller believing
        # the file had been written.
        raise ImportError(
            "Parquet engine missing. Install pyarrow or fastparquet."
        ) from exc

def read_df(path):
    """Load a DataFrame from a CSV or Parquet file, chosen by the file extension.

    Parameters
    ----------
    path : str or os.PathLike
        Source file whose extension is ``csv`` or ``parquet`` (case-insensitive).

    Returns
    -------
    pandas.DataFrame
        The frame read from ``path``.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``parquet``.
    ImportError
        If reading Parquet fails because no Parquet engine is installed.
    """
    path = os.fspath(path)  # accept pathlib.Path as well as plain strings
    ext = os.path.splitext(path)[1].lstrip(".").lower()

    if ext == "csv":
        return pd.read_csv(path)

    if ext == "parquet":
        try:
            return pd.read_parquet(path)
        except ImportError as exc:
            # Raise instead of returning None so the failure surfaces here
            # and not later when the caller uses the result.
            raise ImportError(
                "Parquet engine missing. Install pyarrow or fastparquet."
            ) from exc

    raise ValueError("Unsupported file type.")

def validate(df1, df2, cols):
    """Print whether two frames agree in shape and in the dtypes of ``cols``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare.
    cols : str or iterable of str
        Column name(s) whose dtypes are compared. A single name may be passed
        as a plain string.

    Returns
    -------
    None
        Results are only printed; the shape check covers the whole frames,
        the dtype check covers only ``cols``.
    """
    # A bare string would select a Series, whose .dtypes is a scalar rather
    # than a per-column Series; normalise so both cases compare column-wise.
    col_list = [cols] if isinstance(cols, str) else list(cols)

    print("Shape matches:", df1.shape == df2.shape)

    left_dtypes = df1[col_list].dtypes
    right_dtypes = df2[col_list].dtypes
    dtypes_match = bool((left_dtypes == right_dtypes).all())
    print("Dtypes match for columns", cols, ":", dtypes_match)

# Output locations: the CSV lands under the raw dir, the Parquet file under the processed dir.
csv_path = "/".join((DATA_DIR_RAW, "sample.csv"))
parquet_path = "/".join((DATA_DIR_PROCESSED, "sample.parquet"))

# Persist the sample frame in both formats (CSV first, then Parquet).
for _target in (csv_path, parquet_path):
    write_df(df, _target)

# Read both files back in the same order.
df_csv, df_parquet = (read_df(_source) for _source in (csv_path, parquet_path))

# Compare overall shape and the dtypes of the "id" and "value" columns across the two round-trips.
validate(df_csv, df_parquet, cols=["id", "value"])


Shape matches: True
Dtypes match for columns ['id', 'value'] : True
