In [2]:
# Cell 0 — Environment Setup
import os
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv

# Read variables from the .env file
load_dotenv()

# Raw-data folder: value of DATA_DIR_RAW, or "data/raw" when it is not set.
# The folder (and any missing parents) is created right away.
DATA_DIR_RAW = Path(os.environ.get("DATA_DIR_RAW", "data/raw"))
DATA_DIR_RAW.mkdir(parents=True, exist_ok=True)

# Processed-data folder: value of DATA_DIR_PROCESSED, or "data/processed".
DATA_DIR_PROCESSED = Path(os.environ.get("DATA_DIR_PROCESSED", "data/processed"))
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

# Report the absolute locations that the rest of the notebook will use
print("Raw data path:     ", DATA_DIR_RAW.resolve())
print("Processed data path:", DATA_DIR_PROCESSED.resolve())


Raw data path:      /Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework5/data/raw
Processed data path: /Users/seanice/Desktop/Bootcamp/bootcamp_Linxi_Pan/homework/homework5/data/processed


In [3]:
# Cell 1 — Load Starter Data

# Read the starter dataset; the path is relative to the current working directory
df = pd.read_csv("starter_data.csv")

# Quick summary: dimensions, column names, and missing values per column
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"NA counts:\n {df.isna().sum()}")

# Last expression of the cell: display the first 5 rows
df.head()

Shape: (10, 3)
Columns: ['category', 'value', 'date']
NA counts:
 category    0
value       0
date        0
dtype: int64


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [None]:
# Cell 2 — Save DataFrame to CSV and Parquet

import time
import fastparquet


# Timestamp (minute resolution) used to give each run's files a distinct name
ts = time.strftime("%Y%m%d-%H%M")

# Save to CSV in raw directory
csv_path = DATA_DIR_RAW / f"sample_{ts}.csv"
df.to_csv(csv_path, index=False)
print("Saved CSV to:", csv_path)

# Save to Parquet in processed directory.
# The destination path has to exist as a variable before it is passed to
# to_parquet, otherwise a freshly started kernel raises NameError here.
parquet_path = DATA_DIR_PROCESSED / f"sample_{ts}.parquet"
df.to_parquet(parquet_path, index=False, engine="fastparquet")
print("Saved Parquet to:", parquet_path)


Saved CSV to: data/raw/sample_20250820-1831.csv
Saved Parquet to: data/processed/sample_20250820-1831.parquet


In [16]:
# Cell 3 — Reload and Validate

# Reload the CSV from csv_path (this cell does not define it; it must already
# exist in the kernel)
df_csv = pd.read_csv(csv_path)

# Reload the Parquet file from parquet_path: pyarrow is tried first, and if it
# raises any exception the read is retried with fastparquet. If fastparquet
# fails as well, that second exception propagates.
try:
    df_parquet = pd.read_parquet(parquet_path, engine="pyarrow")
except Exception:
    df_parquet = pd.read_parquet(parquet_path, engine="fastparquet")

# Validation function
def validate_dataframes(df1, df2):
    """Compare two DataFrames structurally and return the outcome of each check.

    Parameters
    ----------
    df1, df2 : pd.DataFrame
        The frames to compare. ``df1`` is the reference: its columns drive the
        dtype check.

    Returns
    -------
    dict
        ``shape_equal``   - True when both frames have the same ``.shape``.
        ``columns_equal`` - True when the column labels match in the same order.
        ``dtypes_equal``  - True when every column of ``df1`` is present in
        ``df2`` with the same dtype (compared by dtype name). A column missing
        from ``df2`` counts as a mismatch rather than raising ``KeyError``.
    """
    results = {}
    results["shape_equal"] = df1.shape == df2.shape
    results["columns_equal"] = list(df1.columns) == list(df2.columns)
    # Check membership first so that frames with different columns yield
    # dtypes_equal=False instead of failing on the df2[c] lookup.
    results["dtypes_equal"] = all(
        c in df2.columns and str(df1[c].dtype) == str(df2[c].dtype)
        for c in df1.columns
    )
    return results

# Compare the two reloaded frames and print the per-check outcome
validation_results = validate_dataframes(df_csv, df_parquet)
print(f"Validation results: {validation_results}")

# Last expression of the cell: display the first rows of the CSV copy
df_csv.head()



Validation results: {'shape_equal': True, 'columns_equal': True, 'dtypes_equal': True}


Unnamed: 0,category,value,date
0,A,10,2025-08-01
1,B,15,2025-08-02
2,A,12,2025-08-03
3,B,18,2025-08-04
4,C,25,2025-08-05


In [17]:
# Cell 4 — Utility Functions

def write_df(df: pd.DataFrame, path: "Path | str"):
    """Save a DataFrame to CSV or Parquet, chosen by the extension of ``path``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist. The index is not written.
    path : Path or str
        Destination file ending in ``.csv`` or ``.parquet`` (case-insensitive).
        Missing parent directories are created.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``. Raised before
        any directory is created, mirroring ``read_df``.
    RuntimeError
        If a Parquet write fails with both pyarrow and fastparquet.

    Notes
    -----
    Errors from ``DataFrame.to_csv`` propagate unchanged, so a failed write is
    never mistaken for a successful one by the calling cell.
    """
    path = Path(path)
    suffix = path.suffix.lower()

    # Validate first: an unsupported target must not leave directories behind.
    if suffix not in (".csv", ".parquet"):
        raise ValueError(f"Unsupported file extension: {suffix}")

    path.parent.mkdir(parents=True, exist_ok=True)

    if suffix == ".csv":
        df.to_csv(path, index=False)
        print("Saved CSV:", path)
        return

    # Parquet: prefer pyarrow, fall back to fastparquet on any failure.
    try:
        df.to_parquet(path, index=False, engine="pyarrow")
        print("Saved Parquet with pyarrow:", path)
    except Exception as e1:
        print("pyarrow failed:", e1)
        try:
            df.to_parquet(path, index=False, engine="fastparquet")
            print("Saved Parquet with fastparquet:", path)
        except Exception as e2:
            print("Both pyarrow and fastparquet failed:", e2)
            # Surface the failure to the caller instead of returning silently.
            raise RuntimeError(
                f"Could not write Parquet file {path}: "
                f"pyarrow: {e1}; fastparquet: {e2}"
            ) from e2

def read_df(path: "Path | str") -> pd.DataFrame:
    """Load a DataFrame from CSV or Parquet, chosen by the extension of ``path``.

    Parameters
    ----------
    path : Path or str
        File ending in ``.csv`` or ``.parquet`` (case-insensitive).

    Returns
    -------
    pd.DataFrame
        The loaded frame. Parquet files are read with pyarrow; if that raises,
        the read is retried with fastparquet.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist (checked before the extension).
    ValueError
        If the extension is neither ``.csv`` nor ``.parquet``.
    """
    # Coerce so that plain string paths work as well as Path objects.
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File does not exist: {path}")

    suffix = path.suffix.lower()
    if suffix == ".csv":
        return pd.read_csv(path)
    if suffix == ".parquet":
        try:
            return pd.read_parquet(path, engine="pyarrow")
        except Exception:
            # Any pyarrow failure triggers a retry; if fastparquet fails too,
            # that exception propagates to the caller.
            return pd.read_parquet(path, engine="fastparquet")
    raise ValueError(f"Unsupported file extension: {suffix}")

# --- Test utilities ---
# Round-trip the starter frame through both helpers, one file per format.
test_csv = DATA_DIR_RAW.joinpath("util_test.csv")
test_parquet = DATA_DIR_PROCESSED.joinpath("util_test.parquet")

# Write both files, then read each one back
write_df(df, test_csv)
write_df(df, test_parquet)

df_loaded_csv = read_df(test_csv)
df_loaded_parquet = read_df(test_parquet)

# Report the dimensions of what came back from disk
print(f"CSV shape: {df_loaded_csv.shape}")
print(f"Parquet shape: {df_loaded_parquet.shape}")


Saved CSV: data/raw/util_test.csv
pyarrow failed: A type extension with name pandas.period already defined
Saved Parquet with fastparquet: data/processed/util_test.parquet
CSV shape: (10, 3)
Parquet shape: (10, 3)


## Data Storage Documentation

### Folder Structure
- **data/raw/**  
  Stores raw ingested data in CSV format.  
- **data/processed/**  
  Stores processed data in Parquet format.  

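The paths are defined in the `.env` file (the values shown are also the defaults used when a variable is not set):

```
DATA_DIR_RAW=data/raw
DATA_DIR_PROCESSED=data/processed
```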

### Formats Used
- **CSV (Comma-Separated Values)**  
  - Human-readable, universal support.  
  - Used for raw storage to ensure transparency.  
- **Parquet (Columnar Storage)**  
  - Efficient for analytical workloads, compressed and faster for large-scale reads.  
  - Used for processed storage to optimize space and performance.  

### How the Code Reads/Writes
- `write_df(df, path)` automatically chooses CSV or Parquet based on file extension.  
- `read_df(path)` loads the file accordingly, with fallback between `pyarrow` and `fastparquet` engines for Parquet.  
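- `write_df` creates the target's parent directory if it does not exist and reports which Parquet engine failed; `read_df` raises `FileNotFoundError` for a missing file and `ValueError` for an unsupported extension.  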

### Validation
- Shapes and columns between CSV and Parquet are compared.  
- The dtype of every column in the reloaded CSV frame is compared with the same column in the Parquet frame to ensure consistency.  

### Assumptions & Risks
- CSV files are larger but universally compatible.  
- Parquet requires external engines (`pyarrow` or `fastparquet`), which may cause compatibility issues.  
- Website or API schema changes may affect ingested data structure.  
- `.env` must **not** be committed to GitHub; only `.env.example` should be shared.  
