# 01 — Data Ingestion

This notebook loads the six core datasets, prints their row and column counts,
saves a data profile, builds the combined ~40k-row sample, and runs a quick
sanity check on the result.

In [None]:
import sys
from pathlib import Path

# Ensure project root is on the path.
# When the kernel starts inside "notebooks/", the project root is its parent;
# otherwise the current working directory is treated as the root.
cwd = Path.cwd()
PROJECT_ROOT = cwd.parent if cwd.name == "notebooks" else cwd

# Remove any existing copies before inserting so that re-running this cell
# does not pile up duplicate entries, while the root still ends up first.
root_entry = str(PROJECT_ROOT)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)
print("Project root:", PROJECT_ROOT)

In [None]:
from src.data.load_data import load_all_data, save_profile

# Load every dataset from the project's data directory.
DATA_DIR = PROJECT_ROOT / "data"
datasets = load_all_data(DATA_DIR)

# Print a fixed-width summary table: one line per dataset with its size.
header = f"{'Dataset':<30s}  {'Rows':>8s}  {'Cols':>5s}"
separator = "-" * 48
print(header)
print(separator)
for name, df in datasets.items():
    row_count = len(df)
    col_count = len(df.columns)
    print(f"{name:<30s}  {row_count:>8,}  {col_count:>5}")

In [None]:
# Write the data profile to data_profile.json inside DATA_DIR
profile_target = DATA_DIR / "data_profile.json"
profile_path = save_profile(datasets, profile_target)
print("Profile saved →", profile_path)

In [None]:
# Build the combined ~40k sample.
# The generator script runs in a child process that uses this kernel's own
# interpreter, with the project root as its working directory.
import subprocess
import sys as _sys

script = PROJECT_ROOT / "scripts" / "make_combined_40k.py"
result = subprocess.run(
    [_sys.executable, str(script)],
    cwd=str(PROJECT_ROOT),
    capture_output=True,
    text=True,
)
print(result.stdout)
if result.returncode != 0:
    print("STDERR:", result.stderr)
    # Stop the notebook here: later cells would otherwise continue with
    # whatever output (if any) a previous run left behind, hiding the failure.
    raise RuntimeError(
        f"{script.name} exited with return code {result.returncode}"
    )

In [None]:
import pandas as pd

# Read the combined sample back from parquet and preview the first rows.
pq_path = DATA_DIR / "Combined_Dataset_40k.parquet"
combined = pd.read_parquet(pq_path)
n_rows, n_cols = combined.shape
print(f"Combined Dataset: {n_rows:,} rows  ×  {n_cols} cols\n")
combined.head()

In [None]:
# Descriptive statistics for the combined sample (part of the quick sanity check)
combined.describe()

In [None]:
# Per-column overview: dtype, non-null count, missing count, and the share of
# missing values as a percentage rounded to one decimal place.
missing_counts = combined.isna().sum()
present_counts = combined.notna().sum()
missing_share = (missing_counts / len(combined) * 100).round(1)

info_df = pd.DataFrame(
    {
        "dtype": combined.dtypes,
        "non_null": present_counts,
        "missing": missing_counts,
        "missing_%": missing_share,
    }
)
info_df