In [14]:
"""
Phase-1 Validation
------------------
Finds <repo>/data/processed/clean_hdb.csv from anywhere and sanity-checks it.
Run as .py or in a notebook; paths auto-resolve.
"""
# 1. Imports
from pathlib import Path
import pandas as pd
import sys

# 2. Locate the CSV
# A plain script defines __file__; a notebook kernel does not, so the
# NameError branch falls back to the current working directory.
try:
    START_DIR = Path(__file__).resolve().parent
except NameError:
    START_DIR = Path.cwd()
    print(f"📓 notebook run; cwd = {START_DIR}")

target_rel = Path("data/processed/clean_hdb.csv")

# Probe START_DIR first, then each ancestor up to the filesystem root; the
# first directory that contains the relative target becomes `root`.
for root in (START_DIR, *START_DIR.parents):
    if (root / target_rel).exists():
        break
else:
    # Every candidate was tried (including the filesystem root) without a hit.
    sys.exit(f"❌ Could not find {target_rel} above {START_DIR}")

CLEAN_PATH = root / target_rel
print(f"🔍 Using clean file at → {CLEAN_PATH.relative_to(root)}")

# 3. Load
df = pd.read_csv(CLEAN_PATH)
print(f"✅ Loaded {len(df):,} rows · {df.shape[1]} columns")

# 4. Checks
# Each failed check raises AssertionError explicitly instead of relying on
# the `assert` statement, so validation still runs under `python -O`
# (which strips assert statements) while the exception type stays the same.

# 4a. Required columns must all be present.
req_cols = ["sale_date", "sale_year", "sale_month", "resale_price",
            "floor_area_sqm", "price_per_sqm", "lease_remaining_years", "flat_age"]
missing = [c for c in req_cols if c not in df.columns]
if missing:
    raise AssertionError(f"❌ Missing cols: {missing}")
print("✅ Essential columns present")

# 4b. No fully duplicated rows.
if df.duplicated().any():
    raise AssertionError("❌ Duplicate rows found")
print("✅ No duplicate rows")

# 4c. Columns used numerically must have a numeric dtype.
num_cols = ["resale_price", "floor_area_sqm",
            "price_per_sqm", "lease_remaining_years", "flat_age"]
bad = {c: str(df[c].dtype) for c in num_cols
       if not pd.api.types.is_numeric_dtype(df[c])}
if bad:
    raise AssertionError(f"❌ Non-numeric dtypes: {bad}")
print("✅ Numeric columns really numeric")

# 4d. Null share per column. The threshold is applied to the exact fraction;
# rounding first would let a column at e.g. 2.04 % (0.0204 -> 0.02) slip
# through as "≤2 %". Rounding is applied only to the values shown.
null_pc = df.isna().mean()
warn = null_pc[null_pc > 0.02].round(4)
if not warn.empty:
    print("⚠️  >2 % nulls:\n", warn)
else:
    print("✅ Nulls ≤2 % everywhere")

# 4e. Every resale_price must exceed 1,000 (a NaN compares False and fails).
if not (df["resale_price"] > 1_000).all():
    raise AssertionError("❌ Weird resale_price values")
print("✅ resale_price values all above 1,000")

print("\n🎉 Phase 1 data passes all basic checks.\n")

# Summary statistics for the key numeric columns, one row per column.
print(
    df[["resale_price", "price_per_sqm",
        "lease_remaining_years", "flat_age"]].describe().T
)
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
df.info()
# Bracket access works for any column name, unlike attribute access.
print(df["flat_type"].head(50))

📓 notebook run; cwd = /Users/sheenal/Desktop/hdb-price-predictor/rough
🔍 Using clean file at → data/processed/clean_hdb.csv
✅ Loaded 957,585 rows · 49 columns
✅ Essential columns present
✅ No duplicate rows
✅ Numeric columns really numeric
✅ Nulls ≤2 % everywhere
✅ resale_price values positive

🎉 Phase 1 data passes all basic checks.

                          count           mean            std           min  \
resale_price           957585.0  331389.762439  180194.108238  20000.000000   
price_per_sqm          957585.0    3433.137734    1650.306901    252.380952   
lease_remaining_years  957534.0      80.730636      11.069681     40.000000   
flat_age               957534.0      18.161688      10.995329      0.000000   

                            25%            50%            75%           max  
resale_price           198000.0  305000.000000  430000.000000  1.658888e+06  
price_per_sqm            2312.5    3021.978022    4347.826087  1.614894e+04  
lease_remaining_years      74.0  