# CIDM Lab 1 — Simple Final Pipeline (Code Only)
Minimal, correct pipeline for price prediction using **KNN, Decision Tree, Random Forest**.
- Robust `price_type` normalization → `price_monthly`
- Light cleaning & encoding
- 70/15/15 split
- Scaling numerics for KNN
- Mean-price baseline + 3 hyperparameter settings per model


In [5]:
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate dataset locations, listed in lookup order: the /mnt/data/ copies
# first, then files in the working directory; within each location the 100K
# file precedes the 10K file.
CSV_PATHS = [
    f"{folder}apartments_for_rent_classified_{size}.csv"
    for folder in ("/mnt/data/", "")
    for size in ("100K", "10K")
]

In [6]:
def to_monthly(price, ptype):
    t = (str(ptype) if ptype is not None else "").strip().lower()
    try:
        p = float(price)
    except Exception:
        return np.nan
    if not np.isfinite(p) or p <= 0:
        return np.nan
    if "week" in t:             factor = 52/12
    elif "fortnight" in t or "biweek" in t: factor = 26/12
    elif "day" in t:            factor = 30
    elif "year" in t or "annual" in t:      factor = 1/12
    elif "hour" in t:           factor = 24*30
    else:                       factor = 1.0  # assume monthly
    return p * factor

def load_first_available(paths):
    """Return the first CSV in ``paths`` that pandas can read.

    Candidates are tried in order. A candidate that fails for any reason
    (missing file, parse error, ...) is skipped, but the reason is kept so
    that the final error explains what went wrong with each path.

    Args:
        paths: Iterable of file paths to try.

    Returns:
        The DataFrame read from the first loadable path.

    Raises:
        FileNotFoundError: If none of the candidates could be loaded. The
            message lists every attempted path and the error it produced.
    """
    failures = []
    for p in paths:
        try:
            df = pd.read_csv(p)
        except Exception as exc:
            # Best-effort lookup: move on to the next candidate, but record
            # why this one failed instead of discarding the reason.
            failures.append(f"{p}: {type(exc).__name__}: {exc}")
            continue
        print("Loaded:", p)
        return df
    if failures:
        detail = " Tried:" + "".join(f"\n  - {item}" for item in failures)
    else:
        detail = " No candidate paths were given."
    raise FileNotFoundError(
        "Place the CSV next to this notebook or in /mnt/data/." + detail
    )

# Load the first readable dataset copy from CSV_PATHS and report its shape.
df_raw = load_first_available(CSV_PATHS)
print("Shape raw:", df_raw.shape)
# Trailing bare expression: in a notebook cell this displays the first 3 rows.
df_raw.head(3)

FileNotFoundError: Place the CSV next to this notebook or in /mnt/data/.

In [None]:
# --- Normalize price to monthly and clean
# Every row filter below ends with .copy() so that the column assignments that
# follow write to an independent frame rather than a slice of the previous one
# (this is what triggers pandas' SettingWithCopyWarning).
df = df_raw.copy()

if "price_monthly" not in df.columns:
    if "price" not in df.columns or "price_type" not in df.columns:
        raise KeyError("Expected columns 'price' and 'price_type'.")
    # Walk the two columns in lockstep; a row-wise apply would build a Series
    # for every row just to read these two values.
    df["price_monthly"] = pd.Series(
        [to_monthly(price, ptype) for price, ptype in zip(df["price"], df["price_type"])],
        index=df.index,
        dtype=float,
    )

# A pre-existing price_monthly column may hold text; values that cannot be
# parsed become NaN and are removed by the target filter below.
df["price_monthly"] = pd.to_numeric(df["price_monthly"], errors="coerce")

# currency filter: keep USD rows and rows with no currency recorded
if "currency" in df.columns:
    is_usd = df["currency"].astype(str).str.upper().eq("USD")
    df = df[is_usd | df["currency"].isna()].copy()

# drop invalid target (NaN, infinite, zero or negative)
valid_price = np.isfinite(df["price_monthly"]) & (df["price_monthly"] > 0)
df = df[valid_price].copy()

# encode binaries; any value outside the mapping (including missing) becomes 0
for col in ["fee", "has_photo"]:
    if col in df.columns:
        df[col] = (df[col].astype(str).str.lower()
                   .map({"yes": 1, "no": 0, "thumbnail": 1}).fillna(0).astype(int))

# pets flags
if "pets_allowed" in df.columns:
    lower = df["pets_allowed"].fillna("").astype(str).str.lower()
    df["pet_cat_allowed"] = lower.str.contains("cat", regex=False).astype(int)
    df["pet_dog_allowed"] = lower.str.contains("dog", regex=False).astype(int)

# coords validity: rows with missing/unparseable coordinates fail between()
# and are dropped together with out-of-range and near-(0, 0) points
if "latitude" in df.columns and "longitude" in df.columns:
    lat = pd.to_numeric(df["latitude"], errors="coerce")
    lon = pd.to_numeric(df["longitude"], errors="coerce")
    good = lat.between(-90, 90) & lon.between(-180, 180) & ~((lat.abs() < 0.1) & (lon.abs() < 0.1))
    # Store the parsed values so the columns are numeric downstream.
    df["latitude"] = lat
    df["longitude"] = lon
    df = df[good].copy()

# area sanity: keep missing areas (imputed below) and areas within 120..8000
if "square_feet" in df.columns:
    sf = pd.to_numeric(df["square_feet"], errors="coerce")
    df["square_feet"] = sf
    df = df[(sf.isna()) | ((sf >= 120) & (sf <= 8000))].copy()

# drop duplicate ids
if "id" in df.columns:
    df = df.drop_duplicates(subset=["id"])

# drop unused columns
drop_cols = ["id", "title", "body", "address", "amenities", "price_display", "price_type", "price", "source"]
# These raw text columns have already been consumed above (currency filter,
# pet flags); left in place they would reach the feature matrix as strings.
drop_cols += ["currency", "pets_allowed"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

# fill numeric NaNs with the column median
# NOTE(review): medians are computed on the full data set, i.e. before the
# train/val/test split further down, so they include validation/test rows.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# clip top 0.5% price outliers
q995 = df["price_monthly"].quantile(0.995)
df = df[df["price_monthly"] <= q995].copy()

print("Shape clean:", df.shape)
df.head(3)

In [None]:
# --- Split and scale (for KNN) ---
y = df["price_monthly"].astype(float)

# scikit-learn estimators need numeric input, so only numeric feature columns
# are kept; any remaining text columns are reported and left out.
features = df.drop(columns=["price_monthly"])
X = features.select_dtypes(include=[np.number])
skipped = [c for c in features.columns if c not in X.columns]
if skipped:
    print("Non-numeric columns excluded from features:", skipped)
if X.shape[1] == 0:
    raise ValueError("No numeric feature columns left to train on.")

# 70/15/15: hold out 15% for test, then take 15% of the total out of the
# remaining 85% for validation (0.15 / 0.85 of what is left).
test_frac = 0.15
val_frac = 0.15
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=test_frac, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=val_frac / (1 - test_frac), random_state=42
)

# Fit the scaler on the training split only and apply it to all three splits.
# New frames are built instead of assigning into the slices returned by
# train_test_split, which avoids SettingWithCopyWarning.
scaler = StandardScaler()
num_cols = X.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=num_cols, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=num_cols, index=X_val.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=num_cols, index=X_test.index)

print("Splits:", X_train.shape, X_val.shape, X_test.shape)

In [None]:
# --- Eval helper ---
def eval_model(name, model):
    """Fit ``model`` on the training split and print its scores per split.

    The model is trained on the module-level ``X_train``/``y_train`` and then
    scored on the train, validation and test splits in that order. One line
    with MAE, MAPE (in percent) and R2 is printed per split, followed by a
    blank line. Nothing is returned.
    """
    model.fit(X_train, y_train)
    partitions = {
        "Train": (X_train, y_train),
        "Val": (X_val, y_val),
        "Test": (X_test, y_test),
    }
    for split, (features, target) in partitions.items():
        predicted = model.predict(features)
        mae = mean_absolute_error(target, predicted)
        mape = mean_absolute_percentage_error(target, predicted) * 100
        r2 = r2_score(target, predicted)
        print(f"{name:<18} {split:<5} | MAE={mae:8.1f} | MAPE={mape:5.1f}% | R2={r2:6.3f}")
    print()

# Baseline
# Constant predictor: every test row is assigned the mean of the training target.
baseline = np.full_like(y_test, y_train.mean())
baseline_scores = (
    mean_absolute_error(y_test, baseline),
    mean_absolute_percentage_error(y_test, baseline) * 100,
    r2_score(y_test, baseline),
)
print("Baseline (mean)  Test | MAE={:.1f} | MAPE={:.1f}% | R2={:.3f}".format(*baseline_scores))

In [None]:
# Three hyperparameter settings per model family, evaluated in the order
# KNN -> Decision Tree -> Random Forest.

# --- KNN ---
knn_runs = [(f"KNN k={k}", KNeighborsRegressor(n_neighbors=k)) for k in (3, 5, 10)]

# --- Decision Tree ---
tree_runs = [
    (f"DecisionTree d={d}", DecisionTreeRegressor(max_depth=d, random_state=42))
    for d in (5, 10, None)
]

# --- Random Forest ---
forest_runs = [
    (f"RandomForest n={n}", RandomForestRegressor(n_estimators=n, random_state=42))
    for n in (10, 50, 100)
]

for label, estimator in knn_runs + tree_runs + forest_runs:
    eval_model(label, estimator)