# In [None]:
# CIDM Lab 1 – Simple solution (KNN, Decision Tree, Random Forest)
# ---------------------------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# --------------------
# 1. Load data
# --------------------
df = pd.read_csv("apartments_for_rent_classified_100K.csv")

# Keep only numeric price: coerce first so that formatted/unparseable strings
# become NaN, then a single dropna removes both missing and malformed prices.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df = df.dropna(subset=["price"])

# Drop unused fields (errors="ignore" skips any that are not in the file)
drop_cols = ["price_display", "price_type", "currency", "title", "body", "address"]
df = df.drop(columns=drop_cols, errors="ignore")

# Simple derived feature: count of amenities
if "amenities" in df.columns:
    # Count only non-empty comma-separated entries. "".split(",") returns
    # [""], so a plain len() would give listings with no amenities a count of 1.
    df["amenity_count"] = (
        df["amenities"]
        .fillna("")
        .astype(str)
        .apply(lambda text: sum(1 for item in text.split(",") if item.strip()))
    )
    df = df.drop(columns=["amenities"])

# Encode binary Yes/No to 1/0
# Any value outside the mapping (including missing values, which astype(str)
# turns into the string "nan") is encoded as 0.
binary_map = {"yes": 1, "no": 0, "thumbnail": 1}
for col in ["fee", "has_photo"]:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.lower()
            .map(binary_map)
            .fillna(0)
        )

# --------------------
# 2. Data quality report
# --------------------
print("Data Quality Report:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line after the report.
df.info()
print(df.describe(include="all"))

# --------------------
# 3. Preprocessing
# --------------------
# Remove outliers: price > 10000 or square_feet > 8000
# The price cap is applied unconditionally; the square_feet cap only when the
# column exists. A NaN square_feet compares as False and is therefore dropped.
keep = df["price"] <= 10000
if "square_feet" in df.columns:
    keep = keep & (df["square_feet"] <= 8000)
df = df[keep]

# Features/target
# Only numeric columns are used as features: the estimators fitted below
# cannot be trained on any remaining string/object columns.
y = df["price"]
X = df.drop(columns=["price"]).select_dtypes(include=np.number)

# Train/val/test split
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)
# (≈70% train, 15% val, 15% test)

# Work on independent copies so the in-place column assignments below do not
# write into slices of X.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Fill missing feature values with the training-set median. The statistics
# come from the training split only so validation/test data do not leak in.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_val = X_val.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Scale continuous variables (needed for KNN)
# The scaler is fitted on the training split and only applied to val/test.
scaler = StandardScaler()
num_cols = X.select_dtypes(include=np.number).columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols]   = scaler.transform(X_val[num_cols])
X_test[num_cols]  = scaler.transform(X_test[num_cols])

# --------------------
# 4. Evaluation function
# --------------------
def evaluate(model, Xtr, ytr, Xv, yv, Xte, yte):
    """Score a fitted model on the train, validation and test splits.

    Args:
        model: Object exposing ``predict(X)``.
        Xtr, ytr: Training features and targets.
        Xv, yv: Validation features and targets.
        Xte, yte: Test features and targets.

    Returns:
        Dict keyed by "Train", "Val" and "Test"; each value is a tuple
        ``(MAE, MAPE in percent, RMSE, R2)`` for that split.
    """
    named_splits = {"Train": (Xtr, ytr), "Val": (Xv, yv), "Test": (Xte, yte)}
    report = {}
    for label, (features, target) in named_splits.items():
        predicted = model.predict(features)
        abs_err = mean_absolute_error(target, predicted)
        # sklearn returns MAPE as a fraction; convert to percent
        pct_err = mean_absolute_percentage_error(target, predicted) * 100
        squared_residuals = (target - predicted) ** 2
        root_mse = np.sqrt(squared_residuals.mean())
        fit_quality = r2_score(target, predicted)
        report[label] = (abs_err, pct_err, root_mse, fit_quality)
    return report

def print_results(name, metrics):
    """Print a header for ``name`` followed by one metrics line per split.

    Args:
        name: Label shown in the header line.
        metrics: Mapping of split name to a ``(MAE, MAPE, RMSE, R2)`` tuple,
            as returned by ``evaluate``.
    """
    print(f"\n{name} results:")
    # Unpack each 4-tuple directly in the loop header
    for split, (mae, mape, rmse, r2) in metrics.items():
        print(f"{split}: MAE={mae:.1f}, MAPE={mape:.1f}%, RMSE={rmse:.1f}, R2={r2:.3f}")

# --------------------
# 5. Models
# --------------------

# Baseline (mean predictor)
# Predict the training-set mean price for every test row. The array is built
# as float explicitly: np.full_like(y_test, ...) would inherit y_test's dtype
# and truncate the mean if the prices were parsed as integers.
y_mean = np.full(len(y_test), y_train.mean(), dtype=float)
print("\nBaseline (mean predictor):")
print(f"Test MAE={mean_absolute_error(y_test, y_mean):.1f}, "
      f"MAPE={mean_absolute_percentage_error(y_test, y_mean)*100:.1f}%, "
      f"RMSE={np.sqrt(((y_test - y_mean)**2).mean()):.1f}, "
      f"R2={r2_score(y_test, y_mean):.3f}")

# Every model below is fitted on the training split and scored on all three
# splits; only the swept hyperparameter differs between runs.
eval_args = (X_train, y_train, X_val, y_val, X_test, y_test)

# KNN: sweep the number of neighbours
for k in (3, 5, 10):
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    metrics = evaluate(knn, *eval_args)
    print_results(f"KNN (k={k})", metrics)

# Decision Tree: sweep the depth limit (None = unlimited)
for depth in (5, 10, None):
    dt = DecisionTreeRegressor(max_depth=depth, random_state=42).fit(X_train, y_train)
    metrics = evaluate(dt, *eval_args)
    print_results(f"Decision Tree (max_depth={depth})", metrics)

# Random Forest: sweep the number of trees
for n in (10, 50, 100):
    rf = RandomForestRegressor(n_estimators=n, random_state=42).fit(X_train, y_train)
    metrics = evaluate(rf, *eval_args)
    print_results(f"Random Forest (n_estimators={n})", metrics)
