# CIDM Lab Work 1 — P1→P3 (Code-Only Notebook)
**Module:** P170M109 Computational Intelligence and Decision Making  
**Scope:** Data analysis & preprocessing (P1), KNN/DecisionTree/RandomForest modeling (P2), hyperparameter selection & results (P3).  
**Date:** 2025-09-22



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Raise pandas' display limits so long/wide frames are not truncated in cell output.
for _display_opt in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_opt, 200)


## P1. Data analysis and preprocessing
### 1) Load data

In [2]:
# Candidate CSV locations, tried in order: /mnt/data/ first, then the working
# directory; within each location the 100K file is preferred over the 10K one.
# Adjust if the data lives elsewhere.
CSV_PATHS = [
    f"{folder}apartments_for_rent_classified_{size}.csv"
    for folder in ("/mnt/data/", "")
    for size in ("100K", "10K")
]

def load_first_available(paths):
    """Return the first CSV in *paths* that pandas can read.

    Paths are tried in order. A path that cannot be opened or parsed is
    skipped, but the reason is remembered so that the final error explains
    what went wrong with every candidate instead of hiding it.

    Args:
        paths: iterable of file paths to try, most preferred first.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for the first readable path.

    Raises:
        FileNotFoundError: if no path could be loaded. The message lists each
            attempted path together with the error it produced.
    """
    failures = []
    for p in paths:
        try:
            # NOTE(review): read_csv is called with default delimiter/encoding;
            # if the file uses something else the failure is reported below --
            # confirm the CSV's actual format.
            df = pd.read_csv(p)
        except (OSError, ValueError) as exc:
            # OSError covers missing/unreadable files; ValueError covers
            # pandas' parser/empty-data errors and UnicodeDecodeError.
            failures.append(f"{p}: {type(exc).__name__}: {exc}")
            continue
        print("Loaded:", p)
        return df

    message = "Place the CSV next to this notebook or in /mnt/data/."
    if failures:
        message += " Tried:" + "".join(f"\n  - {line}" for line in failures)
    raise FileNotFoundError(message)

# Load the dataset from the first usable candidate path and preview it.
df_raw = load_first_available(CSV_PATHS)
print(f"Raw shape: {df_raw.shape}")
df_raw.iloc[:3]

FileNotFoundError: Place the CSV next to this notebook or in /mnt/data/.

### 2) Determine data types & build data quality report (separate numeric vs categorical)

In [None]:
def infer_feature_types(df: pd.DataFrame):
    """Split the columns of *df* into numeric and non-numeric names.

    Returns:
        ``(num_cols, cat_cols)``: two lists of column names, each in the
        original column order. A column is numeric when pandas reports a
        numeric dtype for it; every other column is treated as categorical.
    """
    numeric, categorical = [], []
    for name in df.columns:
        if pd.api.types.is_numeric_dtype(df[name]):
            numeric.append(name)
    for name in df.columns:
        if name not in numeric:
            categorical.append(name)
    return numeric, categorical

def dqr_numeric(df: pd.DataFrame, cols):
    """Build a data-quality report for the numeric columns *cols* of *df*.

    Each column is coerced with ``pd.to_numeric(..., errors="coerce")`` so
    unparsable entries count as missing.

    Args:
        df: source table.
        cols: names of the columns to report on (may be empty).

    Returns:
        A DataFrame with one row per column and the fields ``feature``, ``n``,
        ``missing``, ``missing_%``, ``unique``, ``mean``, ``std``, ``min``,
        ``p25``, ``p50``, ``p75``, ``max``, sorted by ``missing_%`` descending
        and then by ``feature``. An empty *cols* yields an empty frame with
        the same columns.
    """
    report_columns = [
        "feature", "n", "missing", "missing_%", "unique",
        "mean", "std", "min", "p25", "p50", "p75", "max",
    ]
    rows = []
    for c in cols:
        s = pd.to_numeric(df[c], errors="coerce")
        n = len(s)
        miss = int(s.isna().sum())
        desc = s.describe()
        rows.append({
            "feature": c,
            "n": n,
            "missing": miss,
            # An empty table has no meaningful missing share.
            "missing_%": round(100 * miss / n, 2) if n else np.nan,
            "unique": int(s.nunique(dropna=True)),
            "mean": desc.get("mean", np.nan),
            "std": desc.get("std", np.nan),
            "min": desc.get("min", np.nan),
            "p25": desc.get("25%", np.nan),
            "p50": desc.get("50%", np.nan),
            "p75": desc.get("75%", np.nan),
            "max": desc.get("max", np.nan),
        })
    # Passing the schema explicitly keeps sort_values working when rows is empty.
    report = pd.DataFrame(rows, columns=report_columns)
    return report.sort_values(["missing_%", "feature"], ascending=[False, True])

def dqr_categorical(df: pd.DataFrame, cols):
    """Build a data-quality report for the categorical columns *cols* of *df*.

    Each column is cast to pandas' ``string`` dtype before counting.

    Args:
        df: source table.
        cols: names of the columns to report on (may be empty).

    Returns:
        A DataFrame with one row per column and the fields ``feature``, ``n``,
        ``missing``, ``missing_%``, ``unique``, ``top_value``, ``top_freq``,
        sorted by ``missing_%`` descending and then by ``feature``.
        ``top_value`` and ``top_freq`` are lists holding at most one element
        (empty when the column has no non-missing values). An empty *cols*
        yields an empty frame with the same columns.
    """
    report_columns = [
        "feature", "n", "missing", "missing_%", "unique", "top_value", "top_freq",
    ]
    rows = []
    for c in cols:
        s = df[c].astype("string")
        n = len(s)
        miss = int(s.isna().sum())
        # Count once and reuse for both the top value and its frequency.
        counts = s.value_counts(dropna=True)
        rows.append({
            "feature": c,
            "n": n,
            "missing": miss,
            # An empty table has no meaningful missing share.
            "missing_%": round(100 * miss / n, 2) if n else np.nan,
            "unique": int(s.nunique(dropna=True)),
            "top_value": counts.index[:1].tolist(),
            "top_freq": counts.values[:1].tolist(),
        })
    # Passing the schema explicitly keeps sort_values working when rows is empty.
    report = pd.DataFrame(rows, columns=report_columns)
    return report.sort_values(["missing_%", "feature"], ascending=[False, True])

# Split the raw columns by dtype, then build and show one quality report per group.
num_cols_raw, cat_cols_raw = infer_feature_types(df_raw)
for _label, _names in (("Numeric", num_cols_raw), ("Categorical", cat_cols_raw)):
    print(f"{_label} cols (raw):", len(_names))

dqr_num_raw = dqr_numeric(df_raw, num_cols_raw)
dqr_cat_raw = dqr_categorical(df_raw, cat_cols_raw)

# Only the first 20 rows of each report are displayed.
for _label, _report in (("Numeric", dqr_num_raw), ("Categorical", dqr_cat_raw)):
    print(f"\n{_label} DQR (raw):")
    display(_report.head(20))

### 3) Pre-normalize `price` to monthly for **before** plots (do not clean yet)

In [None]:
def to_monthly(price, ptype):
    t = (str(ptype) if ptype is not None else "").strip().lower()
    try:
        p = float(price)
    except Exception:
        return np.nan
    if not np.isfinite(p) or p <= 0:
        return np.nan
    if "week" in t:             factor = 52/12
    elif "fortnight" in t or "biweek" in t: factor = 26/12
    elif "day" in t:            factor = 30
    elif "year" in t or "annual" in t:      factor = 1/12
    elif "hour" in t:           factor = 24*30
    else:                       factor = 1.0  # assume monthly
    return p * factor

# Work on a copy so df_raw stays untouched; only a monthly-normalised price is added here.
df_before = df_raw.copy()
if not {"price", "price_type"}.issubset(df_before.columns):
    raise KeyError("Expected 'price' and 'price_type' columns.")
df_before["price_monthly"] = df_before.apply(
    lambda row: to_monthly(row.get("price"), row.get("price_type")),
    axis=1,
)

print("Before-shape:", df_before.shape)
df_before[["price", "price_type", "price_monthly"]].head(5)

### 4) Distributions (BEFORE preprocessing)

In [None]:
# Histograms of the key numeric features (left: price, right: square feet)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
price_ax, sqft_ax = axes
df_before["price_monthly"].dropna().plot(kind="hist", bins=50, ax=price_ax)
price_ax.set_title("price_monthly (before)")
if "square_feet" in df_before.columns:
    sqft_before = pd.to_numeric(df_before["square_feet"], errors="coerce")
    sqft_before.dropna().plot(kind="hist", bins=50, ax=sqft_ax)
    sqft_ax.set_title("square_feet (before)")
plt.tight_layout()
plt.show()

# Boxplots of the same two features
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
price_ax, sqft_ax = axes
df_before[["price_monthly"]].boxplot(ax=price_ax)
price_ax.set_title("price_monthly (before)")
if "square_feet" in df_before.columns:
    sqft_before = pd.to_numeric(df_before["square_feet"], errors="coerce")
    sqft_before.to_frame().boxplot(ax=sqft_ax)
    sqft_ax.set_title("square_feet (before)")
plt.tight_layout()
plt.show()

# Bar chart of the 15 most frequent values of the first location-like column present
cat = next((c for c in ("state", "cityname", "region") if c in df_before.columns), None)
if cat is not None:
    vc = df_before[cat].astype(str).value_counts().head(15)
    ax = vc.plot(kind="bar", figsize=(8, 3))
    ax.set_title(f"{cat} (top 15)")
    plt.tight_layout()
    plt.show()

### 5) Consider derived features & prepare ABT (Analytics Base Table)
- `amenity_count`: number of listed amenities (if available)
- `pet_cat_allowed` / `pet_dog_allowed` from `pets_allowed`
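- `price_per_sqft`: `price_monthly / square_feet` (when `square_feet` is available). Note that this is computed from the target `price_monthly`, so it is for exploration only and must not be used as a predictor of price.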


In [None]:
df_clean = df_before.copy()

# Binary encodings: "yes"/"thumbnail" -> 1, "no" -> 0; any other value
# (including missing) falls through the map and becomes 0.
for col in ["fee", "has_photo"]:
    if col in df_clean.columns:
        df_clean[col] = (df_clean[col].astype(str).str.lower()
                         .map({"yes": 1, "no": 0, "thumbnail": 1}).fillna(0).astype(int))

# Pets flags: 1 when the free-text field mentions the animal, else 0.
if "pets_allowed" in df_clean.columns:
    lower = df_clean["pets_allowed"].fillna("").astype(str).str.lower()
    df_clean["pet_cat_allowed"] = lower.str.contains("cat").astype(int)
    df_clean["pet_dog_allowed"] = lower.str.contains("dog").astype(int)

# Amenity count: number of non-empty comma-separated entries.
# Counting tokens (not len(split)) matters because "".split(",") == [""],
# which would give listings without amenities a count of 1.
if "amenities" in df_clean.columns:
    df_clean["amenity_count"] = (
        df_clean["amenities"].fillna("").astype(str)
        .apply(lambda text: sum(1 for item in text.split(",") if item.strip()))
        .astype(int)
    )

# price_per_sqft (guard divide-by-zero: 0 sq ft becomes NaN).
# NOTE: this column is computed from price_monthly, so it must not be fed
# to a model that predicts price_monthly.
if "square_feet" in df_clean.columns:
    sf = pd.to_numeric(df_clean["square_feet"], errors="coerce")
    df_clean["price_per_sqft"] = df_clean["price_monthly"] / sf.replace(0, np.nan)

# ABT candidates: drop identifiers, free text and the raw price fields;
# every other column (numeric or not) is kept.
abt_drop = ["id","title","body","address","amenities","price_display","price","price_type","currency","source"]
df_abt = df_clean.drop(columns=[c for c in abt_drop if c in df_clean.columns], errors="ignore")

print("ABT candidate columns:", df_abt.columns.tolist()[:25], "... (total:", len(df_abt.columns), ")")
df_abt.head(3)

### 6) Perform preprocessing (missing values, filters, outliers)
Actions (simple and transparent):
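- Keep rows priced in USD or with no currency recorded (if a `currency` column exists).
- Drop rows with invalid or missing coordinates (lat/lon outside valid ranges, or near 0,0).
- Trim extreme outliers: drop rows in the top 0.5% of `price_monthly` (rows without a usable price are dropped too); keep rows whose `square_feet` is in [120, 8000] or missing.
- Fill remaining numeric NaNs with the column median.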


In [None]:
df_proc = df_abt.copy()

# currency: keep USD rows and rows with no currency recorded.
# The column is read from df_before because it was dropped from the ABT;
# the boolean mask lines up with df_proc through the shared row index.
if "currency" in df_before.columns:
    currency = df_before["currency"]
    df_proc = df_proc[currency.astype(str).str.upper().eq("USD") | currency.isna()]

# coords validity (if present); rows with missing/unparsable coordinates fail
# the range checks and are dropped as well.
if "latitude" in df_proc.columns and "longitude" in df_proc.columns:
    lat = pd.to_numeric(df_proc["latitude"], errors="coerce")
    lon = pd.to_numeric(df_proc["longitude"], errors="coerce")
    near_origin = (lat.abs() < 0.1) & (lon.abs() < 0.1)
    good = lat.between(-90, 90) & lon.between(-180, 180) & ~near_origin
    df_proc = df_proc[good]

# square feet sanity (if present); missing values are kept for the median fill below.
if "square_feet" in df_proc.columns:
    sf = pd.to_numeric(df_proc["square_feet"], errors="coerce")
    df_proc = df_proc[(sf.isna()) | ((sf >= 120) & (sf <= 8000))]

# remove top 0.5% by price_monthly (NaN prices fail the comparison and are dropped too).
# The explicit copy makes df_proc an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
q995 = df_proc["price_monthly"].quantile(0.995)
df_proc = df_proc[df_proc["price_monthly"] <= q995].copy()

# fill numeric NAs with median
# NOTE(review): the medians are computed on the whole table, i.e. before the
# train/val/test split -- consider fitting the imputation on the training split only.
for c in df_proc.select_dtypes(include=[np.number]).columns:
    df_proc[c] = df_proc[c].fillna(df_proc[c].median())

print("Processed shape:", df_proc.shape)
df_proc.head(3)

### 7) Distributions (AFTER preprocessing)

In [None]:
# Histograms after preprocessing (left: price, right: square feet)
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
price_ax, sqft_ax = axes
df_proc["price_monthly"].dropna().plot(kind="hist", bins=50, ax=price_ax)
price_ax.set_title("price_monthly (after)")
if "square_feet" in df_proc.columns:
    sqft_after = pd.to_numeric(df_proc["square_feet"], errors="coerce")
    sqft_after.dropna().plot(kind="hist", bins=50, ax=sqft_ax)
    sqft_ax.set_title("square_feet (after)")
plt.tight_layout()
plt.show()

# Boxplots after preprocessing
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
price_ax, sqft_ax = axes
df_proc[["price_monthly"]].boxplot(ax=price_ax)
price_ax.set_title("price_monthly (after)")
if "square_feet" in df_proc.columns:
    sqft_after = pd.to_numeric(df_proc["square_feet"], errors="coerce")
    sqft_after.to_frame().boxplot(ax=sqft_ax)
    sqft_ax.set_title("square_feet (after)")
plt.tight_layout()
plt.show()

### 8) Train/Val/Test split and standardization demo (for numerics used by KNN)

In [None]:
# 70/15/15 split
y = df_proc["price_monthly"].astype(float)

# Features: numeric columns only, and never the target or anything computed
# from it. price_per_sqft is price_monthly / square_feet, so keeping it next to
# square_feet would let every model reconstruct the target (data leakage).
# Non-numeric columns are excluded because the estimators below need numeric input.
target_derived = ["price_monthly", "price_per_sqft"]
X = (
    df_proc.drop(columns=target_derived, errors="ignore")
    .select_dtypes(include=[np.number])
)

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# 0.1765 of the remaining 85% is ~15% of the full data.
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)

# Fit the scaler on the training split only, then apply it to val/test.
# New float frames are built (instead of assigning into copies) so integer
# columns are not overwritten in place with float z-scores.
scaler = StandardScaler()
num_cols = X.columns
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[num_cols]),
                              index=X_train.index, columns=num_cols)
X_val_scaled = pd.DataFrame(scaler.transform(X_val[num_cols]),
                            index=X_val.index, columns=num_cols)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[num_cols]),
                             index=X_test.index, columns=num_cols)

print("Splits:", X_train.shape, X_val.shape, X_test.shape)

# Show before vs after scaling for a couple variables (if present)
show_vars = [c for c in ["square_feet", "amenity_count", "price_per_sqft"] if c in num_cols][:2]
for v in show_vars:
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    X_train[v].plot(kind="hist", bins=50, ax=axes[0])
    axes[0].set_title(f"{v} (before scale)")
    X_train_scaled[v].plot(kind="hist", bins=50, ax=axes[1])
    axes[1].set_title(f"{v} (z-scored)")
    plt.tight_layout()
    plt.show()

## P2–P3. Modeling (KNN, DT, RF) and hyperparameter selection

In [None]:
def eval_metrics(y_true, y_pred):
    """Compute regression metrics for predictions against ground truth.

    Both inputs are converted to float numpy arrays first, so all four metrics
    compare values by position. (With pandas inputs, ``y_true - y_pred`` would
    align on index labels instead and could silently produce NaNs.)

    Args:
        y_true: observed target values (array-like).
        y_pred: predicted values, same length and order as *y_true*.

    Returns:
        dict with keys ``"MAE"``, ``"MAPE_%"`` (MAPE times 100), ``"RMSE"``
        and ``"R2"``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return {
        "MAE": mean_absolute_error(actual, predicted),
        "MAPE_%": mean_absolute_percentage_error(actual, predicted) * 100,
        "RMSE": np.sqrt(np.mean((actual - predicted) ** 2)),
        "R2": r2_score(actual, predicted),
    }

def summarize_results(results):
    """Turn a list of result dicts into a DataFrame.

    When every expected field is present, only those fields are returned, in
    the order model, params, split, MAE, MAPE_%, RMSE, R2. Otherwise the frame
    is returned exactly as pandas built it.
    """
    table = pd.DataFrame(results)
    preferred = ["model", "params", "split", "MAE", "MAPE_%", "RMSE", "R2"]
    if set(preferred).issubset(table.columns):
        return table[preferred]
    return table

In [None]:
all_results = []


def _record(model, name, params, splits):
    """Score an already-fitted model on each (split, X, y) triple and append the rows to all_results."""
    for split, X_, y_ in splits:
        scores = eval_metrics(y_, model.predict(X_))
        all_results.append({"model": name, "params": params, "split": split, **scores})


# Train/Val pairs: KNN sees the standardized features, the tree models the raw ones.
scaled_splits = [("Train", X_train_scaled, y_train), ("Val", X_val_scaled, y_val)]
raw_splits = [("Train", X_train, y_train), ("Val", X_val, y_val)]

# Baseline (mean of y_train), scored on validation only
baseline_pred = np.full_like(y_val, y_train.mean())
all_results.append({"model": "Baseline", "params": "mean(y_train)", "split": "Val",
                    **eval_metrics(y_val, baseline_pred)})

# --- KNN over k --- (uses scaled features)
for k in (3, 5, 10):
    knn_model = KNeighborsRegressor(n_neighbors=k)
    knn_model.fit(X_train_scaled, y_train)
    _record(knn_model, "KNN", f"k={k}", scaled_splits)

# --- Decision Tree --- depth x min_samples_leaf grid (trees are fine on unscaled data)
for depth in (5, 10, None):
    for minleaf in (1, 5, 10):
        tree_model = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=minleaf, random_state=42)
        tree_model.fit(X_train, y_train)
        _record(tree_model, "DecisionTree", f"depth={depth},leaf={minleaf}", raw_splits)

# --- Random Forest --- n_estimators x min_samples_leaf grid
for n in (50, 100, 200):
    for minleaf in (1, 2):
        forest_model = RandomForestRegressor(n_estimators=n, min_samples_leaf=minleaf,
                                             random_state=42, n_jobs=-1)
        forest_model.fit(X_train, y_train)
        _record(forest_model, "RandomForest", f"n={n},leaf={minleaf}", raw_splits)

results_df = summarize_results(all_results)
print("Validation results (sample):")
# Up to six best (lowest MAE) validation rows per model.
val_sorted = results_df[results_df["split"] == "Val"].sort_values(["model", "MAE"])
display(val_sorted.groupby("model").head(6))

In [None]:
# For every model, keep the params string of its lowest-MAE validation row.
val_rows = results_df[results_df["split"] == "Val"]
best = {}
for model_name in results_df["model"].unique():
    candidates = val_rows[val_rows["model"] == model_name]
    if candidates.empty:
        continue
    best[model_name] = candidates.sort_values("MAE").iloc[0]["params"]

print("Best hyperparameters (by lowest Val MAE):")
best

In [None]:
# Retrain best models on Train+Val and evaluate on Test
y_trv = pd.concat([y_train, y_val], axis=0)
X_trv = pd.concat([X_train, X_val], axis=0)
X_trv_scaled = pd.concat([X_train_scaled, X_val_scaled], axis=0)

final_results = []

# Baseline on Test: predict the Train+Val mean everywhere
baseline_test = np.full_like(y_test, y_trv.mean())
final_results.append({"model": "Baseline", "params": "mean(tr+val)", "split": "Test",
                      **eval_metrics(y_test, baseline_test)})

# KNN (scaled features); the params string has the form "k=<int>"
if "KNN" in best:
    k = int(best["KNN"].split("=")[1])
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_trv_scaled, y_trv)
    knn_scores = eval_metrics(y_test, knn.predict(X_test_scaled))
    final_results.append({"model": "KNN", "params": best["KNN"], "split": "Test", **knn_scores})

# Decision Tree; the params string has the form "depth=<int|None>,leaf=<int>"
if "DecisionTree" in best:
    pars = dict(item.split("=") for item in best["DecisionTree"].split(","))
    depth = int(pars["depth"]) if pars["depth"] != "None" else None
    leaf = int(pars["leaf"])
    dt = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf, random_state=42).fit(X_trv, y_trv)
    dt_scores = eval_metrics(y_test, dt.predict(X_test))
    final_results.append({"model": "DecisionTree", "params": best["DecisionTree"], "split": "Test", **dt_scores})

# Random Forest; the params string has the form "n=<int>,leaf=<int>"
if "RandomForest" in best:
    pars = dict(item.split("=") for item in best["RandomForest"].split(","))
    n = int(pars["n"])
    leaf = int(pars["leaf"])
    rf = RandomForestRegressor(n_estimators=n, min_samples_leaf=leaf, random_state=42, n_jobs=-1).fit(X_trv, y_trv)
    rf_scores = eval_metrics(y_test, rf.predict(X_test))
    final_results.append({"model": "RandomForest", "params": best["RandomForest"], "split": "Test", **rf_scores})

final_df = pd.DataFrame(final_results)
print("Final Test metrics:")
display(final_df)

### Feature Importances (Random Forest, if selected)

In [None]:
# Refit the selected Random Forest on Train+Val and list its 20 most important features.
if "RandomForest" in best:
    pars = dict(item.split("=") for item in best["RandomForest"].split(","))
    n, leaf = int(pars["n"]), int(pars["leaf"])
    rf = RandomForestRegressor(n_estimators=n, min_samples_leaf=leaf, random_state=42, n_jobs=-1)
    rf = rf.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
    imp = pd.DataFrame({"feature": X_train.columns, "importance": rf.feature_importances_})
    imp = imp.sort_values("importance", ascending=False)
    display(imp.head(20))

---
**References (for your convenience):**
- scikit-learn KNN: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html  
- scikit-learn Decision Trees: https://scikit-learn.org/stable/modules/tree.html  
- scikit-learn RandomForestRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html  
- Original dataset: https://archive.ics.uci.edu/dataset/555/apartment+for+rent+classified
