# CIDM Lab 1 — P1→P3 (Notebook Only, Code You Can Edit)
**Course:** P170M109 Computational Intelligence and Decision Making  
**This notebook covers:**  
- **P1**: Input analysis & preprocessing (before/after)  
- **P2**: Modeling with **KNN**, **Decision Tree**, **Random Forest**  
- **P3**: Hyperparameter selection & final results



In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)
plt.rcParams['figure.figsize'] = (7, 4)

## P1. Data analysis and preprocessing
### 1) Load data (robust file search)

In [4]:
def find_file_upwards(filename: str, start: Path | None = None) -> Path:
    """Search for `filename` in the current working directory and its parents.
    Returns the full Path if found; raises FileNotFoundError otherwise."""
    start = (start or Path.cwd()).resolve()
    for base in [start, *start.parents]:
        candidate = base / filename
        if candidate.is_file():
            return candidate
    raise FileNotFoundError(
        f"Could not find '{filename}' starting from {start}.\n"
        f"Current working directory: {Path.cwd()}\n"
        "Make sure the filename is correct and the file exists."
    )

# Common dataset names (100K preferred, then 10K)
CANDIDATES = [
    "apartments_for_rent_classified_100K.csv",
    "apartments_for_rent_classified_10K.csv",
]

data_path = None
for name in CANDIDATES:
    try:
        data_path = find_file_upwards(name)
        print("Loaded file:", data_path)
        break
    except FileNotFoundError:
        pass

if data_path is None:
    # Fallback: explicit class paths
    for p in [Path("/mnt/data/apartments_for_rent_classified_100K.csv"),
              Path("/mnt/data/apartments_for_rent_classified_10K.csv")]:
        if p.exists():
            data_path = p
            print("Loaded file:", data_path)
            break

if data_path is None:
    raise FileNotFoundError("Dataset not found. Put the CSV near this notebook or in /mnt/data/.")

df_raw = pd.read_csv(data_path)
print("Raw shape:", df_raw.shape)
df_raw.head(3)

Loaded file: C:\Users\Rokas\Documents\KTU\KTU P170M109 Computational Intelligence and Decision Making\Lab 1\apartments_for_rent_classified_100K.csv


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 97583: invalid start byte

### 2) Determine feature types & Data Quality Reports (numeric vs categorical)

In [5]:
def infer_feature_types(df: pd.DataFrame):
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    cat_cols = [c for c in df.columns if c not in num_cols]
    return num_cols, cat_cols

def dqr_numeric(df: pd.DataFrame, cols):
    rows = []
    for c in cols:
        s = pd.to_numeric(df[c], errors="coerce")
        n = len(s); miss = s.isna().sum()
        desc = s.describe()
        rows.append({
            "feature": c, "n": n, "missing": int(miss),
            "missing_%": round(100*miss/n, 2),
            "unique": int(s.nunique(dropna=True)),
            "mean": desc.get("mean", np.nan),
            "std": desc.get("std", np.nan),
            "min": desc.get("min", np.nan),
            "p25": desc.get("25%", np.nan),
            "p50": desc.get("50%", np.nan),
            "p75": desc.get("75%", np.nan),
            "max": desc.get("max", np.nan),
            "skew": s.dropna().skew() if s.notna().any() else np.nan
        })
    return pd.DataFrame(rows).sort_values(["missing_%","feature"], ascending=[False, True])

def dqr_categorical(df: pd.DataFrame, cols):
    rows = []
    for c in cols:
        s = df[c].astype("string")
        n = len(s); miss = s.isna().sum()
        vc = s.value_counts(dropna=True)
        rows.append({
            "feature": c, "n": n, "missing": int(miss),
            "missing_%": round(100*miss/n, 2),
            "unique": int(vc.shape[0]),
            "top_value": vc.index[:1].tolist(),
            "top_freq": vc.values[:1].tolist(),
        })
    return pd.DataFrame(rows).sort_values(["missing_%","feature"], ascending=[False, True])

num_cols_raw, cat_cols_raw = infer_feature_types(df_raw)
print("Numeric cols (raw):", len(num_cols_raw))
print("Categorical cols (raw):", len(cat_cols_raw))

print("\nNumeric DQR (raw):")
display(dqr_numeric(df_raw, num_cols_raw).head(25))

print("\nCategorical DQR (raw):")
display(dqr_categorical(df_raw, cat_cols_raw).head(25))

NameError: name 'df_raw' is not defined

### 3) Create `price_monthly` **before** cleaning for initial distributions

In [None]:
def to_monthly(price, ptype):
    t = (str(ptype) if ptype is not None else "").strip().lower()
    try:
        p = float(price)
    except Exception:
        return np.nan
    if not np.isfinite(p) or p <= 0: return np.nan
    if "week" in t:             factor = 52/12
    elif "fortnight" in t or "biweek" in t: factor = 26/12
    elif "day" in t:            factor = 30
    elif "year" in t or "annual" in t:      factor = 1/12
    elif "hour" in t:           factor = 24*30
    else:                       factor = 1.0  # assume monthly
    return p * factor

df_before = df_raw.copy()
if "price" in df_before.columns and "price_type" in df_before.columns:
    df_before["price_monthly"] = df_before.apply(lambda r: to_monthly(r.get("price"), r.get("price_type")), axis=1)
else:
    raise KeyError("Expected 'price' and 'price_type' in the dataset.")

df_before[["price","price_type","price_monthly"]].head(5)

### 4) Distributions (BEFORE preprocessing)

In [None]:
# Histograms
fig, axes = plt.subplots(1, 2, figsize=(12,4))
df_before["price_monthly"].dropna().plot(kind="hist", bins=50, ax=axes[0]); axes[0].set_title("price_monthly (before)")
if "square_feet" in df_before.columns:
    pd.to_numeric(df_before["square_feet"], errors="coerce").dropna().plot(kind="hist", bins=50, ax=axes[1]); axes[1].set_title("square_feet (before)")
plt.tight_layout(); plt.show()

# Boxplots
fig, axes = plt.subplots(1, 2, figsize=(12,4))
df_before[["price_monthly"]].boxplot(ax=axes[0]); axes[0].set_title("price_monthly (before)")
if "square_feet" in df_before.columns:
    pd.to_numeric(df_before["square_feet"], errors="coerce").to_frame().boxplot(ax=axes[1]); axes[1].set_title("square_feet (before)")
plt.tight_layout(); plt.show()

# Categorical top-15 bar
for cat in ["state", "cityname", "region"]:
    if cat in df_before.columns:
        vc = df_before[cat].astype(str).value_counts().head(15)
        ax = vc.plot(kind="bar", figsize=(10,3)); ax.set_title(f"{cat} (top 15)"); plt.tight_layout(); plt.show()
        break

### 5) Derived features & ABT (Analytics Base Table)

In [None]:
df_clean = df_before.copy()

# Binary encodings
for col in ["fee", "has_photo"]:
    if col in df_clean.columns:
        df_clean[col] = (df_clean[col].astype(str).str.lower()
                         .map({"yes":1,"no":0,"thumbnail":1}).fillna(0).astype(int))

# Pets flags
if "pets_allowed" in df_clean.columns:
    lower = df_clean["pets_allowed"].fillna("").astype(str).str.lower()
    df_clean["pet_cat_allowed"] = lower.str.contains("cat").astype(int)
    df_clean["pet_dog_allowed"] = lower.str.contains("dog").astype(int)

# Amenity count
if "amenities" in df_clean.columns:
    df_clean["amenity_count"] = df_clean["amenities"].fillna("").apply(lambda x: len(str(x).split(","))).astype(int)

# price_per_sqft
if "square_feet" in df_clean.columns:
    sf = pd.to_numeric(df_clean["square_feet"], errors="coerce")
    df_clean["price_per_sqft"] = df_clean["price_monthly"] / sf.replace(0, np.nan)

# ABT: drop text-heavy/IDs
abt_drop = ["id","title","body","address","amenities","price_display","price","price_type","currency","source"]
df_abt = df_clean.drop(columns=[c for c in abt_drop if c in df_clean.columns], errors="ignore")

print("ABT candidate shape:", df_abt.shape)
df_abt.head(3)

### 6) Preprocess (filters, missing values, outliers)

In [None]:
df_proc = df_abt.copy()

# currency (if available in raw)
if "currency" in df_raw.columns:
    mask_usd = df_raw["currency"].astype(str).str.upper().eq("USD") | df_raw["currency"].isna()
    # align by index intersection
    df_proc = df_proc.loc[df_proc.index.intersection(mask_usd[mask_usd].index)]

# coords validity
if {"latitude", "longitude"}.issubset(df_proc.columns):
    lat = pd.to_numeric(df_proc["latitude"], errors="coerce")
    lon = pd.to_numeric(df_proc["longitude"], errors="coerce")
    good = lat.between(-90, 90) & lon.between(-180, 180) & ~((lat.abs()<0.1) & (lon.abs()<0.1))
    df_proc = df_proc[good]

# square feet sanity
if "square_feet" in df_proc.columns:
    sf = pd.to_numeric(df_proc["square_feet"], errors="coerce")
    df_proc = df_proc[(sf.isna()) | ((sf >= 120) & (sf <= 8000))]

# outliers: top 0.5%
q995 = df_proc["price_monthly"].quantile(0.995)
df_proc = df_proc[df_proc["price_monthly"] <= q995]

# fill numeric NaNs
for c in df_proc.select_dtypes(include=[np.number]).columns:
    df_proc[c] = df_proc[c].fillna(df_proc[c].median())

# drop duplicates
if "id" in df_raw.columns and "id" in df_proc.columns:
    df_proc = df_proc.drop_duplicates(subset=["id"])
else:
    df_proc = df_proc.drop_duplicates()

print("Processed shape:", df_proc.shape)
df_proc.head(3)

### 7) Distributions (AFTER preprocessing)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12,4))
df_proc["price_monthly"].dropna().plot(kind="hist", bins=50, ax=axes[0]); axes[0].set_title("price_monthly (after)")
if "square_feet" in df_proc.columns:
    pd.to_numeric(df_proc["square_feet"], errors="coerce").dropna().plot(kind="hist", bins=50, ax=axes[1]); axes[1].set_title("square_feet (after)")
plt.tight_layout(); plt.show()

fig, axes = plt.subplots(1, 2, figsize=(12,4))
df_proc[["price_monthly"]].boxplot(ax=axes[0]); axes[0].set_title("price_monthly (after)")
if "square_feet" in df_proc.columns:
    pd.to_numeric(df_proc["square_feet"], errors="coerce").to_frame().boxplot(ax=axes[1]); axes[1].set_title("square_feet (after)")
plt.tight_layout(); plt.show()

### 8) Train/Val/Test split (70/15/15) and standardization demo (for KNN)

In [None]:
y = df_proc["price_monthly"].astype(float)
X = df_proc.drop(columns=["price_monthly"])

X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1765, random_state=42)

scaler = StandardScaler()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()

X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val_scaled[num_cols]   = scaler.transform(X_val[num_cols])
X_test_scaled[num_cols]  = scaler.transform(X_test[num_cols])

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)

# show before vs after scaling (if available)
cand = [c for c in ["square_feet","price_per_sqft","bedrooms","bathrooms"] if c in num_cols]
if cand:
    v = cand[0]
    fig, axes = plt.subplots(1,2, figsize=(12,3))
    X_train[v].plot(kind="hist", bins=50, ax=axes[0]); axes[0].set_title(f"{v} (before scale)")
    pd.Series(X_train_scaled[v]).plot(kind="hist", bins=50, ax=axes[1]); axes[1].set_title(f"{v} (z-scored)")
    plt.tight_layout(); plt.show()

## P2–P3. Modeling and hyperparameter selection

In [None]:
def eval_metrics(y_true, y_pred):
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MAPE_%": mean_absolute_percentage_error(y_true, y_pred)*100,
        "RMSE": np.sqrt(((y_true - y_pred)**2).mean()),
        "R2": r2_score(y_true, y_pred)
    }

all_results = []

# Baseline (mean)
baseline_val = np.full_like(y_val, y_train.mean())
m = eval_metrics(y_val, baseline_val)
all_results.append({"model":"Baseline","params":"mean(y_train)","split":"Val", **m})

# KNN (scaled)
for k in [3,5,10]:
    mdl = KNeighborsRegressor(n_neighbors=k)
    mdl.fit(X_train_scaled, y_train)
    for split, X_, y_ in [("Train", X_train_scaled, y_train), ("Val", X_val_scaled, y_val)]:
        pred = mdl.predict(X_)
        all_results.append({"model":"KNN","params":f"k={k}","split":split, **eval_metrics(y_, pred)})

# Decision Tree
for depth in [5,10,None]:
    for leaf in [1,5,10]:
        mdl = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf, random_state=42)
        mdl.fit(X_train, y_train)
        for split, X_, y_ in [("Train", X_train, y_train), ("Val", X_val, y_val)]:
            pred = mdl.predict(X_)
            all_results.append({"model":"DecisionTree","params":f"depth={depth},leaf={leaf}","split":split, **eval_metrics(y_, pred)})

# Random Forest
for n in [50,100,200]:
    for leaf in [1,2]:
        mdl = RandomForestRegressor(n_estimators=n, min_samples_leaf=leaf, random_state=42, n_jobs=-1)
        mdl.fit(X_train, y_train)
        for split, X_, y_ in [("Train", X_train, y_train), ("Val", X_val, y_val)]:
            pred = mdl.predict(X_)
            all_results.append({"model":"RandomForest","params":f"n={n},leaf={leaf}","split":split, **eval_metrics(y_, pred)})

results_df = pd.DataFrame(all_results)
print("Validation results (best 6 per model by MAE):")
display(results_df[results_df["split"]=="Val"].sort_values(["model","MAE"]).groupby("model").head(6))

In [None]:
best = {}
for mdl in results_df["model"].unique():
    sub = results_df[(results_df["model"]==mdl) & (results_df["split"]=="Val")]
    if len(sub)==0: continue
    best[mdl] = sub.sort_values("MAE").iloc[0]["params"]
best

In [None]:
# Retrain best params on Train+Val, evaluate on Test
X_trv = pd.concat([X_train, X_val], axis=0)
y_trv = pd.concat([y_train, y_val], axis=0)
X_trv_scaled = pd.concat([X_train_scaled, X_val_scaled], axis=0)

final = []

# Baseline
baseline_test = np.full_like(y_test, y_trv.mean())
final.append({"model":"Baseline","params":"mean(tr+val)","split":"Test", **eval_metrics(y_test, baseline_test)})

# KNN
if "KNN" in best:
    k = int(best["KNN"].split("=")[1])
    mdl = KNeighborsRegressor(n_neighbors=k).fit(X_trv_scaled, y_trv)
    final.append({"model":"KNN","params":best["KNN"],"split":"Test", **eval_metrics(y_test, mdl.predict(X_test_scaled))})

# Decision Tree
if "DecisionTree" in best:
    parts = dict(p.split("=") for p in best["DecisionTree"].split(","))
    depth = None if parts["depth"]=="None" else int(parts["depth"])
    leaf = int(parts["leaf"])
    mdl = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf, random_state=42).fit(X_trv, y_trv)
    final.append({"model":"DecisionTree","params":best["DecisionTree"],"split":"Test", **eval_metrics(y_test, mdl.predict(X_test))})

# Random Forest
if "RandomForest" in best:
    parts = dict(p.split("=") for p in best["RandomForest"].split(","))
    n = int(parts["n"]); leaf = int(parts["leaf"])
    mdl = RandomForestRegressor(n_estimators=n, min_samples_leaf=leaf, random_state=42, n_jobs=-1).fit(X_trv, y_trv)
    final.append({"model":"RandomForest","params":best["RandomForest"],"split":"Test", **eval_metrics(y_test, mdl.predict(X_test))})

final_df = pd.DataFrame(final)
print("Final Test metrics:")
display(final_df)

### Random Forest feature importances (optional)

In [None]:
if "RandomForest" in best:
    parts = dict(p.split("=") for p in best["RandomForest"].split(","))
    n = int(parts["n"]); leaf = int(parts["leaf"])
    mdl = RandomForestRegressor(n_estimators=n, min_samples_leaf=leaf, random_state=42, n_jobs=-1)\
            .fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
    imp = pd.DataFrame({"feature": X_train.columns, "importance": mdl.feature_importances_})\
            .sort_values("importance", ascending=False)
    display(imp.head(20))

---
### References (for convenience)
- KNN (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html  
- Decision Trees (scikit-learn): https://scikit-learn.org/stable/modules/tree.html  
- Random Forest (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html  
- Dataset: https://archive.ics.uci.edu/dataset/555/apartment+for+rent+classified
