# Image Processing and Pattern Recognition - Group Assignment
# Feature Extraction + Decision Tree & Random Forest Classifiers

By: Shuntian Shi  
Purpose: This script reads the class labels from manifest.csv and streams the preprocessed images from disk,
trains both Decision Tree and Random Forest models,  
evaluates their performance, and visualizes the results.  


In [1]:
# %% 0) Imports and config
import os, json, time, random, gc
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score

# Filesystem layout: manifest CSV, clean / degraded image roots, and the
# folder that receives the result JSON files (created on first run).
MANIFEST = r"D:\LocalUser\42177 Project\data\manifest.csv"
ROOT_CLEAN = r"D:\LocalUser\42177 Project\data\clean"
ROOT_DEG = r"D:\LocalUser\42177 Project\data\degraded"
ART = r"D:\LocalUser\42177 Project\artifacts"
os.makedirs(ART, exist_ok=True)

# Shared hyperparameters (aligned with LR/SVM/kNN low-RAM defaults)
TARGET_HW = (256, 256)  # size every image is resized to before flattening
SEED = 42177            # one seed for Python, NumPy and both tree models
BATCH = 64              # images per streamed batch
NCOMP = 64              # number of PCA components kept

# Tree-model constructor arguments
DT_PARAMS = {
    "criterion": "gini",
    "max_depth": None,
    "random_state": SEED,
}
RF_PARAMS = {
    "n_estimators": 200,
    "max_depth": None,
    "n_jobs": -1,
    "random_state": SEED,
}

# Seed both global RNGs.
random.seed(SEED)
np.random.seed(SEED)

# File extensions (lower-case) that count as images when listing folders.
EXTS = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp")


In [2]:
# %% 1) Labels from manifest (fixed order)
# Load the manifest; it is expected to carry the columns
# id, filepath, label, subset, mag.
df = pd.read_csv(MANIFEST)

# Distinct class names, sorted in place so the class order is fixed.
LABELS = df["label"].unique().tolist()
LABELS.sort()
LABELS  # bare expression: shown as the notebook cell's output


['babesia',
 'leishmania',
 'plasmodium',
 'toxoplasma1000x',
 'toxoplasma400x',
 'trichomonad',
 'trypanosome']

In [3]:
# %% 2) Streaming IO helpers
def _list_images(folder):
    if not os.path.isdir(folder): return []
    return [os.path.join(folder,f) for f in os.listdir(folder) if f.lower().endswith(EXTS)]

def stream_clean_global(subset, batch=BATCH, target=TARGET_HW):
    """Yield ``(X, y)`` batches of flattened clean images for one subset.

    Images are collected from ``ROOT_CLEAN/<subset>/<label>/`` for every
    label in ``LABELS`` (label folders that do not exist are skipped), so
    the stream is ordered label by label and, within a label, by file name.

    Args:
        subset: Name of the sub-folder under ``ROOT_CLEAN`` to read.
        batch: Maximum number of images per yielded batch; must be >= 1.
        target: Size passed unchanged to ``PIL.Image.resize``.
            NOTE(review): ``resize`` expects ``(width, height)`` while the
            default's name suggests ``(height, width)``; this makes no
            difference for square targets - confirm before going non-square.

    Yields:
        ``(X, y)`` where ``X`` is a float32 array with one row per image
        (RGB pixels scaled to [0, 1] and flattened) and ``y`` is an array
        holding the matching label strings.

    Raises:
        ValueError: If *batch* is smaller than 1 (raised on first iteration).
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch!r}")

    paths = []
    for lab in LABELS:
        d = os.path.join(ROOT_CLEAN, subset, lab)
        if not os.path.isdir(d):
            continue
        # os.listdir order is arbitrary; sorting keeps the batch composition
        # reproducible from run to run.
        names = sorted(f for f in os.listdir(d) if f.lower().endswith(EXTS))
        paths.extend((os.path.join(d, f), lab) for f in names)

    for i in range(0, len(paths), batch):
        Xb, yb = [], []
        for p, lab in paths[i:i + batch]:
            # The context manager releases the file handle as soon as the
            # pixels have been decoded into the converted copy.
            with Image.open(p) as im:
                rgb = im.convert("RGB").resize(target)
            arr = np.asarray(rgb, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1))
            yb.append(lab)
        yield np.stack(Xb), np.array(yb)

def stream_degraded_global(cond, subset="test", batch=BATCH, target=TARGET_HW):
    """Yield ``(X, y)`` batches of flattened degraded images.

    Images are collected from ``ROOT_DEG/<cond>/<subset>/<label>/`` for every
    label in ``LABELS`` (label folders that do not exist are skipped), so
    the stream is ordered label by label and, within a label, by file name.
    Nothing is yielded when ``ROOT_DEG/<cond>/<subset>`` does not exist.

    Args:
        cond: Name of the degradation folder under ``ROOT_DEG``.
        subset: Sub-folder of the condition to read (default ``"test"``).
        batch: Maximum number of images per yielded batch; must be >= 1.
        target: Size passed unchanged to ``PIL.Image.resize``.
            NOTE(review): ``resize`` expects ``(width, height)`` while the
            default's name suggests ``(height, width)``; this makes no
            difference for square targets - confirm before going non-square.

    Yields:
        ``(X, y)`` where ``X`` is a float32 array with one row per image
        (RGB pixels scaled to [0, 1] and flattened) and ``y`` is an array
        holding the matching label strings.

    Raises:
        ValueError: If *batch* is smaller than 1 (raised on first iteration).
    """
    if batch < 1:
        raise ValueError(f"batch must be >= 1, got {batch!r}")

    base = os.path.join(ROOT_DEG, cond, subset)
    if not os.path.isdir(base):
        return

    paths = []
    for lab in LABELS:
        d = os.path.join(base, lab)
        if not os.path.isdir(d):
            continue
        # os.listdir order is arbitrary; sorting keeps the batch composition
        # reproducible from run to run.
        names = sorted(f for f in os.listdir(d) if f.lower().endswith(EXTS))
        paths.extend((os.path.join(d, f), lab) for f in names)

    for i in range(0, len(paths), batch):
        Xb, yb = [], []
        for p, lab in paths[i:i + batch]:
            # The context manager releases the file handle as soon as the
            # pixels have been decoded into the converted copy.
            with Image.open(p) as im:
                rgb = im.convert("RGB").resize(target)
            arr = np.asarray(rgb, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1))
            yb.append(lab)
        yield np.stack(Xb), np.array(yb)


In [4]:
# %% 3) Metrics packer (aligned with evaluate_model.mlx)
def evaluate_and_pack(y_true, y_pred, labels=LABELS):
    """Compute classification metrics and bundle them into a JSON-ready dict.

    Ground truth, predictions and the label list are all coerced to strings
    before scoring, so mixed label types compare consistently.

    Args:
        y_true: Ground-truth labels (any array-like).
        y_pred: Predicted labels, same length as *y_true*.
        labels: Class names fixing the row/column order of the confusion
            matrix and the set of classes averaged over.

    Returns:
        A dict with the keys ``Accuracy``, ``Precision``, ``Recall``, ``F1``
        (plain floats; the last three are macro averages with undefined
        per-class scores counted as 0), ``ConfusionMatrix`` (nested lists)
        and ``Labels`` (the stringified label list).
    """
    truth = np.asarray(y_true).astype(str)
    guess = np.asarray(y_pred).astype(str)
    class_names = list(map(str, labels))

    matrix = confusion_matrix(truth, guess, labels=class_names)
    accuracy = accuracy_score(truth, guess)
    macro = precision_recall_fscore_support(
        truth,
        guess,
        labels=class_names,
        average="macro",
        zero_division=0,
    )
    # The fourth element (support) is not reported.
    precision, recall, f_score = macro[0], macro[1], macro[2]

    packed = {}
    packed["Accuracy"] = float(accuracy)
    packed["Precision"] = float(precision)
    packed["Recall"] = float(recall)
    packed["F1"] = float(f_score)
    packed["ConfusionMatrix"] = matrix.tolist()
    packed["Labels"] = class_names
    return packed


In [6]:
# %% 4) Fit StandardScaler + IncrementalPCA (global stream, low RAM)
# Pass 1: accumulate per-feature mean/variance over the training stream.
scaler = StandardScaler(with_mean=True, with_std=True)
for Xb, _ in stream_clean_global("train"):
    scaler.partial_fit(Xb.astype(np.float32, copy=False))
    del Xb; gc.collect()
print("Scaler fitted (global stream).")

# Pass 2: fit the incremental PCA on the standardized stream.
# IncrementalPCA.partial_fit rejects a batch with fewer rows than
# n_components. Rather than dropping short batches (typically the final one),
# rows are accumulated in `tail` until there are at least NCOMP of them, and
# one full chunk (`ready`) is always held back so that any leftover rows can
# be merged into it at the end. At most about two chunks are kept in memory.
ipca = IncrementalPCA(n_components=NCOMP, batch_size=BATCH)
ready = None  # chunk with >= NCOMP rows waiting to be fitted
tail = None   # rows accumulated since `ready` was set (< NCOMP of them)
for Xb, _ in stream_clean_global("train"):
    Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
    tail = Xb_std if tail is None else np.vstack((tail, Xb_std))
    if tail.shape[0] >= NCOMP:
        if ready is not None:
            ipca.partial_fit(ready)
        ready, tail = tail, None
    del Xb, Xb_std; gc.collect()

if ready is None:
    # Fewer than NCOMP training images in total: the PCA cannot be fitted,
    # so stop here instead of failing later with an unfitted-model error.
    raise ValueError(
        f"IncrementalPCA needs at least {NCOMP} training samples; "
        "the 'train' stream provided fewer."
    )
if tail is not None:
    # Fold the short remainder into the held-back chunk so no sample is lost.
    ready = np.vstack((ready, tail))
ipca.partial_fit(ready)
del ready, tail; gc.collect()
print(f"IPCA fitted (n_components={NCOMP}).")


Scaler fitted (global stream).
IPCA fitted (n_components=64).


In [7]:
# %% 5) Transform TRAIN/TEST in streams → compact features
# Per-batch PCA features and labels, collected separately for each subset.
Xtr_chunks, ytr_chunks = [], []
Xte_chunks, yte_chunks = [], []

# Both subsets go through the same standardize -> project pipeline; only the
# destination lists differ.
for subset_name, feat_chunks, label_chunks in (
    ("train", Xtr_chunks, ytr_chunks),
    ("test", Xte_chunks, yte_chunks),
):
    for Xb, yb in stream_clean_global(subset_name):
        Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
        Xb_pca = ipca.transform(Xb_std)
        feat_chunks.append(Xb_pca.astype(np.float32, copy=False))
        label_chunks.append(yb)

# Stack the per-batch pieces into one feature matrix / label vector each.
Xtr = np.vstack(Xtr_chunks)
ytr = np.concatenate(ytr_chunks)
Xte = np.vstack(Xte_chunks)
yte = np.concatenate(yte_chunks)


In [8]:
# %% 6) Train Decision Tree and Random Forest
# Build both classifiers from the shared parameter dictionaries.
dt = DecisionTreeClassifier(**DT_PARAMS)
rf = RandomForestClassifier(**RF_PARAMS)

# Fit each model on the PCA features. Durations use time.perf_counter(),
# a monotonic clock, so they cannot be skewed by system clock adjustments.
print("Training Decision Tree...")
t0 = time.perf_counter()
dt.fit(Xtr, ytr)
dt_secs = time.perf_counter() - t0
print(f"DT training time: {dt_secs/60:.2f} min")

print("Training Random Forest...")
t0 = time.perf_counter()
rf.fit(Xtr, ytr)
rf_secs = time.perf_counter() - t0
print(f"RF training time: {rf_secs/60:.2f} min")


Training Decision Tree...
DT training time: 0.03 min
Training Random Forest...
RF training time: 0.06 min


In [9]:
# %% 7) Evaluate on clean test and save JSON
# Decision Tree: time the prediction on the clean test features, score it,
# attach the train/test durations (seconds, 2 decimals) and save as JSON.
# time.perf_counter() is a monotonic clock suited to measuring intervals.
t1 = time.perf_counter()
yhat_dt = dt.predict(Xte)
dt_test = time.perf_counter() - t1
res_dt = evaluate_and_pack(yte, yhat_dt, LABELS)
res_dt["TrainElapsedSeconds"] = round(dt_secs, 2)
res_dt["TestElapsedSeconds"] = round(dt_test, 2)
out_dt = os.path.join(ART, "results_tree_clean.json")
with open(out_dt, "w") as f:
    json.dump(res_dt, f, indent=2)
print("Saved:", out_dt)

# Random Forest: same procedure, written to its own result file.
t1 = time.perf_counter()
yhat_rf = rf.predict(Xte)
rf_test = time.perf_counter() - t1
res_rf = evaluate_and_pack(yte, yhat_rf, LABELS)
res_rf["TrainElapsedSeconds"] = round(rf_secs, 2)
res_rf["TestElapsedSeconds"] = round(rf_test, 2)
out_rf = os.path.join(ART, "results_rf_clean.json")
with open(out_rf, "w") as f:
    json.dump(res_rf, f, indent=2)
print("Saved:", out_rf)


Saved: D:\LocalUser\42177 Project\artifacts\results_tree_clean.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_clean.json


In [10]:
# %% 8) Evaluate on degraded test sets (streaming)
# Every sub-folder of ROOT_DEG is one degradation condition. Sorted so the
# conditions are processed in the same order on every run.
conds = sorted(
    d for d in os.listdir(ROOT_DEG) if os.path.isdir(os.path.join(ROOT_DEG, d))
)

# (file-name tag, fitted model) pairs; results go to results_<tag>_<cond>.json.
models = (("tree", dt), ("rf", rf))

for c in conds:
    # Stream the condition once and feed the same projected batch to both
    # models, so each image is read, standardized and projected only once.
    y_true = []
    y_pred = {tag: [] for tag, _ in models}
    for Xb, yb in stream_degraded_global(c, "test", batch=BATCH):
        Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
        Xb_pca = ipca.transform(Xb_std)
        y_true.extend(yb.tolist())
        for tag, model in models:
            y_pred[tag].extend(model.predict(Xb_pca).tolist())

    # A condition with no test images produces no result files.
    if not y_true:
        continue

    for tag, _ in models:
        res = evaluate_and_pack(np.array(y_true), np.array(y_pred[tag]), LABELS)
        out = os.path.join(ART, f"results_{tag}_{c}.json")
        with open(out, "w") as f:
            json.dump(res, f, indent=2)
        print("Saved:", out)


Saved: D:\LocalUser\42177 Project\artifacts\results_tree_gaussian_blur_s1.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_gaussian_blur_s1.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_gaussian_blur_s2.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_gaussian_blur_s2.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_gaussian_noise_s15.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_gaussian_noise_s15.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_gaussian_noise_s5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_gaussian_noise_s5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_jpeg_q20.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_jpeg_q20.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_jpeg_q40.json
Saved: D:\LocalUser\42177 Project\artifacts\results_rf_jpeg_q40.json
Saved: D:\LocalUser\42177 Project\artifacts\results_tree_jpeg_q60.json
Saved: D:\L