# 42177 — PCA → SVM (clean + degraded), manifest-driven
- Global streaming for StandardScaler + IncrementalPCA
- BATCH=64, NCOMP=64
- Batch transform to avoid large allocations
- Results are saved as JSON files to artifacts/

In [None]:
# %% 0) Imports and config
import os, json, time, random, numpy as np, pandas as pd
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score

# Paths
# Every location is derived from one project root. Set the PROJECT_42177_ROOT
# environment variable to run the notebook on another machine; when it is not
# set, the original local Windows path is used.
PROJECT_ROOT = os.environ.get("PROJECT_42177_ROOT", r"D:\LocalUser\42177 Project")
MANIFEST   = os.path.join(PROJECT_ROOT, "data", "manifest.csv")
ROOT_CLEAN = os.path.join(PROJECT_ROOT, "data", "clean")
ROOT_DEG   = os.path.join(PROJECT_ROOT, "data", "degraded")
ART        = os.path.join(PROJECT_ROOT, "artifacts")
os.makedirs(ART, exist_ok=True)  # result JSON files are written here

# Hyperparameters
TARGET_HW  = (256, 256)  # size every image is resized to before flattening
SEED       = 42177       # seed for Python's and NumPy's global RNGs
BATCH      = 64          # images per streamed batch
NCOMP      = 64          # number of PCA components kept

random.seed(SEED); np.random.seed(SEED)
# File extensions (matched case-insensitively) that count as images.
EXTS = ('.png','.jpg','.jpeg','.tif','.tiff','.bmp')


In [2]:
# %% 1) Labels from manifest (fixed order)
# Manifest columns expected by this notebook: id, filepath, label, subset, mag.
df = pd.read_csv(MANIFEST)
# Distinct label values, sorted in place so every later cell sees one class order.
LABELS = df["label"].unique().tolist()
LABELS.sort()
LABELS


['babesia',
 'leishmania',
 'plasmodium',
 'toxoplasma1000x',
 'toxoplasma400x',
 'trichomonad',
 'trypanosome']

In [3]:
# %% 2) I/O streaming (global stream)
def _list_images(folder):
    """Return the paths of the image files located directly inside `folder`.

    A file is treated as an image when its lower-cased name ends with one of
    the extensions in EXTS. If `folder` is not an existing directory the
    result is an empty list. Entries come back in os.listdir order.
    """
    found = []
    if os.path.isdir(folder):
        for name in os.listdir(folder):
            if name.lower().endswith(EXTS):
                found.append(os.path.join(folder, name))
    return found

def stream_clean_global(subset, batch=BATCH, target=TARGET_HW):
    """Yield (X, y) batches of flattened clean images for one subset.

    Collects the image files under ROOT_CLEAN/<subset>/<label>/ for every
    label in LABELS (in LABELS order, file names sorted within each label),
    then loads them `batch` at a time. Each image is converted to RGB,
    resized to `target`, cast to float32, divided by 255 and flattened.

    Parameters
    ----------
    subset : str
        Sub-folder of ROOT_CLEAN to read (the notebook passes "train"/"test").
    batch : int
        Maximum number of images per yielded batch; the last one may be smaller.
    target : tuple
        Size passed to PIL's ``Image.resize``.
        NOTE(review): PIL documents this as (width, height); that makes no
        difference for the square default, but confirm before using a
        non-square target.

    Yields
    ------
    (np.ndarray, np.ndarray)
        X with one flattened float32 image per row, and y with the matching
        label strings. Nothing is yielded when no image is found.
    """
    paths = []
    for lab in LABELS:
        d = os.path.join(ROOT_CLEAN, subset, lab)
        # _list_images returns [] for a missing folder. os.listdir order is
        # arbitrary, so sort to make the batch composition reproducible.
        paths += [(p, lab) for p in sorted(_list_images(d))]
    for i in range(0, len(paths), batch):
        Xb, yb = [], []
        for p, lab in paths[i:i+batch]:
            # Close the file handle as soon as the pixels are decoded;
            # convert()/resize() return in-memory copies.
            with Image.open(p) as im:
                rgb = im.convert("RGB").resize(target)
            arr = np.asarray(rgb, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1)); yb.append(lab)
        yield np.stack(Xb), np.array(yb)

def stream_degraded_global(cond, subset="test", batch=BATCH, target=TARGET_HW):
    """Yield (X, y) batches of flattened degraded images for one condition.

    Collects the image files under ROOT_DEG/<cond>/<subset>/<label>/ for every
    label in LABELS (in LABELS order, file names sorted within each label),
    then loads them `batch` at a time. Each image is converted to RGB,
    resized to `target`, cast to float32, divided by 255 and flattened.

    Parameters
    ----------
    cond : str
        Name of the degradation folder directly under ROOT_DEG.
    subset : str
        Sub-folder of the condition to read; defaults to "test".
    batch : int
        Maximum number of images per yielded batch; the last one may be smaller.
    target : tuple
        Size passed to PIL's ``Image.resize``.
        NOTE(review): PIL documents this as (width, height); that makes no
        difference for the square default, but confirm before using a
        non-square target.

    Yields
    ------
    (np.ndarray, np.ndarray)
        X with one flattened float32 image per row, and y with the matching
        label strings. Nothing is yielded when ROOT_DEG/<cond>/<subset> does
        not exist or contains no image.
    """
    base = os.path.join(ROOT_DEG, cond, subset)
    if not os.path.isdir(base):
        return
    paths = []
    for lab in LABELS:
        d = os.path.join(base, lab)
        # _list_images returns [] for a missing folder. os.listdir order is
        # arbitrary, so sort to make the batch composition reproducible.
        paths += [(p, lab) for p in sorted(_list_images(d))]
    for i in range(0, len(paths), batch):
        Xb, yb = [], []
        for p, lab in paths[i:i+batch]:
            # Close the file handle as soon as the pixels are decoded;
            # convert()/resize() return in-memory copies.
            with Image.open(p) as im:
                rgb = im.convert("RGB").resize(target)
            arr = np.asarray(rgb, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1)); yb.append(lab)
        # Each slice taken above holds at least one path, so Xb is never empty.
        yield np.stack(Xb), np.array(yb)


In [4]:
# %% 3) Metrics packer (aligned with evaluate_model.mlx)
def evaluate_and_pack(y_true, y_pred, labels=LABELS):
    """Compute classification metrics and pack them into a JSON-friendly dict.

    Ground truth, predictions and labels are all compared as strings.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, one entry per sample.
    labels : sequence
        Class order used for the confusion matrix and the macro averages.

    Returns
    -------
    dict
        Keys "Accuracy", "Precision", "Recall", "F1" (floats; the last three
        are macro-averaged with zero_division=0), "ConfusionMatrix" (nested
        lists) and "Labels" (list of str).
    """
    truth = np.asarray(y_true).astype(str)
    guess = np.asarray(y_pred).astype(str)
    label_names = [str(name) for name in labels]

    matrix = confusion_matrix(truth, guess, labels=label_names)
    # Fraction of samples whose predicted label equals the true label.
    hit_rate = (truth == guess).mean().item()
    macro_p, macro_r, macro_f1, _support = precision_recall_fscore_support(
        truth, guess, labels=label_names, average="macro", zero_division=0
    )

    packed = {}
    packed["Accuracy"] = float(hit_rate)
    packed["Precision"] = float(macro_p)
    packed["Recall"] = float(macro_r)
    packed["F1"] = float(macro_f1)
    packed["ConfusionMatrix"] = matrix.tolist()
    packed["Labels"] = label_names
    return packed


In [5]:
# %% 4) Fit StandardScaler + IncrementalPCA (global stream, low RAM)
from sklearn.utils import shuffle
import gc

# Pass 1: accumulate per-feature statistics one batch at a time.
scaler = StandardScaler(with_mean=True, with_std=True)
n_train_seen = 0
for Xb, _ in stream_clean_global("train"):
    scaler.partial_fit(Xb.astype(np.float32, copy=False))
    n_train_seen += Xb.shape[0]
    del Xb; gc.collect()
if n_train_seen == 0:
    # Fail here rather than with a not-fitted error in a later cell.
    raise RuntimeError("No training images were streamed; cannot fit the scaler.")
print("Scaler fitted (global).")

# Pass 2: fit the PCA on standardized batches.
# IncrementalPCA can reject a batch with fewer rows than n_components, so one
# batch is always held back: a short trailing batch is stacked onto the batch
# before it instead of being dropped from the fit.
ipca = IncrementalPCA(n_components=NCOMP, batch_size=BATCH)
pending_std = None
for Xb, _ in stream_clean_global("train"):
    Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
    if pending_std is None:
        pending_std = Xb_std
    elif pending_std.shape[0] >= NCOMP and Xb_std.shape[0] >= NCOMP:
        # Both are large enough on their own: flush the held batch, hold the new one.
        ipca.partial_fit(pending_std)
        pending_std = Xb_std
    else:
        # One of the two is too small to be fitted alone: merge them.
        pending_std = np.vstack([pending_std, Xb_std])
    del Xb, Xb_std; gc.collect()
if pending_std is None or pending_std.shape[0] < NCOMP:
    n_rows = 0 if pending_std is None else pending_std.shape[0]
    raise ValueError(
        f"Need at least {NCOMP} training images to fit IncrementalPCA, got {n_rows}."
    )
ipca.partial_fit(pending_std)
del pending_std; gc.collect()
print(f"IPCA fitted (n_components={NCOMP}).")


Scaler fitted (global).
IPCA fitted (n_components=64).


In [6]:
# %% 5) Transform TRAIN/TEST in streams → compact features
# Each clean subset is streamed once: standardize, project with the fitted PCA,
# and keep only the low-dimensional float32 features plus the labels.
Xtr_chunks, ytr_chunks = [], []
Xte_chunks, yte_chunks = [], []
for subset, feat_chunks, label_chunks in (
    ("train", Xtr_chunks, ytr_chunks),
    ("test", Xte_chunks, yte_chunks),
):
    for Xb, yb in stream_clean_global(subset):
        Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
        Xb_pca = ipca.transform(Xb_std)
        feat_chunks.append(Xb_pca.astype(np.float32, copy=False))
        label_chunks.append(yb)
    # Assemble the subset's arrays as soon as its stream is exhausted.
    if subset == "train":
        Xtr = np.vstack(feat_chunks); ytr = np.concatenate(label_chunks)
    else:
        Xte = np.vstack(feat_chunks); yte = np.concatenate(label_chunks)


## Train Linear SVM

In [8]:
from sklearn.svm import LinearSVC

In [None]:
# %% 6) Train Linear SVM (on PCA features)
import time

# Linear SVM with squared hinge loss, solved in the primal (dual=False).
svm = LinearSVC(
    C=1.0,
    loss='squared_hinge',
    dual=False,
    max_iter=2000,
)

print("Training Linear SVM...")
t_train = time.time()
svm.fit(Xtr, ytr)
# Wall-clock training time in seconds; reused when the clean results are saved.
train_secs = time.time() - t_train
print(f"Training completed in {train_secs/60:.2f} minutes")


Training Linear SVM...
Training completed in 0.04 minutes


## Evaluation

In [10]:
# %% 7) Evaluate on clean test set and save
# Time only the prediction call on the PCA-projected clean test features.
t_eval = time.time()
yhat = svm.predict(Xte)
test_secs = time.time() - t_eval

# Metrics plus both timings (rounded to 2 decimals) go into one result dict.
res_clean = evaluate_and_pack(yte, yhat, LABELS)
res_clean.update(
    TrainElapsedSeconds=round(train_secs, 2),
    TestElapsedSeconds=round(test_secs, 2),
)

out_clean = os.path.join(ART, "results_svm_clean.json")
with open(out_clean, "w") as f:
    f.write(json.dumps(res_clean, indent=2))
print("Saved:", out_clean)


Saved: D:\LocalUser\42177 Project\artifacts\results_svm_clean.json


In [11]:
# %% 8) Evaluate on degraded test sets (streaming)
# Every sub-directory of ROOT_DEG is treated as one degradation condition.
conds = []
for entry in os.listdir(ROOT_DEG):
    if os.path.isdir(os.path.join(ROOT_DEG, entry)):
        conds.append(entry)

for c in conds:
    y_true, y_pred = [], []
    any_batch = False
    # Predict batch by batch so the raw degraded images never sit in memory together.
    for Xb, yb in stream_degraded_global(c, "test", batch=BATCH):
        any_batch = True
        Xb_std = scaler.transform(Xb.astype(np.float32, copy=False))
        Xb_pca = ipca.transform(Xb_std)
        yhat_b = svm.predict(Xb_pca)
        y_true.extend(yb.tolist())
        y_pred.extend(yhat_b.tolist())
    if any_batch:
        resg = evaluate_and_pack(np.array(y_true), np.array(y_pred), LABELS)
        out = os.path.join(ART, f"results_svm_{c}.json")
        with open(out, "w") as f:
            json.dump(resg, f, indent=2)
        print("Saved:", out)
    else:
        # The stream yielded nothing for this condition.
        print(f"Skip {c}: no files.")


Saved: D:\LocalUser\42177 Project\artifacts\results_svm_gaussian_blur_s1.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_gaussian_blur_s2.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_gaussian_noise_s15.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_gaussian_noise_s5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_jpeg_q20.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_jpeg_q40.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_jpeg_q60.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_motion_blur_k5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_resolution_x2.json
Saved: D:\LocalUser\42177 Project\artifacts\results_svm_resolution_x4.json
