# 42177 — PCA → Logistic Regression (clean + degraded), manifest-driven
- Uses manifest and preprocessed data.
- Streams for scaler/PCA to avoid RAM spikes.
- Evaluates on clean test and all degraded test sets.

In [1]:
# %% [markdown]
# ## 0) Imports and global config

# %%
import os, json, time, random, numpy as np, pandas as pd
from glob import glob
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score


In [7]:

# ------------------------- CONFIG -------------------------
# Paths
MANIFEST = r"D:\LocalUser\42177 Project\data\manifest.csv"
ROOT_CLEAN = r"D:\LocalUser\42177 Project\data\clean"
ROOT_DEG = r"D:\LocalUser\42177 Project\data\degraded"
ART = r"D:\LocalUser\42177 Project\artifacts"

# hyperparameters
TARGET_HW = (256, 256)
SEED = 42177
BATCH = 1024          # streaming batch for scaler/IPCA/transform
NCOMP = 256           # PCA components

os.makedirs(ART, exist_ok=True)
random.seed(SEED); np.random.seed(SEED)

## 1) Manifest and label set
- Manifest defines the project’s class list and fixed splits.


In [8]:
# %%
df = pd.read_csv(MANIFEST)  # expects: id, filepath, label, subset, mag
LABELS = sorted(df['label'].unique().tolist())  # deterministic order
LABELS


['babesia',
 'leishmania',
 'plasmodium',
 'toxoplasma1000x',
 'toxoplasma400x',
 'trichomonad',
 'trypanosome']

## 2) I/O helpers
- Read RGB 256×256 images.
- Stream batches for train to reduce RAM.


In [9]:

# ------------------------- DATA HELPERS -------------------------
EXTS = ('.png','.jpg','.jpeg','.tif','.tiff','.bmp')

def _list_images(folder):
    if not os.path.isdir(folder): return []
    return [os.path.join(folder,f) for f in os.listdir(folder) if f.lower().endswith(EXTS)]

def manifest_labels(manifest_path):
    df = pd.read_csv(manifest_path)
    # use sorted labels for deterministic order across environments
    return sorted(df['label'].unique().tolist())

LABELS = manifest_labels(MANIFEST)

def stream_clean_subset(subset, target=TARGET_HW):
    """Yield (Xb, yb) batches from data/clean/<subset>/<label> with Xb in [0,1], flattened."""
    for lab in LABELS:
        folder = os.path.join(ROOT_CLEAN, subset, lab)
        paths = _list_images(folder)
        for i in range(0, len(paths), BATCH):
            chunk = paths[i:i+BATCH]
            Xb, yb = [], []
            for p in chunk:
                im = Image.open(p).convert("RGB").resize(target)
                arr = np.asarray(im, dtype=np.float32) / 255.0
                Xb.append(arr.reshape(-1))   # flatten 256*256*3
                yb.append(lab)
            if Xb:
                yield np.stack(Xb), np.array(yb)

def load_clean_subset_full(subset, target=TARGET_HW):
    """Load entire subset into memory (used after PCA transform, which is small)."""
    X, y = [], []
    for lab in LABELS:
        folder = os.path.join(ROOT_CLEAN, subset, lab)
        for p in _list_images(folder):
            im = Image.open(p).convert("RGB").resize(target)
            arr = np.asarray(im, dtype=np.float32) / 255.0
            X.append(arr.reshape(-1))
            y.append(lab)
    return np.stack(X), np.array(y)

def load_degraded_full(cond, subset="test", target=TARGET_HW):
    """Load degraded images for one condition into memory (then PCA-transform)."""
    X, y = [], []
    base = os.path.join(ROOT_DEG, cond, subset)
    if not os.path.isdir(base): return None, None
    for lab in LABELS:
        folder = os.path.join(base, lab)
        for p in _list_images(folder):
            im = Image.open(p).convert("RGB").resize(target)
            arr = np.asarray(im, dtype=np.float32) / 255.0
            X.append(arr.reshape(-1))
            y.append(lab)
    if not X: return None, None
    return np.stack(X), np.array(y)


## 3) Metric packer
- Accuracy, macro Precision/Recall/F1, and Confusion Matrix with fixed label order.


In [10]:

def evaluate_and_pack(y_true, y_pred, labels=LABELS):
    y_true = np.asarray(y_true).astype(str)
    y_pred = np.asarray(y_pred).astype(str)
    labels = [str(x) for x in labels]
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    return {
        "Accuracy": float(acc),
        "Precision": float(prec),
        "Recall": float(rec),
        "F1": float(f1),
        "ConfusionMatrix": cm.tolist(),
        "Labels": labels
    }


## 4) Fit StandardScaler and IncrementalPCA on train (streaming)
- Two passes: first for scaler, second for PCA on standardized batches.


In [None]:

# ------------------------- TRAIN (SCALER + IPCA + LR) -------------------------
t_total = time.time()

# 1) Fit scaler on train via streaming (no big RAM usage)
scaler = StandardScaler(with_mean=True, with_std=True)
n_seen = 0
for Xb, _ in stream_clean_subset("train"):
    scaler.partial_fit(Xb)
    n_seen += len(Xb)
print(f"Scaler fitted on {n_seen} samples.")

# 2) Fit IncrementalPCA (global stream, safe batch size)
BATCH = 1024
NCOMP = 128

def stream_clean_global(subset, batch=BATCH, target=TARGET_HW):
    """Yield flattened batches combining all classes (no small per-class batches)."""
    paths = []
    for lab in LABELS:
        d = os.path.join(ROOT_CLEAN, subset, lab)
        paths += [(os.path.join(d, f), lab)
                  for f in os.listdir(d) if f.lower().endswith(EXTS)]
    for i in range(0, len(paths), batch):
        chunk = paths[i:i+batch]
        Xb, yb = [], []
        for p, lab in chunk:
            im = Image.open(p).convert("RGB").resize(target)
            arr = np.asarray(im, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1))
            yb.append(lab)
        yield np.stack(Xb), np.array(yb)

# 1st pass — fit scaler incrementally
scaler = StandardScaler(with_mean=True, with_std=True)
for Xb, _ in stream_clean_global("train"):
    scaler.partial_fit(Xb)
print("Scaler fitted (global stream).")

# 2nd pass — fit IncrementalPCA incrementally
ipca = IncrementalPCA(n_components=NCOMP, batch_size=BATCH)
for Xb, _ in stream_clean_global("train"):
    Xb_std = scaler.transform(Xb)
    if Xb_std.shape[0] < NCOMP:   # ensure n_samples ≥ n_components
        continue
    ipca.partial_fit(Xb_std)
print(f"IPCA fitted (n_components={NCOMP}).")


Scaler fitted on 14390 samples.
Scaler fitted (global stream).
IPCA fitted (n_components=128).


In [14]:
# 3) Transform TRAIN/TEST in streams → keep only PCA features (N × NCOMP)
Xtr_chunks, ytr_chunks = [], []
for Xb, yb in stream_clean_global("train"):
    Xb = Xb.astype(np.float32, copy=False)
    Xb_std = scaler.transform(Xb)
    Xb_pca = ipca.transform(Xb_std)
    Xtr_chunks.append(Xb_pca.astype(np.float32, copy=False))
    ytr_chunks.append(yb)
Xtr = np.vstack(Xtr_chunks)
ytr = np.concatenate(ytr_chunks)

Xte_chunks, yte_chunks = [], []
for Xb, yb in stream_clean_global("test"):
    Xb = Xb.astype(np.float32, copy=False)
    Xb_std = scaler.transform(Xb)
    Xb_pca = ipca.transform(Xb_std)
    Xte_chunks.append(Xb_pca.astype(np.float32, copy=False))
    yte_chunks.append(yb)
Xte = np.vstack(Xte_chunks)
yte = np.concatenate(yte_chunks)

# 4) Train multinomial Logistic Regression on PCA features
clf = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    multi_class="multinomial",
    n_jobs=-1,
    random_state=SEED
)

print("Training Logistic Regression...")
t_train = time.time()
clf.fit(Xtr, ytr)
train_secs = time.time() - t_train
print(f"Training completed in {train_secs/60:.2f} minutes")


Training Logistic Regression...
Training completed in 0.12 minutes


## 5) Evaluate on clean test and save


In [15]:
# ------------------------- EVAL: CLEAN TEST -------------------------
t_eval = time.time()
yhat = clf.predict(Xte)
test_secs = time.time() - t_eval
res_clean = evaluate_and_pack(yte, yhat, LABELS)
res_clean["ElapsedTrainSeconds"] = round(train_secs, 2)
res_clean["ElapsedTestSeconds"]  = round(test_secs, 2)

with open(os.path.join(ART, "results_logreg_clean.json"), "w") as f:
    json.dump(res_clean, f, indent=2)
print("Saved:", os.path.join(ART, "results_logreg_clean.json"))



Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_clean.json


## 7) Evaluate on all degraded test sets (if present)
- Loads each condition, transforms with same scaler/PCA, evaluates, saves.


In [18]:
# ------------------------- EVAL: DEGRADED TESTS (streaming, low RAM) -------------------------
def stream_degraded_global(cond, subset="test", batch=BATCH, target=TARGET_HW):
    paths = []
    base = os.path.join(ROOT_DEG, cond, subset)
    if not os.path.isdir(base):
        return
    for lab in LABELS:
        d = os.path.join(base, lab)
        if not os.path.isdir(d):
            continue
        paths += [(os.path.join(d, f), lab) for f in os.listdir(d) if f.lower().endswith(EXTS)]
    for i in range(0, len(paths), batch):
        chunk = paths[i:i+batch]
        Xb, yb = [], []
        for p, lab in chunk:
            im = Image.open(p).convert("RGB").resize(target)
            arr = np.asarray(im, dtype=np.float32) / 255.0
            Xb.append(arr.reshape(-1)); yb.append(lab)
        if Xb:
            # ensure we yield float32 arrays (avoids accidental upcast to float64)
            yield np.stack(Xb).astype(np.float32, copy=False), np.array(yb)

conds = [d for d in os.listdir(ROOT_DEG) if os.path.isdir(os.path.join(ROOT_DEG, d))]

# Use a smaller local batch for evaluation to avoid large temporary allocations.
# The global BATCH used for training can remain large, but evaluation can use e.g. 128.
EVAL_BATCH = min(BATCH, 128)

for c in conds:
    y_true, y_pred = [], []
    any_batch = False
    for Xb, yb in stream_degraded_global(c, "test", batch=EVAL_BATCH):
        any_batch = True
        # Xb is already float32 from the generator; keep it that way to reduce memory usage.
        Xb_std = scaler.transform(Xb)
        Xb_pca = ipca.transform(Xb_std)
        yhat_b = clf.predict(Xb_pca)
        y_true.extend(yb.tolist())
        y_pred.extend(yhat_b.tolist())

    if not any_batch:
        print(f"Skip {c}: no files.")
        continue

    resg = evaluate_and_pack(np.array(y_true), np.array(y_pred), LABELS)
    out = os.path.join(ART, f"results_logreg_{c}.json")
    with open(out, "w") as f:
        json.dump(resg, f, indent=2)
    print("Saved:", out)

print(f"Total wall time: {(time.time()-t_total)/60:.1f} min")

Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_gaussian_blur_s1.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_gaussian_blur_s2.0.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_gaussian_noise_s15.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_gaussian_noise_s5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_jpeg_q20.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_jpeg_q40.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_jpeg_q60.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_motion_blur_k5.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_resolution_x2.json
Saved: D:\LocalUser\42177 Project\artifacts\results_logreg_resolution_x4.json
Total wall time: 75.5 min
