In [15]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import hog
from skimage import feature
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import joblib
import json
import random

from pathlib import Path

from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

import glob

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold
from torchvision.transforms import ToPILImage

import matplotlib.pyplot as plt
%matplotlib inline

# importing malaria dataset class to map bounding boxes on one image
# and skipping any null values with detection collate
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)
from malaria_dataset import MalariaDataset, detection_collate

In [16]:
# Paths
root_path = os.path.join('..', 'dataset', 'malaria')
train_base_path = os.path.join(root_path, 'training_ds')
test_base_path = os.path.join(root_path, 'testing_ds')
image_path = os.path.join(root_path, 'images')
train_json_path = os.path.join(root_path, 'training.json')
test_json_path = os.path.join(root_path, 'test.json')

FEATURES_DIR = os.path.join(root_path, 'extracted_features')
os.makedirs(FEATURES_DIR, exist_ok=True)

image_sizes = [128]
print("Root Path:", root_path)
print("Train Base Path:", train_base_path)
print("Test Base Path:", test_base_path)

Root Path: ..\dataset\malaria
Train Base Path: ..\dataset\malaria\training_ds
Test Base Path: ..\dataset\malaria\testing_ds


### Model Training

In [17]:
# --- Create a directory to store the trained models ---
MODELS_DIR = os.path.join('.', 'trained_models')
os.makedirs(MODELS_DIR, exist_ok=True)

experiment_results = []
feature_files = glob.glob(os.path.join(FEATURES_DIR, "*.pkl"))

if not feature_files:
    print("ERROR: No feature files found!")
    print(f"Please run the Feature Extraction cell first to create .pkl files in: {FEATURES_DIR}")

# --- Main Training Loop ---
for file_path in feature_files:
    filename = os.path.basename(file_path)
    # Correctly unpack filename assuming format "size_extractorName_features.pkl"
    size, extractor_name, _ = filename.split('_', 2)
    
    print(f"\n{'='*25}")
    print(f"RUNNING EXPERIMENT")
    print(f"Image Size: {size}x{size} | Feature Extractor: {extractor_name}")
    print(f"{'='*25}")
    
    # --- 1. Load Pre-computed Feature Data ---
    data = joblib.load(file_path)
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']
    le = data['label_encoder']
    
    # --- 2. Build, Train, or Load the Pipeline ---
    # MODIFIED: Changed model name in path to 'rf_model' for clarity
    model_path = os.path.join(MODELS_DIR, f"{size}_{extractor_name}_rf_model.pkl")

    # --- Check if the model is already trained ---
    if os.path.exists(model_path):
        print(f"Loading pre-trained model from: {model_path}")
        pipeline = joblib.load(model_path)
    else:
        # If model not found, define the pipeline and train it
        print("Pre-trained model not found. Training a new model...")
        
        # --- MODIFICATION START ---
        # Replaced the SVM pipeline with a RandomForest pipeline
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Scaling is less critical for RF but kept for consistency
            ('smote', SMOTE(random_state=42)),
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)) # Using RandomForest
        ])
        # --- MODIFICATION END ---
        
        pipeline.fit(X_train, y_train) # This is the training line
        
        # --- Save the newly trained pipeline ---
        print(f"Saving trained model to: {model_path}")
        joblib.dump(pipeline, model_path)

    # --- 3. Evaluation ---
    print("\n--- Evaluation Results ---")
    predictions = pipeline.predict(X_test)
    report = classification_report(y_test, predictions, target_names=le.classes_, output_dict=True)
    print(classification_report(y_test, predictions, target_names=le.classes_))

    # --- 4. Store Results for Final Summary ---
    experiment_results.append({
        'image_size': size,
        'feature_extractor': extractor_name,
        'accuracy': report['accuracy'],
        'f1_score_weighted': report['weighted avg']['f1-score']
    })

# --- 5. Display Final Summary Table ---
if experiment_results:
    print(f"\n{'='*30}")
    print("FINAL EXPERIMENT SUMMARY")
    print(f"{'='*30}")
    results_df = pd.DataFrame(experiment_results)
    results_df = results_df.sort_values(by='f1_score_weighted', ascending=False)
    display(results_df)
    
    summary_path = 'model_experiment_summary.csv'
    results_df.to_csv(summary_path, index=False)
    print(f"\nSummary saved to {summary_path}")


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HIST
Loading pre-trained model from: .\trained_models\128_HIST_rf_model.pkl

--- Evaluation Results ---
                precision    recall  f1-score   support

     difficult       0.17      0.06      0.09        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       1.00      0.81      0.89        21
red_blood_cell       0.96      1.00      0.98      6869
          ring       1.00      0.01      0.01       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.77      0.32      0.45       168

      accuracy                           0.95      7273
     macro avg       0.56      0.31      0.35      7273
  weighted avg       0.95      0.95      0.94      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HOG


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Loading pre-trained model from: .\trained_models\128_HOG_rf_model.pkl

--- Evaluation Results ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                precision    recall  f1-score   support

     difficult       0.00      0.00      0.00        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       0.82      0.43      0.56        21
red_blood_cell       0.95      1.00      0.97      6869
          ring       0.00      0.00      0.00       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.58      0.11      0.19       168

      accuracy                           0.95      7273
     macro avg       0.33      0.22      0.25      7273
  weighted avg       0.91      0.95      0.92      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: LBP
Loading pre-trained model from: .\trained_models\128_LBP_rf_model.pkl

--- Evaluation Results ---
                precision    recall  f1-score   support

     difficult       0.00      0.06      0.01        16
    gametocyte       0.04      0.14      0.07        14
     leukocyte       0.38      0.38      0.38    

Unnamed: 0,image_size,feature_extractor,accuracy,f1_score_weighted
0,128,HIST,0.953664,0.936408
1,128,HOG,0.94624,0.924316
2,128,LBP,0.87543,0.903663



Summary saved to model_experiment_summary.csv


## Predictions

In [18]:
OUTPUT_DIR    = os.path.join('.', 'detections')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# If your training labels used these common names, we'll auto-pick the positive label.
MALARIA_POSITIVE_ALIASES = {'ring', 'trophozoite', 'schizont', 'gametocyte'}
COLOR_HDR_BG = (32, 32, 32)
COLOR_HDR_TXT = (255, 255, 255)

In [19]:
# =========================
# ENHANCED MALARIA DETECTOR
# =========================
# Works with your saved RandomForest pipeline:
#   trained_models/{size}_{extractor}_rf_model.pkl
# and your features file:
#   ../dataset/malaria/extracted_features/{size}_{extractor}_features.pkl
#
# Output:
#  - detections/<img>__detections.png         (boxes)
#  - detections/vis/<img>__gt_vs_pred.png     (header + boxes)
#  - detections/<img>__heatmap.png            (prob heatmap overlay)
#  - detections/vis/<img>__topK.png           (mosaic of top windows)
#  - detections/vis/report.csv                (per-image summary)

import os, re, glob, math, json, warnings
from pathlib import Path
import numpy as np
import cv2
import joblib

# Make sure these imports exist so joblib can unpickle your pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from skimage.feature import hog, local_binary_pattern
from skimage.color import rgb2gray

# -------------------------
# PATHS (kept consistent with your notebook)
# -------------------------
ROOT_PATH       = os.path.join('..', 'dataset', 'malaria')
TRAIN_DIR       = os.path.join(ROOT_PATH, 'training_ds')
TEST_DIR        = os.path.join(ROOT_PATH, 'testing_ds')
FEATURES_DIR    = os.path.join(ROOT_PATH, 'extracted_features')
MODELS_DIR      = os.path.join('.', 'trained_models')
OUTPUT_DIR      = os.path.join('.', 'detections')
VIS_DIR         = os.path.join(OUTPUT_DIR, 'vis')
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(VIS_DIR, exist_ok=True)

# For GT inference if you don't have JSON: parent folder name
POSITIVE_ALIASES = {'parasitized', 'infected', 'positive', 'malaria', 'parasite', 'malaria_cell'}

# -------------------------
# Small drawing helpers
# -------------------------
def _put_text(img, text, org, scale=0.6, color=(255,255,255), thickness=1, bg=(0,0,0)):
    pad = 2
    font = cv2.FONT_HERSHEY_SIMPLEX
    (tw, th), baseline = cv2.getTextSize(text, font, scale, thickness)
    x, y = org
    cv2.rectangle(img, (x, y - th - 2*pad), (x + tw + 2*pad, y + baseline), bg, -1)
    cv2.putText(img, text, (x + pad, y - pad), font, scale, color, thickness, cv2.LINE_AA)

def _draw_header(img_bgr, left_text, scale=0.7):
    out = img_bgr.copy()
    H, W = out.shape[:2]
    bar_h = max(30, int(30 * scale))
    cv2.rectangle(out, (0, 0), (W, bar_h + 10), (32, 32, 32), -1)
    _put_text(out, left_text, (10, bar_h), scale=scale, color=(255,255,255), bg=(32,32,32))
    return out

# -------------------------
# Non-Max Suppression
# -------------------------
def nms(boxes, scores, iou_threshold=0.3, top_k=None):
    if not boxes:
        return []
    boxes = np.asarray(boxes, dtype=float)
    scores = np.asarray(scores, dtype=float)
    x1, y1, x2, y2 = boxes.T
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        if top_k is not None and len(keep) >= top_k:
            break
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-9)
        inds = np.where(iou <= iou_threshold)[0]
        order = order[inds + 1]
    return keep

# -------------------------
# Pyramid + sliding window (handles upsampling tiny images)
# -------------------------
def image_pyramid(img, scale=1.25, min_size=(64,64), window_size=(128,128)):
    H0, W0 = img.shape[:2]
    # Upsample if the original is smaller than the window
    scale_up = max(window_size[0] / W0, window_size[1] / H0, 1.0)
    if scale_up > 1.0:
        new_w = int(round(W0 * scale_up))
        new_h = int(round(H0 * scale_up))
        img_up = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        yield scale_up, img_up   # note: factor here is how many times larger than original
        current = img_up.copy()
        current_scale = scale_up
    else:
        yield 1.0, img.copy()
        current = img.copy()
        current_scale = 1.0

    while True:
        new_w = int(current.shape[1] / scale)
        new_h = int(current.shape[0] / scale)
        if new_w < min_size[0] or new_h < min_size[1]:
            break
        current = cv2.resize(current, (new_w, new_h), interpolation=cv2.INTER_AREA)
        current_scale /= scale
        yield current_scale, current

def sliding_window(image, window_size, step=16):
    H, W = image.shape[:2]
    wW, wH = window_size
    if W < wW or H < wH:
        return
    for y in range(0, H - wH + 1, step):
        for x in range(0, W - wW + 1, step):
            patch = image[y:y+wH, x:x+wW]
            yield x, y, patch

# -------------------------
# Auto-configure HOG to match your trained pipeline
# -------------------------
def get_expected_feature_len(pipeline):
    exp = None
    if 'scaler' in pipeline.named_steps and hasattr(pipeline.named_steps['scaler'], 'n_features_in_'):
        exp = int(pipeline.named_steps['scaler'].n_features_in_)
    elif 'rf' in pipeline.named_steps and hasattr(pipeline.named_steps['rf'], 'n_features_in_'):
        exp = int(pipeline.named_steps['rf'].n_features_in_)
    return exp

def hog_feature_len(size, orientations, ppc, cpb):
    # compute HOG length by running once on zeros
    dummy = np.zeros((size, size), dtype=np.float32)
    feats = hog(dummy,
                orientations=orientations,
                pixels_per_cell=ppc,
                cells_per_block=cpb,
                block_norm='L2-Hys',
                transform_sqrt=False,
                feature_vector=True)
    return feats.size

def auto_configure_hog(model_size, expected_len):
    # Try a few common combos; add more if you trained differently
    candidates = [
        dict(orientations=9, pixels_per_cell=(16,16), cells_per_block=(2,2)),
        dict(orientations=9, pixels_per_cell=(8,8),   cells_per_block=(2,2)),
        dict(orientations=8, pixels_per_cell=(16,16), cells_per_block=(2,2)),
        dict(orientations=9, pixels_per_cell=(32,32), cells_per_block=(2,2)),
    ]
    for cand in candidates:
        if hog_feature_len(model_size, cand['orientations'], cand['pixels_per_cell'], cand['cells_per_block']) == expected_len:
            return cand
    # Fallback: stick with the classic 1764-dim (for 128, 9, 16,2,2) if expected is close:
    default = dict(orientations=9, pixels_per_cell=(16,16), cells_per_block=(2,2))
    warnings.warn(f"[HOG] Could not auto-match expected feature length {expected_len}. Using default {default}.")
    return default

# -------------------------
# Load model + label encoder
# -------------------------
def load_pipeline_and_label_encoder(model_size, extractor_name):
    size_str = str(int(model_size))
    model_path = os.path.join(MODELS_DIR, f"{size_str}_{extractor_name}_rf_model.pkl")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found: {model_path}")

    pipeline = joblib.load(model_path)

    # Try to get label encoder from features file (preferred)
    feats_path = os.path.join(FEATURES_DIR, f"{size_str}_{extractor_name}_features.pkl")
    le = None
    if os.path.exists(feats_path):
        data = joblib.load(feats_path)
        le = data.get('label_encoder', None)

    # Positive class inference (requires label names ideally)
    if le is not None:
        pos_label_name = None
        for name in le.classes_:
            if name.strip().lower() in POSITIVE_ALIASES or name in {'Parasitized', 'Infected'}:
                pos_label_name = name
                break
        if pos_label_name is None and len(le.classes_) >= 2:
            pos_label_name = le.classes_[0]
    else:
        pos_label_name = 'Parasitized'  # sensible default for NIH malaria

    # Map pos label to the rf.classes_ index
    rf = pipeline.named_steps.get('rf', None)
    if rf is None:
        raise RuntimeError("Pipeline does not have a step named 'rf'.")
    if le is not None:
        encoded_pos = le.transform([pos_label_name])[0]
    else:
        # Fallback guess if we lack a LabelEncoder
        encoded_pos = 1 if hasattr(rf, 'classes_') and 1 in rf.classes_ else rf.classes_[0]

    rf_class_order = list(rf.classes_)
    if encoded_pos not in rf_class_order:
        raise RuntimeError(f"Positive encoded label {encoded_pos} not in rf.classes_={rf_class_order}")
    pos_index = rf_class_order.index(encoded_pos)

    # Configure HOG to match expected feature length
    expected_len = get_expected_feature_len(pipeline)
    if expected_len is None:
        expected_len = 1764  # common for 128/HOG(16,2,2)
    hog_cfg = auto_configure_hog(int(model_size), expected_len)

    return pipeline, le, pos_label_name, pos_index, hog_cfg

# -------------------------
# Feature extraction (HOG/LBP; HOG is default for your RF)
# -------------------------
def extract_features(patch_bgr, size, extractor_name, hog_cfg):
    if patch_bgr.shape[:2] != (size, size):
        patch_bgr = cv2.resize(patch_bgr, (size, size), interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(patch_bgr, cv2.COLOR_BGR2GRAY).astype(np.float32)
    gray /= 255.0

    if extractor_name.lower() == 'hog':
        feats = hog(gray,
                    orientations=hog_cfg['orientations'],
                    pixels_per_cell=hog_cfg['pixels_per_cell'],
                    cells_per_block=hog_cfg['cells_per_block'],
                    block_norm='L2-Hys',
                    transform_sqrt=False,
                    feature_vector=True)
        return feats.astype(np.float32)

    elif extractor_name.lower() == 'lbp':
        P, R = 8, 1
        lbp = local_binary_pattern(gray, P=P, R=R, method='uniform')
        hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, P+3), range=(0, P+2), density=True)
        return hist.astype(np.float32)

    elif extractor_name.lower() in {'hog_lbp', 'lbp_hog'}:
        feats_hog = hog(gray,
                        orientations=hog_cfg['orientations'],
                        pixels_per_cell=hog_cfg['pixels_per_cell'],
                        cells_per_block=hog_cfg['cells_per_block'],
                        block_norm='L2-Hys',
                        transform_sqrt=False,
                        feature_vector=True)
        P, R = 8, 1
        lbp = local_binary_pattern(gray, P=P, R=R, method='uniform')
        hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, P+3), range=(0, P+2), density=True)
        return np.hstack([feats_hog.astype(np.float32), hist.astype(np.float32)])

    else:
        raise ValueError(f"Unknown extractor '{extractor_name}'.")

# -------------------------
# Core detection
# -------------------------
def detect_image(
    image_path,
    model_size=128,
    extractor_name='hog',
    step=None,                 # if None -> model_size//4
    pyramid_scale=1.25,
    prob_threshold=0.80,
    nms_iou=0.25,
    top_k=None
):
    pipeline, le, pos_label_name, pos_index, hog_cfg = load_pipeline_and_label_encoder(model_size, extractor_name)

    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    H0, W0 = img.shape[:2]

    win_size = (int(model_size), int(model_size))
    if step is None:
        step = max(8, int(model_size // 4))

    # Soft heat accumulation at window centers
    prob_accum = np.zeros((H0, W0), dtype=np.float32)
    prob_count = np.zeros((H0, W0), dtype=np.float32)

    boxes, scores = [], []
    rf = pipeline.named_steps['rf']

    # scan pyramid
    for scale_factor, im_scaled in image_pyramid(img, scale=pyramid_scale, min_size=win_size, window_size=win_size):
        for x, y, patch in sliding_window(im_scaled, window_size=win_size, step=step):
            feats = extract_features(patch, size=model_size, extractor_name=extractor_name, hog_cfg=hog_cfg).reshape(1, -1)
            proba = pipeline.predict_proba(feats)[0]  # RandomForest supports this
            p_pos = float(proba[pos_index])

            # record box if above threshold
            if p_pos >= prob_threshold:
                x1 = int(x / scale_factor)
                y1 = int(y / scale_factor)
                x2 = int((x + win_size[0]) / scale_factor)
                y2 = int((y + win_size[1]) / scale_factor)
                # clamp
                x1 = max(0, min(W0-1, x1)); x2 = max(0, min(W0-1, x2))
                y1 = max(0, min(H0-1, y1)); y2 = max(0, min(H0-1, y2))

                boxes.append([x1, y1, x2, y2])
                scores.append(p_pos)

            # heatmap accumulation
            cx = int((x + win_size[0] / 2) / scale_factor)
            cy = int((y + win_size[1] / 2) / scale_factor)
            if 0 <= cx < W0 and 0 <= cy < H0:
                prob_accum[cy, cx] += p_pos
                prob_count[cy, cx] += 1.0

    # average heat
    with np.errstate(invalid='ignore'):
        heatmap = np.where(prob_count > 0, prob_accum / np.maximum(prob_count, 1e-6), 0.0)

    # NMS
    keep = nms(boxes, scores, iou_threshold=nms_iou, top_k=top_k)
    boxes_nms = [boxes[i] for i in keep]
    scores_nms = [scores[i] for i in keep]

    has_malaria = len(boxes_nms) > 0

    # Save base overlays
    base = Path(image_path).stem
    overlay_path = os.path.join(OUTPUT_DIR, f"{base}__detections.png")
    heatmap_path = os.path.join(OUTPUT_DIR, f"{base}__heatmap.png")

    vis = img.copy()
    for (x1, y1, x2, y2), s in zip(boxes_nms, scores_nms):
        cv2.rectangle(vis, (x1, y1), (x2, y2), (0,255,0), 2)
        _put_text(vis, f"{pos_label_name} {s:.2f}", (x1, max(0, y1-5)), scale=0.6, color=(0,0,0), bg=(0,255,0))
    cv2.imwrite(overlay_path, vis)

    hm_norm = (heatmap / (heatmap.max() + 1e-6) * 255.0).astype(np.uint8)
    hm_color = cv2.applyColorMap(hm_norm, cv2.COLORMAP_JET)
    hm_overlay = cv2.addWeighted(img, 0.6, hm_color, 0.4, 0)
    cv2.imwrite(heatmap_path, hm_overlay)

    return {
        "has_malaria": bool(has_malaria),
        "positive_label": pos_label_name,
        "boxes": boxes_nms,
        "scores": scores_nms,
        "overlay_png": overlay_path,
        "heatmap_png": heatmap_path
    }

# -------------------------
# Visual: GT vs Pred header + top-k windows mosaic
# -------------------------
def infer_gt_from_parent(img_path):
    parent = Path(img_path).parent.name.strip().lower()
    is_pos = any(alias in parent for alias in POSITIVE_ALIASES)
    return "Positive" if is_pos else "Negative"

def save_gt_vs_pred_banner(image_path, detection_dict, out_dir=VIS_DIR):
    os.makedirs(out_dir, exist_ok=True)
    overlay = cv2.imread(detection_dict['overlay_png'])
    if overlay is None:
        overlay = cv2.imread(image_path)
    gt_label   = infer_gt_from_parent(image_path)
    pred_label = "Positive" if detection_dict['has_malaria'] else "Negative"
    header = f"GT: {gt_label}   |   Pred: {pred_label}   |   {detection_dict['positive_label']} windows: {len(detection_dict['boxes'])}"
    scale = max(0.6, min(overlay.shape[0], overlay.shape[1]) / 900.0)
    out = _draw_header(overlay, header, scale=scale)
    out_path = os.path.join(out_dir, f"{Path(image_path).stem}__gt_vs_pred.png")
    cv2.imwrite(out_path, out)
    return out_path, gt_label, pred_label

def save_topk_mosaic(image_path, boxes, scores, k=6, out_dir=VIS_DIR):
    if not boxes:
        return None
    os.makedirs(out_dir, exist_ok=True)
    img = cv2.imread(image_path)
    ord_idx = np.argsort(-np.array(scores))[:k]
    tiles = []
    for i in ord_idx:
        x1,y1,x2,y2 = [int(v) for v in boxes[i]]
        crop = img[max(0,y1):max(0,y2), max(0,x1):max(0,x2)].copy()
        if crop.size == 0:
            continue
        # add a mini banner on each crop
        _put_text(crop, f"{scores[i]:.2f}", (5, 20), scale=0.6, bg=(0,0,0))
        tiles.append(cv2.resize(crop, (128,128)))
    if not tiles:
        return None
    # make a simple grid
    rows = math.ceil(len(tiles)/3)
    while len(tiles) < rows*3:
        tiles.append(np.zeros_like(tiles[0]))
    grid = []
    for r in range(rows):
        row = cv2.hconcat(tiles[r*3:(r+1)*3])
        grid.append(row)
    mosaic = cv2.vconcat(grid)
    out_path = os.path.join(out_dir, f"{Path(image_path).stem}__topK.png")
    cv2.imwrite(out_path, mosaic)
    return out_path

# -------------------------
# High-level helpers
# -------------------------
def predict_and_visualize(
    image_path,
    model_size=128,
    extractor_name='hog',
    step=None,
    pyramid_scale=1.25,
    prob_threshold=0.80,
    nms_iou=0.25
):
    det = detect_image(
        image_path=image_path,
        model_size=model_size,
        extractor_name=extractor_name,
        step=step,
        pyramid_scale=pyramid_scale,
        prob_threshold=prob_threshold,
        nms_iou=nms_iou
    )
    banner_path, gt_label, pred_label = save_gt_vs_pred_banner(image_path, det, out_dir=VIS_DIR)
    topk_path = save_topk_mosaic(image_path, det['boxes'], det['scores'], k=6, out_dir=VIS_DIR)
    return {
        "image": image_path,
        "gt": gt_label,
        "pred": pred_label,
        "num_windows": len(det['boxes']),
        "overlay": det['overlay_png'],
        "banner": banner_path,
        "heatmap": det['heatmap_png'],
        "topk": topk_path
    }

def run_on_path(
    path,                          # file or directory
    model_size=128,
    extractor_name='hog',
    prob_threshold=0.80,
    nms_iou=0.25,
    pyramid_scale=1.25,
    step=None,
    exts=('.png','.jpg','.jpeg','.tif','.bmp')
):
    paths = []
    p = Path(path)
    if p.is_dir():
        for ext in exts:
            paths.extend(glob.glob(str(p / f"*{ext}")))
            # also walk immediate subfolders (e.g., Parasitized/ *.png)
            for sub in p.iterdir():
                if sub.is_dir():
                    paths.extend(glob.glob(str(sub / f"*{ext}")))
    else:
        if p.suffix.lower() in exts and p.exists():
            paths = [str(p)]
    if not paths:
        raise FileNotFoundError(f"No images found under: {path}")

    rows = []
    for i, img_path in enumerate(sorted(paths)):
        try:
            res = predict_and_visualize(
                image_path=img_path,
                model_size=model_size,
                extractor_name=extractor_name,
                step=step,
                pyramid_scale=pyramid_scale,
                prob_threshold=prob_threshold,
                nms_iou=nms_iou
            )
            rows.append(res)
            print(f"[{i+1}/{len(paths)}] {Path(img_path).name}: GT={res['gt']} Pred={res['pred']} windows={res['num_windows']}")
        except Exception as e:
            print(f"[ERROR] {img_path}: {e}")

    # CSV summary
    import pandas as pd
    df = pd.DataFrame(rows)
    csv_path = os.path.join(VIS_DIR, "report.csv")
    df.to_csv(csv_path, index=False)
    print(f"\nSaved {len(rows)} results. Report: {csv_path}")
    return df


In [21]:
MODEL_SIZE = 128
EXTRACTOR  = 'hog'

some_image = r"C:\Users\shera\Desktop\fyp\fyp\dataset\malaria\images\fed6ba05-36a5-45dc-a4e1-9baa7de2c622.png"


def predict_and_visualize_safe(img_or_list, **kwargs):
    if isinstance(img_or_list, (list, tuple)):
        if not img_or_list:
            raise FileNotFoundError("Empty list of images.")
        img_path = img_or_list[0]
    else:
        img_path = img_or_list
    return predict_and_visualize(image_path=img_path, **kwargs)

# usage:
res = predict_and_visualize_safe(
    some_image,  # works whether this is a str path or a list from glob
    model_size=MODEL_SIZE,
    extractor_name=EXTRACTOR,
    step=32,
    pyramid_scale=1.25,
    prob_threshold=0.80,
    nms_iou=0.25
)
