In [8]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import hog
from skimage import feature
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import joblib
import json
import random
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

import glob

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize

from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, roc_curve, auc
from sklearn.model_selection import cross_val_score, StratifiedKFold
from torchvision.transforms import ToPILImage

In [9]:
# --- Enhancements: imports for fast SVM / multi-label ---
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.utils.multiclass import type_of_target
from scipy.sparse import issparse
import numpy as np, time, os, json, joblib

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.sparse import issparse
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import os, glob, joblib, pandas as pd


In [10]:
# Dataset layout: every input lives under ../dataset/malaria, with one
# sub-folder per split.
root_path = os.path.join('..', 'dataset', 'malaria')
train_base_path, test_base_path = (
    os.path.join(root_path, split_dir) for split_dir in ('training_ds', 'testing_ds')
)

# Fitted pipelines are stored next to the notebook so later runs can reuse them.
MODELS_DIR = os.path.join('.', 'trained_models')
os.makedirs(MODELS_DIR, exist_ok=True)

# Pre-computed feature pickles are stored inside the dataset folder.
FEATURES_DIR = os.path.join(root_path, 'extracted_features')
os.makedirs(FEATURES_DIR, exist_ok=True)

# Square image edge lengths (pixels) covered by the experiments.
image_sizes = [128]

# Echo the resolved locations so a wrong working directory is obvious early.
print(
    *(
        f"{caption} {location}"
        for caption, location in (
            ("Root Path:", root_path),
            ("Train Base Path:", train_base_path),
            ("Test Base Path:", test_base_path),
        )
    ),
    sep="\n",
)

Root Path: ..\dataset\malaria
Train Base Path: ..\dataset\malaria\training_ds
Test Base Path: ..\dataset\malaria\testing_ds


### Model Training

In [11]:
# Accumulators filled by the training loop:
#   experiment_results -> one summary row per feature file
#   detailed_results   -> one row per entry of each classification report
experiment_results = []
detailed_results = []

# glob returns matches in filesystem-dependent order; sorting keeps the
# experiment order (and the row order of the saved CSVs) reproducible.
feature_files = sorted(glob.glob(os.path.join(FEATURES_DIR, "*.pkl")))

if not feature_files:
    # Nothing to train on: the loop below is simply skipped.
    print("ERROR: No feature files found!")
    print(f"Please run the Feature Extraction cell first to create .pkl files in: {FEATURES_DIR}")

# --- Main Training Loop ---
# For every cached feature file: parse its name, load the features, fit (or
# reload) the scaler -> SMOTE -> RBF-SVM pipeline, evaluate it on the test
# split, and record both summary and per-row report metrics.
for file_path in feature_files:
    filename = os.path.basename(file_path)

    # File names are expected to look like "<size>_<extractor>_<rest>.pkl".
    # A stray .pkl that does not follow the convention is skipped instead of
    # aborting the whole run with an unpacking ValueError.
    name_parts = filename.split('_', 2)
    if len(name_parts) != 3:
        print(f"Skipping '{filename}': expected a name like '<size>_<extractor>_<suffix>.pkl'")
        continue
    # Slicing (rather than unpacking into `_`) avoids clobbering IPython's
    # last-output variable.
    size, extractor_name = name_parts[:2]

    print(f"\n{'='*25}")
    print("RUNNING EXPERIMENT")
    print(f"Image Size: {size}x{size} | Feature Extractor: {extractor_name}")
    print(f"{'='*25}")

    # --- 1. Load Pre-computed Feature Data ---
    # NOTE: joblib.load unpickles arbitrary objects; only load files produced
    # by this project's own feature-extraction step.
    data = joblib.load(file_path)
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']
    le = data['label_encoder']

    # --- 2. Build, Train, or Load the Pipeline ---
    model_path = os.path.join(MODELS_DIR, f"{size}_{extractor_name}_model.pkl")

    if os.path.exists(model_path):
        # NOTE(review): the cache key is only size + extractor, so a model
        # fitted with different hyperparameters or older features is reused
        # silently. Delete the file to force retraining.
        print(f"Loading pre-trained model from: {model_path}")
        pipeline = joblib.load(model_path)
    else:
        print("Pre-trained model not found. Training a new model...")
        # Centering is switched off for sparse input, which StandardScaler
        # cannot mean-center.
        scaler = StandardScaler(with_mean=not issparse(X_train))
        pipeline = ImbPipeline(steps=[
            ('scaler', scaler),
            ('smote', SMOTE(random_state=42)),
            ('svm', SVC(kernel='rbf', C=10, gamma=0.01, class_weight='balanced',
                        cache_size=1000, max_iter=10000, random_state=42))
        ])
        pipeline.fit(X_train, y_train)
        print(f"Saving trained model to: {model_path}")
        joblib.dump(pipeline, model_path)

    # --- 3. Evaluation ---
    print("\n--- Evaluation Results ---")
    predictions = pipeline.predict(X_test)
    # zero_division=0 keeps the reported values (0.0 for classes that are never
    # predicted) but silences the repeated UndefinedMetricWarning output.
    report_kwargs = {'target_names': le.classes_, 'zero_division': 0}
    # The dict form feeds the result tables; the text form is shown to the reader.
    report = classification_report(y_test, predictions, output_dict=True, **report_kwargs)
    print(classification_report(y_test, predictions, **report_kwargs))

    # --- 4. Store Results for Final Summary ---
    experiment_results.append({
        'image_size': size,
        'feature_extractor': extractor_name,
        'accuracy': report.get('accuracy'),
        'f1_score_weighted': report['weighted avg']['f1-score']
    })

    # --- Capture the detailed report rows ---
    # Every dict-valued entry is logged: the per-class rows plus the
    # 'macro avg' / 'weighted avg' rows. The scalar 'accuracy' entry is skipped.
    detailed_results.extend(
        {
            'image_size': size,
            'feature_extractor': extractor_name,
            'class': row_name,
            'precision': row_metrics.get('precision'),
            'recall': row_metrics.get('recall'),
            'f1-score': row_metrics.get('f1-score'),
            'support': row_metrics.get('support')
        }
        for row_name, row_metrics in report.items()
        if isinstance(row_metrics, dict)
    )


# --- 5. Display and Save Final Summary Table ---
# Skipped entirely when no experiment produced a result.
if experiment_results:
    print(f"\n{'='*30}", "FINAL EXPERIMENT SUMMARY", f"{'='*30}", sep="\n")

    # Best weighted F1 first, so the winning configuration tops the table.
    results_df = pd.DataFrame(experiment_results).sort_values(
        by='f1_score_weighted', ascending=False
    )
    display(results_df)

    # Persist the ranked table next to the notebook.
    summary_path = 'model_experiment_summary.csv'
    results_df.to_csv(summary_path, index=False)
    print(f"\nSummary saved to {summary_path}")

# --- Save and Display the Detailed Report ---
# Runs only when at least one report row was collected by the training loop.
if detailed_results:
    print(f"\n{'='*30}", "DETAILED CLASSIFICATION REPORT", f"{'='*30}", sep="\n")

    # Write the rows to CSV first, then show the same table inline for review.
    detailed_summary_path = 'detailed_svm_report.csv'
    detailed_df = pd.DataFrame(detailed_results)
    detailed_df.to_csv(detailed_summary_path, index=False)
    print(f"Detailed report saved to {detailed_summary_path}")

    display(detailed_df)


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HIST
Loading pre-trained model from: .\trained_models\128_HIST_model.pkl

--- Evaluation Results ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                precision    recall  f1-score   support

     difficult       0.40      0.12      0.19        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       1.00      0.38      0.55        21
red_blood_cell       0.95      1.00      0.98      6869
          ring       0.25      0.01      0.01       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.76      0.26      0.39       168

      accuracy                           0.95      7273
     macro avg       0.48      0.25      0.30      7273
  weighted avg       0.93      0.95      0.93      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HOG
Loading pre-trained model from: .\trained_models\128_HOG_model.pkl

--- Evaluation Results ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                precision    recall  f1-score   support

     difficult       0.00      0.00      0.00        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       0.00      0.00      0.00        21
red_blood_cell       0.94      1.00      0.97      6869
          ring       0.00      0.00      0.00       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.00      0.00      0.00       168

      accuracy                           0.94      7273
     macro avg       0.13      0.14      0.14      7273
  weighted avg       0.89      0.94      0.92      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: LBP
Loading pre-trained model from: .\trained_models\128_LBP_model.pkl

--- Evaluation Results ---
                precision    recall  f1-score   support

     difficult       0.01      0.38      0.02        16
    gametocyte       0.01      0.14      0.03        14
     leukocyte       0.15      0.67      0.25       

Unnamed: 0,image_size,feature_extractor,accuracy,f1_score_weighted
0,128,HIST,0.951052,0.932698
1,128,HOG,0.944452,0.917472
2,128,LBP,0.815757,0.872467



Summary saved to model_experiment_summary.csv

DETAILED CLASSIFICATION REPORT
Detailed report saved to detailed_svm_report.csv


Unnamed: 0,image_size,feature_extractor,class,precision,recall,f1-score,support
0,128,HIST,difficult,0.4,0.125,0.190476,16.0
1,128,HIST,gametocyte,0.0,0.0,0.0,14.0
2,128,HIST,leukocyte,1.0,0.380952,0.551724,21.0
3,128,HIST,red_blood_cell,0.95332,0.998981,0.975617,6869.0
4,128,HIST,ring,0.25,0.00578,0.011299,173.0
5,128,HIST,schizont,0.0,0.0,0.0,12.0
6,128,HIST,trophozoite,0.758621,0.261905,0.389381,168.0
7,128,HIST,macro avg,0.480277,0.253231,0.302642,7273.0
8,128,HIST,weighted avg,0.927603,0.951052,0.932698,7273.0
9,128,HOG,difficult,0.0,0.0,0.0,16.0
