In [None]:
import os
import pandas as pd
from skimage.feature import hog
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import joblib
from collections import Counter

import glob

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline

# Make the parent of the current working directory importable so the
# project-local `malaria_dataset` module can be resolved from this notebook.
# Per the original author's note, MalariaDataset maps bounding boxes onto an
# image and detection_collate skips null values -- NOTE(review): that behaviour
# lives in malaria_dataset.py and cannot be confirmed from this cell.
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)
from malaria_dataset import MalariaDataset, detection_collate

In [4]:
# Paths
# Dataset root, relative to the notebook's working directory. Every other
# dataset location below is derived from it.
root_path = os.path.join('..', 'dataset', 'malaria')

# JSON files and image folder that live directly under the root.
train_json_path = os.path.join(root_path, 'training.json')
test_json_path = os.path.join(root_path, 'test.json')
image_path = os.path.join(root_path, 'images')

# training_ds / testing_ds sub-directories.
train_base_path = os.path.join(root_path, 'training_ds')
test_base_path = os.path.join(root_path, 'testing_ds')

# Directory holding the extracted-feature .pkl files; created if it is missing.
FEATURES_DIR = os.path.join(root_path, 'extracted_features2')
os.makedirs(FEATURES_DIR, exist_ok=True)

# Presumably the square image side lengths (pixels) to experiment with -- TODO confirm.
image_sizes = [128]

# Echo the key locations so a wrong working directory is obvious at a glance.
for _label, _path in (
    ("Root Path:", root_path),
    ("Train Base Path:", train_base_path),
    ("Test Base Path:", test_base_path),
):
    print(_label, _path)

Root Path: ..\dataset\malaria
Train Base Path: ..\dataset\malaria\training_ds
Test Base Path: ..\dataset\malaria\testing_ds


### Model Training

In [None]:
# Fitted pipelines are cached here so later runs can reload instead of retrain.
MODELS_DIR = os.path.join('.', 'trained_models2')
os.makedirs(MODELS_DIR, exist_ok=True)

# Accumulators filled by the training loop below:
#   experiment_results -> one summary row per feature file
#   detailed_results   -> one row per classification-report entry per feature file
experiment_results, detailed_results = [], []

# Every .pkl found in FEATURES_DIR is treated as one experiment.
feature_files = glob.glob(os.path.join(FEATURES_DIR, "*.pkl"))

# Nothing to train on: tell the user how to produce the inputs. The loop below
# simply does not execute in that case.
if len(feature_files) == 0:
    print("ERROR: No feature files found!")
    print(f"Please run the Feature Extraction cell first to create .pkl files in: {FEATURES_DIR}")



#  Main Training Loop

# Classes with fewer training samples than this are oversampled by SMOTE up to
# this count; classes that already have at least this many are left unchanged.
SMOTE_FLOOR_SAMPLES = 1000


def build_sampling_strategy(class_counts, floor_samples=SMOTE_FLOOR_SAMPLES):
    """Build a SMOTE ``sampling_strategy`` dict that lifts small classes to a floor.

    Args:
        class_counts: Mapping of class label -> number of training samples
            (e.g. a ``collections.Counter`` over ``y_train``).
        floor_samples: Minimum number of samples every class should have after
            resampling.

    Returns:
        dict mapping each class label to ``max(original_count, floor_samples)``,
        so no class is ever asked to shrink.
    """
    return {
        class_index: max(num_samples, floor_samples)
        for class_index, num_samples in class_counts.items()
    }


def load_or_train_pipeline(model_path, X_train, y_train):
    """Return the fitted pipeline for one experiment, training it only if needed.

    If ``model_path`` exists, the pipeline is loaded from it and the training
    data is not used. Otherwise a StandardScaler -> SMOTE -> RandomForest
    pipeline is fitted on ``(X_train, y_train)`` and saved to ``model_path``.

    Args:
        model_path: File path of the cached model.
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The loaded or freshly fitted pipeline.
    """
    if os.path.exists(model_path):
        print(f"Loading pre-trained model from: {model_path}")
        # NOTE: joblib.load unpickles arbitrary objects. Only load model files
        # produced by this notebook, never files from an untrusted source.
        return joblib.load(model_path)

    print("Pre-trained model not found. Training a new model...")

    target_stats = Counter(y_train)
    print("Original training set class distribution:", target_stats)

    sampling_strategy_dict = build_sampling_strategy(target_stats)
    print("New SMOTE sampling strategy:", sampling_strategy_dict)

    # imblearn's Pipeline is used so that SMOTE resampling is applied during
    # fit only and never to the data passed to predict().
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=42, sampling_strategy=sampling_strategy_dict)),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
    ])
    pipeline.fit(X_train, y_train)

    print(f"Saving trained model to: {model_path}")
    joblib.dump(pipeline, model_path)
    return pipeline


# Sorted so experiments run (and are logged) in a deterministic order; glob's
# ordering depends on the operating system / file system.
for file_path in sorted(feature_files):
    filename = os.path.basename(file_path)

    # File names are expected to look like "<size>_<extractor>_<rest>.pkl".
    # Skip anything else instead of aborting the whole run on a ValueError.
    name_parts = filename.split('_', 2)
    if len(name_parts) < 3:
        print(f"Skipping feature file with unexpected name: {filename}")
        continue
    size, extractor_name = name_parts[0], name_parts[1]

    print(f"\n{'='*25}")
    print("RUNNING EXPERIMENT")
    print(f"Image Size: {size}x{size} | Feature Extractor: {extractor_name}")
    print(f"{'='*25}")

    # NOTE: joblib.load unpickles arbitrary objects. Feature files must come
    # from this project's own feature-extraction step.
    data = joblib.load(file_path)
    X_train, y_train = data['X_train'], data['y_train']
    X_test, y_test = data['X_test'], data['y_test']
    le = data['label_encoder']

    model_path = os.path.join(MODELS_DIR, f"{size}_{extractor_name}_rf_model.pkl")
    pipeline = load_or_train_pipeline(model_path, X_train, y_train)

    # --- 3. Evaluation ---
    print("\n--- Evaluation Results ---")
    predictions = pipeline.predict(X_test)

    # labels: pin the report to every class known to the encoder so that
    #   target_names always lines up, even when a class is absent from both
    #   y_test and the predictions. Assumes y_* were encoded with `le` -- the
    #   original already relied on this by passing le.classes_ as target_names.
    # zero_division=0: classes that are never predicted get precision 0.0
    #   (the same value as the default) without emitting a warning per metric.
    report_kwargs = {
        'labels': le.transform(le.classes_),
        'target_names': le.classes_,
        'zero_division': 0,
    }
    # The output_dict=True is the key to capturing the results
    report = classification_report(y_test, predictions, output_dict=True, **report_kwargs)
    print(classification_report(y_test, predictions, **report_kwargs))

    # --- 4. Store Results for Final Summary ---
    experiment_results.append({
        'image_size': size,
        'feature_extractor': extractor_name,
        # Computed directly: the report's 'accuracy' key is not guaranteed to
        # be present on every scikit-learn version once `labels` is supplied.
        'accuracy': accuracy_score(y_test, predictions),
        'f1_score_weighted': report['weighted avg']['f1-score']
    })

    # Log every dict-valued report entry: the per-class rows plus the
    # averaged rows (e.g. 'macro avg', 'weighted avg'). The scalar 'accuracy'
    # entry is not a dict and is therefore skipped.
    detailed_results.extend(
        {
            'image_size': size,
            'feature_extractor': extractor_name,
            'class': class_name,
            'precision': metrics.get('precision'),
            'recall': metrics.get('recall'),
            'f1-score': metrics.get('f1-score'),
            'support': metrics.get('support')
        }
        for class_name, metrics in report.items()
        if isinstance(metrics, dict)
    )

# --- Experiment summary: one row per experiment, best weighted F1 first ---
if len(experiment_results) > 0:
    print(f"\n{'='*30}")
    print("FINAL EXPERIMENT SUMMARY")
    print(f"{'='*30}")

    results_df = pd.DataFrame(experiment_results).sort_values(
        by='f1_score_weighted', ascending=False
    )
    display(results_df)

    # Persist the ranking next to the notebook.
    summary_path = 'model_experiment_summary.csv'
    results_df.to_csv(summary_path, index=False)
    print(f"\nSummary saved to {summary_path}")

# --- Detailed report: one row per classification-report entry per experiment ---
if len(detailed_results) > 0:
    print(f"\n{'='*30}")
    print("DETAILED CLASSIFICATION REPORT")
    print(f"{'='*30}")

    detailed_df = pd.DataFrame(detailed_results)

    # Write the CSV before displaying, so the file exists even if rendering fails.
    detailed_summary_path = 'detailed_classification_report.csv'
    detailed_df.to_csv(detailed_summary_path, index=False)
    print(f"Detailed report saved to {detailed_summary_path}")

    # Show the full table for review.
    display(detailed_df)


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HIST
Loading pre-trained model from: .\trained_models2\128_HIST_rf_model.pkl

--- Evaluation Results ---
                precision    recall  f1-score   support

     difficult       0.17      0.06      0.09        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       1.00      0.81      0.89        21
red_blood_cell       0.96      1.00      0.98      6869
          ring       1.00      0.01      0.01       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.77      0.32      0.45       168

      accuracy                           0.95      7273
     macro avg       0.56      0.31      0.35      7273
  weighted avg       0.95      0.95      0.94      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: HOG


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Pre-trained model not found. Training a new model...
Original training set class distribution: Counter({np.int64(3): 76165, np.int64(6): 1416, np.int64(0): 430, np.int64(4): 349, np.int64(5): 178, np.int64(1): 142, np.int64(2): 82})
New SMOTE sampling strategy: {np.int64(0): 1000, np.int64(1): 1000, np.int64(2): 1000, np.int64(3): 76165, np.int64(4): 1000, np.int64(5): 1000, np.int64(6): 1416}
Saving trained model to: .\trained_models2\128_HOG_rf_model.pkl

--- Evaluation Results ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                precision    recall  f1-score   support

     difficult       0.00      0.00      0.00        16
    gametocyte       0.00      0.00      0.00        14
     leukocyte       1.00      0.19      0.32        21
red_blood_cell       0.94      1.00      0.97      6869
          ring       0.00      0.00      0.00       173
      schizont       0.00      0.00      0.00        12
   trophozoite       0.00      0.00      0.00       168

      accuracy                           0.95      7273
     macro avg       0.28      0.17      0.18      7273
  weighted avg       0.90      0.95      0.92      7273


RUNNING EXPERIMENT
Image Size: 128x128 | Feature Extractor: LBP
Pre-trained model not found. Training a new model...
Original training set class distribution: Counter({np.int64(3): 76165, np.int64(6): 1416, np.int64(0): 430, np.int64(4): 349, np.int64(5): 178, np.int64(1): 142, np.int64(2): 82})
New SMOTE sampling strategy: {np.int64(0): 1000, np.int64(1): 1000, np.int64(2): 10

Unnamed: 0,image_size,feature_extractor,accuracy,f1_score_weighted
0,128,HIST,0.953664,0.936408
2,128,LBP,0.907603,0.919813
1,128,HOG,0.945002,0.918655



Summary saved to model_experiment_summary.csv

DETAILED CLASSIFICATION REPORT
Detailed report saved to detailed_classification_report.csv


Unnamed: 0,image_size,feature_extractor,class,precision,recall,f1-score,support
0,128,HIST,difficult,0.166667,0.0625,0.090909,16.0
1,128,HIST,gametocyte,0.0,0.0,0.0,14.0
2,128,HIST,leukocyte,1.0,0.809524,0.894737,21.0
3,128,HIST,red_blood_cell,0.956116,0.999127,0.977148,6869.0
4,128,HIST,ring,1.0,0.00578,0.011494,173.0
5,128,HIST,schizont,0.0,0.0,0.0,12.0
6,128,HIST,trophozoite,0.771429,0.321429,0.453782,168.0
7,128,HIST,macro avg,0.556316,0.314051,0.346867,7273.0
8,128,HIST,weighted avg,0.947866,0.953664,0.936408,7273.0
9,128,HOG,difficult,0.0,0.0,0.0,16.0
