# Phase 2

## Data Loading

In [22]:
import os
import json
import numpy as np
import random
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.utils.data.dataloader import default_collate
from torchvision import transforms
from torchvision.models import resnet18
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import functional as F
from sklearn.metrics import accuracy_score

In [6]:
root_path = r"dataset\malaria"

In [None]:

def count_images_per_class(base_folder):
    """
    Counts the number of images in each class folder within the base directory.
    Returns a dictionary with class names and their image counts.
    """
    if not os.path.isdir(base_folder):
        print(f"Error: Directory not found at '{base_folder}'")
        return None

    class_counts = {}
    print(f"Scanning for images in '{base_folder}'...")

    # os.walk will traverse the directory tree
    for root, _, files in os.walk(base_folder):
        class_name = os.path.basename(root)

        # Skip the top-level directory itself
        if class_name == os.path.basename(base_folder):
            continue

        # Count images with supported extensions
        image_count = sum(1 for filename in files
                         if filename.lower().endswith(('.png', '.jpg', '.jpeg')))

        if image_count > 0:  # Only add classes with images
            class_counts[class_name] = image_count

    return class_counts

if __name__ == '__main__':
    # Define the path to your folder of images
    try:
        DATA_DIR = os.path.join(root_path, 'resized_images_by_classes')

    except NameError:
        DATA_DIR = 'resized_images_by_classeseseseseses'  # Fallback if root_path isn't defined

    # Get the counts of images per class
    class_counts = count_images_per_class(DATA_DIR)

    if class_counts:
        print("\nImage counts per class:")
        total_images = 0
        for class_name, count in sorted(class_counts.items()):
            print(f"{class_name}: {count} images")
            total_images += count
        print(f"\nTotal number of images: {total_images}")
        print(f"Total number of classes: {len(class_counts)}")

Scanning for images in 'dataset\malaria\resized_images_by_classes'...

Image counts per class:
difficult: 441 images
gametocyte: 144 images
leukocyte: 103 images
red_blood_cell: 77418 images
ring: 353 images
schizont: 179 images
trophozoite: 1473 images

Total number of images: 80111
Total number of classes: 7


## Data sampler

In [11]:
def get_image_paths_and_labels(base_folder):
    """
    Walks through subdirectories and collects image paths and their corresponding
    class labels derived from the folder names.
    """
    if not os.path.isdir(base_folder):
        print(f"Error: Directory not found at '{base_folder}'")
        return None

    image_data = []
    print(f"Scanning for images in '{base_folder}'...")

    # os.walk will traverse the directory tree
    for root, _, files in os.walk(base_folder):
        class_name = os.path.basename(root)

        # Skip the top-level directory itself, only process class subfolders
        if class_name == os.path.basename(base_folder):
            continue

        for filename in files:
            if filename.lower().endswith(('.png', '.jpg', 'jpeg')):
                image_path = os.path.join(root, filename)
                image_data.append((image_path, class_name))

    return image_data


In [13]:
class CustomCellDataset(Dataset):
    def __init__(self, image_data, transform=None):
        """
        Args:
            image_data (list): A list of tuples (image_path, class_name).
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.image_data = image_data
        self.transform = transform

        # Create a mapping from class name (string) to class index (integer)
        self.classes = sorted(list(set(item[1] for item in image_data)))
        self.class_to_idx = {class_name: i for i, class_name in enumerate(self.classes)}
        self.idx_to_class = {i: class_name for class_name, i in self.class_to_idx.items()}

    def __len__(self):
        # This returns the total number of images in the dataset.
        return len(self.image_data)

    def __getitem__(self, idx):
        # This method loads and returns a single sample from the dataset at the given index.

        # Get the path and string label for the given index
        image_path, class_name = self.image_data[idx]

        # Load the image using Pillow
        image = Image.open(image_path).convert("RGB")

        # Get the integer label from the class name using our mapping
        label = self.class_to_idx[class_name]

        # Apply transforms if they exist
        if self.transform:
            image = self.transform(image)

        return image, label

# --- Step 3: Define Your Transformations ---
# This is where you define the on-the-fly augmentation and normalization.
# IMPORTANT: Since your images are already resized, we DO NOT need transforms.Resize() here.

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
    transforms.ToTensor(), # Converts the image to a PyTorch Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalizes the tensor
])

# --- Step 4: Put It All Together and Create Your Dataset Object ---
if __name__ == '__main__':
    # Define the path to your folder of RESIZED images.
    # Assuming 'root_path' is defined, e.g., root_path = '/content/drive/MyDrive/fyp/malaria_dataset'
    try:
        RESIZED_DATA_DIR = os.path.join(root_path, 'resized_images_by_classes')
    except NameError:
        RESIZED_DATA_DIR = 'resized_images_by_classes' # Fallback if root_path isn't defined

    # 1. Get the list of all image paths and labels
    all_images = get_image_paths_and_labels(RESIZED_DATA_DIR)

    if all_images:
        # 2. Create an instance of your custom dataset
        # This is the dataset object you will use for everything that follows.
        my_dataset = CustomCellDataset(
            image_data=all_images,
            transform=train_transform
        )

        print(f"\nSuccessfully created a dataset object!")
        print(f"Total number of images found and loaded: {len(my_dataset)}")
        print(f"Number of classes: {len(my_dataset.classes)}")
        print(f"Classes found: {my_dataset.classes}")

        # --- (Optional) How to Verify It Worked ---
        print("\n--- Verifying a single sample from the dataset ---")

        # Let's get the first image and its label from the dataset
        image_tensor, label_index = my_dataset[0]

        print(f"Type of the image data: {type(image_tensor)}")
        print(f"Shape of the image tensor: {image_tensor.shape}")
        print(f"Label index: {label_index}")

        # You can see the shape is [3, 224, 224] (Channels, Height, Width), which confirms
        # the ToTensor() and resizing (from your previous step) worked correctly.

Scanning for images in 'dataset\malaria\resized_images_by_classes'...

Successfully created a dataset object!
Total number of images found and loaded: 80111
Number of classes: 7
Classes found: ['difficult', 'gametocyte', 'leukocyte', 'red_blood_cell', 'ring', 'schizont', 'trophozoite']

--- Verifying a single sample from the dataset ---
Type of the image data: <class 'torch.Tensor'>
Shape of the image tensor: torch.Size([3, 128, 128])
Label index: 0


In [23]:
# Get the class counts, sorted by class name to ensure consistent order
counts = train_class_counts.sort_index()
print("--- Class Counts for Weight Calculation ---")
print(counts)

# Calculate a weight for each class (less frequent class = higher weight)
class_weights = 1. / torch.tensor(counts.values, dtype=torch.float)
print("\n--- Calculated Weight per Class ---")
print(class_weights)

# Create a list containing the weight for EVERY sample in the dataset
# This is the crucial step that assigns the correct weight to each of your 80,000 images
sample_weights = [
    class_weights[my_dataset.class_to_idx[label]]
    for _, label in my_dataset.image_data
]

# --- Step 2: Create the Sampler ---
# The sampler will use these weights to perform balanced sampling
sampler = WeightedRandomSampler(
    weights=torch.DoubleTensor(sample_weights),
    num_samples=len(my_dataset),
    replacement=True
)
print("\nWeightedRandomSampler created successfully.")

# --- Step 3: Create the Final Training DataLoader ---
BATCH_SIZE = 64  # You can tune this based on your GPU memory
NUM_WORKERS = 4  # Use multiple workers to load data in parallel

# IMPORTANT: When using a 'sampler', you must NOT use 'shuffle=True'.
# The sampler handles the randomization in a balanced way.
train_loader = DataLoader(
    dataset=my_dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    num_workers=NUM_WORKERS
)

print("\n✅ Final Training DataLoader is ready!")

--- Class Counts for Weight Calculation ---
category
difficult           441
gametocyte          144
leukocyte           103
red blood cell    77420
ring                353
schizont            179
trophozoite        1473
Name: count, dtype: int64

--- Calculated Weight per Class ---
tensor([2.2676e-03, 6.9444e-03, 9.7087e-03, 1.2917e-05, 2.8329e-03, 5.5866e-03,
        6.7889e-04])

WeightedRandomSampler created successfully.

✅ Final Training DataLoader is ready!


In [None]:
# Get one batch of data from new train_loader
images, labels = next(iter(train_loader))

# Print the shape of the batch
print(f"\n--- Verifying one batch from train_loader ---")
print(f"Shape of the images tensor batch: {images.shape}") # Should be [64, 3, 224, 224]
print(f"Shape of the labels tensor batch: {labels.shape}")   # Should be [64]

# Count the occurrences of each class IN THIS BATCH
# This will show the effect of the sampler!
print(f"\nClass distribution within this single batch:")
print(torch.bincount(labels))

## Data training

In [15]:
# Paths to your JSON files and image directory
train_json_path = os.path.join(root_path, 'training.json')
test_json_path = os.path.join(root_path, 'test.json')

image_root_dir = root_path
image_path = os.path.join(image_root_dir, 'images')

In [17]:

# --- Step 1: Helper Function to Get All Image Paths ---
# This function scans your directory and creates a list of (image_path, class_name) tuples.

def get_image_paths_and_labels(base_folder):
    """
    Walks through subdirectories and collects image paths and their corresponding
    class labels derived from the folder names.
    """
    if not os.path.isdir(base_folder):
        print(f"Error: Directory not found at '{base_folder}'")
        return None

    image_data = []
    print(f"Scanning for images in '{base_folder}'...")

    # os.walk will traverse the directory tree
    for root, _, files in os.walk(base_folder):
        class_name = os.path.basename(root)

        # Skip the top-level directory itself, only process class subfolders
        if class_name == os.path.basename(base_folder):
            continue

        for filename in files:
            if filename.lower().endswith(('.png', '.jpg', 'jpeg')):
                image_path = os.path.join(root, filename)
                image_data.append((image_path, class_name))

    return image_data

# --- Step 2: The Custom PyTorch Dataset Class ---
# This class takes the list from Step 1 and knows how to load and transform an image
# when the DataLoader asks for it.

class CustomCellDataset(Dataset):
    def __init__(self, image_data, transform=None):
        """
        Args:
            image_data (list): A list of tuples (image_path, class_name).
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.image_data = image_data
        self.transform = transform

        # Create a mapping from class name (string) to class index (integer)
        self.classes = sorted(list(set(item[1] for item in image_data)))
        self.class_to_idx = {class_name: i for i, class_name in enumerate(self.classes)}
        self.idx_to_class = {i: class_name for class_name, i in self.class_to_idx.items()}

    def __len__(self):
        # This returns the total number of images in the dataset.
        return len(self.image_data)

    def __getitem__(self, idx):
        # This method loads and returns a single sample from the dataset at the given index.

        # Get the path and string label for the given index
        image_path, class_name = self.image_data[idx]

        # Load the image using Pillow
        image = Image.open(image_path).convert("RGB")

        # Get the integer label from the class name using our mapping
        label = self.class_to_idx[class_name]

        # Apply transforms if they exist
        if self.transform:
            image = self.transform(image)

        return image, label

# --- Step 3: Define Your Transformations ---
# This is where you define the on-the-fly augmentation and normalization.
# IMPORTANT: Since your images are already resized, we DO NOT need transforms.Resize() here.

train_transform = transforms.Compose([
    # No transforms.Resize() needed!
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
    transforms.ToTensor(), # Converts the image to a PyTorch Tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalizes the tensor
])

# --- Step 4: Put It All Together and Create Your Dataset Object ---
if __name__ == '__main__':
    # Define the path to your folder of RESIZED images.
    # Assuming 'root_path' is defined, e.g., root_path = '/content/drive/MyDrive/fyp/malaria_dataset'
    try:
        RESIZED_DATA_DIR = os.path.join(root_path, 'resized_images_by_classes')
    except NameError:
        RESIZED_DATA_DIR = 'resized_images_by_classes' # Fallback if root_path isn't defined

    # 1. Get the list of all image paths and labels
    all_images = get_image_paths_and_labels(RESIZED_DATA_DIR)

    if all_images:
        # 2. Create an instance of your custom dataset
        # This is the dataset object you will use for everything that follows.
        my_dataset = CustomCellDataset(
            image_data=all_images,
            transform=train_transform
        )

        print(f"\nSuccessfully created a dataset object!")
        print(f"Total number of images found and loaded: {len(my_dataset)}")
        print(f"Number of classes: {len(my_dataset.classes)}")
        print(f"Classes found: {my_dataset.classes}")

        # --- (Optional) How to Verify It Worked ---
        print("\n--- Verifying a single sample from the dataset ---")
        # Let's get the first image and its label from the dataset
        image_tensor, label_index = my_dataset[0]

        print(f"Type of the image data: {type(image_tensor)}")
        print(f"Shape of the image tensor: {image_tensor.shape}")
        print(f"Label index: {label_index}")

        # You can see the shape is [3, 224, 224] (Channels, Height, Width), which confirms
        # the ToTensor() and resizing (from your previous step) worked correctly.

Scanning for images in 'dataset\malaria\resized_images_by_classes'...

Successfully created a dataset object!
Total number of images found and loaded: 80111
Number of classes: 7
Classes found: ['difficult', 'gametocyte', 'leukocyte', 'red_blood_cell', 'ring', 'schizont', 'trophozoite']

--- Verifying a single sample from the dataset ---
Type of the image data: <class 'torch.Tensor'>
Shape of the image tensor: torch.Size([3, 128, 128])
Label index: 0


In [20]:
# Loading training data metadata
with open(train_json_path, 'r') as f:
    training_data = json.load(f)

# Extract all object categories into a list
all_categories = []
for item in training_data:
    for obj in item['objects']:
        all_categories.append(obj['category'])

# Create a pandas DataFrame for easy counting and plotting
df = pd.DataFrame(all_categories, columns=['category'])

train_class_counts = df['category'].value_counts()
print(train_class_counts)

category
red blood cell    77420
trophozoite        1473
difficult           441
ring                353
schizont            179
gametocyte          144
leukocyte           103
Name: count, dtype: int64


In [None]:

# --- Step 1: Calculate Weights for the Sampler ---

# Get the class counts, sorted by class name to ensure consistent order
counts = train_class_counts.sort_index()
print("--- Class Counts for Weight Calculation ---")
print(counts)

# Calculate a weight for each class (less frequent class = higher weight)
class_weights = 1. / torch.tensor(counts.values, dtype=torch.float)
print("\n--- Calculated Weight per Class ---")
print(class_weights)

# Create a list containing the weight for EVERY sample in the dataset
# This is the crucial step that assigns the correct weight to each of your 80,000 images
sample_weights = [
    class_weights[my_dataset.class_to_idx[label]]
    for _, label in my_dataset.image_data
]

# --- Step 2: Create the Sampler ---
# The sampler will use these weights to perform balanced sampling
sampler = WeightedRandomSampler(
    weights=torch.DoubleTensor(sample_weights),
    num_samples=len(my_dataset),
    replacement=True
)
print("\nWeightedRandomSampler created successfully.")

# --- Step 3: Create the Final Training DataLoader ---
BATCH_SIZE = 64  # You can tune this based on your GPU memory
NUM_WORKERS = 4  # Use multiple workers to load data in parallel

# IMPORTANT: When using a 'sampler', you must NOT use 'shuffle=True'.
# The sampler handles the randomization in a balanced way.
train_loader = DataLoader(
    dataset=my_dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    num_workers=NUM_WORKERS
)

print("\n✅ Final Training DataLoader is ready!")

--- Class Counts for Weight Calculation ---
category
difficult           441
gametocyte          144
leukocyte           103
red blood cell    77420
ring                353
schizont            179
trophozoite        1473
Name: count, dtype: int64

--- Calculated Weight per Class ---
tensor([2.2676e-03, 6.9444e-03, 9.7087e-03, 1.2917e-05, 2.8329e-03, 5.5866e-03,
        6.7889e-04])


NameError: name 'WeightedRandomSampler' is not defined

In [None]:
# Get one batch of data from your new train_loader
images, labels = next(iter(train_loader))

# Print the shape of the batch
print(f"\n--- Verifying one batch from train_loader ---")
print(f"Shape of the images tensor batch: {images.shape}") # Should be [64, 3, 224, 224]
print(f"Shape of the labels tensor batch: {labels.shape}")   # Should be [64]

# Count the occurrences of each class IN THIS BATCH
# This will show the effect of the sampler!
print(f"\nClass distribution within this single batch:")
print(torch.bincount(labels))


--- Verifying one batch from train_loader ---
Shape of the images tensor batch: torch.Size([64, 3, 224, 224])
Shape of the labels tensor batch: torch.Size([64])

Class distribution within this single batch:
tensor([ 4,  6,  7,  0,  1, 46])


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import numpy as np

# --- 1. SETUP FOR FEATURE EXTRACTION ---

# Use a DataLoader to process images in batches.
# IMPORTANT: shuffle must be False to keep features and labels aligned.
feature_loader = DataLoader(
    dataset=my_dataset,
    batch_size=64,
    shuffle=False, # Must be False!
    num_workers=2
)

# Load a pre-trained ResNet-50 model
model = models.resnet50(pretrained=True)

# Modify the model by removing its final classification layer.
# This turns it into a feature extractor.
feature_extractor = nn.Sequential(*list(model.children())[:-1])

# Set up device (GPU or CPU) and put the model in evaluation mode
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
feature_extractor.to(device)
feature_extractor.eval()


# --- 2. EXTRACT FEATURES FROM ALL IMAGES ---

print(f"--- Starting feature extraction on {len(my_dataset)} images ---")
all_features = []
all_labels = []

# Loop through all data in the DataLoader
with torch.no_grad(): # Deactivates gradient calculation for speed
    for images, labels in tqdm(feature_loader):
        images = images.to(device)

        # Get the features from the model
        features = feature_extractor(images)

        # Flatten the features to a 1D vector per image
        features = features.view(features.size(0), -1)

        # Store the features and labels
        all_features.append(features.cpu())
        all_labels.append(labels.cpu())

# Combine all batches into single tensors, then convert to NumPy arrays
X = torch.cat(all_features).numpy()
y = torch.cat(all_labels).numpy()

print("\n--- Feature Extraction Complete ---")
print(f"Feature matrix shape (X): {X.shape}")
print(f"Labels vector shape (y): {y.shape}")


# --- 3. TRAIN AND EVALUATE TRADITIONAL ML MODELS ---

# Split the extracted features into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

print(f"\n--- Data split into {len(X_train)} training and {len(X_test)} testing samples ---")


# --- A. Support Vector Machine (SVM) ---
print("\n--- Training Support Vector Machine ---")
# Using probability=True can be useful but is slower
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

print("--- Evaluating SVM ---")
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm, target_names=my_dataset.classes))


# --- B. Random Forest ---
print("\n--- Training Random Forest ---")
# n_jobs=-1 uses all available CPU cores to speed up training
rf_model = RandomForestClassifier(n_estimators=150, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

print("--- Evaluating Random Forest ---")
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=my_dataset.classes))

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:03<00:00, 28.3MB/s]


--- Starting feature extraction on 2693 images ---


100%|██████████| 43/43 [19:37<00:00, 27.38s/it]



--- Feature Extraction Complete ---
Feature matrix shape (X): (2693, 2048)
Labels vector shape (y): (2693,)

--- Data split into 2019 training and 674 testing samples ---

--- Training Support Vector Machine ---
--- Evaluating SVM ---
SVM Accuracy: 0.6751

SVM Classification Report:
              precision    recall  f1-score   support

   difficult       0.38      0.45      0.41       110
  gametocyte       0.41      0.36      0.38        36
   leukocyte       1.00      0.88      0.94        26
        ring       0.78      0.82      0.80        88
    schizont       0.42      0.33      0.37        45
 trophozoite       0.78      0.77      0.77       369

    accuracy                           0.68       674
   macro avg       0.63      0.60      0.61       674
weighted avg       0.68      0.68      0.68       674


--- Training Random Forest ---
--- Evaluating Random Forest ---
Random Forest Accuracy: 0.6395

Random Forest Classification Report:
              precision    recall  f1-

## SVM

## Random Forest

## XG Boost

## Cross Validation & Evaluation

Use k-fold CV (e.g., k=5)


Compute:


Accuracy


Precision


Recall


F1-score


ROC-AUC


Generate confusion matrix and ROC curves.
