Import Libraries

In [None]:
import numpy as np

from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import ADASYN

import albumentations as aug
import cv2

import os

import sys

Directory to Kaggle's Alzheimer MRI Preprocessed Dataset.

Link: https://www.kaggle.com/datasets/sachinkumar413/alzheimer-mri-dataset

Alzheimer's Disease Dementia Classes: Non Demented, Very Mild Demented, Mild Demented, Moderate Demented

In [None]:
# ENTER PARENT PATH - Example: C:\Users\{USERNAME}\ai-alzheimer-detection
parent_path = r"ENTER PARENT PATH"

# Every location is assembled from individual components with os.path.join so
# the separator matches the running OS. A hard-coded "\" inside a component
# only yields a valid path on Windows.
kaggle_dir = os.path.join("assets", "Kaggle")
kaggle_path = os.path.join(parent_path, kaggle_dir)

# Dataset folder and its "raw" sub-folder, both relative to kaggle_path.
kaggle_dataset_dir = "alzheimer_mri_preprocessed_dataset"
kaggle_raw_dir = os.path.join(kaggle_dataset_dir, "raw")

kaggle_dataset_path = os.path.join(kaggle_path, kaggle_dataset_dir)
kaggle_raw_path = os.path.join(kaggle_path, kaggle_raw_dir)

In [None]:
# Diagnosis labels; their position in this list is their integer encoding.
classes = [
    "Non_Demented",
    "Very_Mild_Demented",
    "Mild_Demented",
    "Moderate_Demented",
]
# Lookup table: class name -> index within `classes`.
encoded_classes = dict(zip(classes, range(len(classes))))

In [None]:
# add parent to path
# Guarded so that re-running this cell does not append the same entry again.
if parent_path not in sys.path:
    sys.path.append(parent_path)

Loading images and labels (classes)

In [None]:
# Project-local import. NOTE(review): presumably resolved through the
# parent_path entry appended to sys.path in an earlier cell, which would be why
# it is not part of the top import cell — confirm.
from alzheimersdetection import Dataset

# Load the samples (X) and their labels (y) for the listed classes from the raw
# dataset directory. NOTE(review): array shapes/dtypes are decided inside
# Dataset.load_dataset — confirm there.
X, y = Dataset.load_dataset(classes, kaggle_raw_path)

In [None]:
# Sanity check on what was loaded; going by its name, the project helper
# reports the shapes of X and y.
Dataset.printShapes(X, y)

Split the data into 80% training and 20% testing data. Ensure same class distribution using stratify=y (class/label).

Further split the training data into 75% training and 25% validation respectively.

Ratio: 60% Training : 20% Validation : 20% Testing

In [None]:
# Fixed seed so the train/validation/test membership is the same on every run.
# Without it, the archives saved further down change each time the notebook is
# executed and results cannot be reproduced.
SEED = 42

train_size = 0.80       # informational: what remains once test_size is taken out
test_size = 0.20        # share of the full dataset held out for testing
validation_size = 0.25  # share of the remaining 80%, i.e. 20% of the full dataset

# First split: 80% train(+validation) / 20% test, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=SEED
)

print("Before Validation - Training Data Shape:", X_train.shape)
print("Before Validation - Training Label Shape:", y_train.shape)

# Second split: carve the validation set out of the training portion,
# again stratified so each subset keeps the class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=validation_size, stratify=y_train, random_state=SEED
)

See the overall sizes and verify that the split occurred correctly for each class

In [None]:
# Shape of each split, reported in the order training / test / validation.
for _caption, _images in (
    ("Training Size: ", X_train),
    ("Test Size: ", X_test),
    ("Validation Size: ", X_val),
):
    print(_caption, _images.shape)

print("\nClasses encoded for reference: ", encoded_classes, "\n")

# Per-class counts for the full label set and for every split, so the
# stratification can be checked by eye. `unique` ends up holding the
# (labels, counts) pair of the last entry, the testing split.
for _caption, _labels in (
    ("Original: ", y),
    ("Training Split: ", y_train),
    ("Validation Split: ", y_val),
    ("Testing Split: ", y_test),
):
    unique = np.unique(_labels, return_counts=True)
    print(_caption, unique)

Save the testing and validation data onto a compressed Numpy Archive (*.npz)

Image is saved for viewing purposes

In [None]:
# Pixel-value range of the held-out images. The original assigned the minimum
# to a variable called max_val; both ends of the range are reported now.
min_val, max_val = X_test.min(), X_test.max()
print(min_val, max_val)

# os.path.join instead of a hard-coded "\" so the directory is valid on any OS.
test_dir = os.path.join(kaggle_dataset_path, "Test_Data")
test_npz = "test_data.npz"
Dataset.save_images_npz(X_test, y_test, classes, test_dir, test_npz)

In [None]:
# os.path.join instead of a hard-coded "\" so the directory is valid on any OS.
validation_dir = os.path.join(kaggle_dataset_path, "Validation_Data")
validation_npz = "val_data.npz"
Dataset.save_images_npz(X_val, y_val, classes, validation_dir, validation_npz)

In [None]:
# Project-local plotting helpers.
import stats.statistics as Statistics

# (labels, counts) pair for the training labels.
unique = np.unique(y_train, return_counts=True)

print(encoded_classes)
print(unique)

# Pair the class names with their counts and draw the pie chart.
title = "AD Classification Distribution with Training Dataset"
size = unique[1].tolist()
sample_dist = (classes, size)
Statistics.pieChartClassificationPlot(sample_dist, title)

To further balance the dataset, we need to employ more techniques, one of which is data augmentation.
One way to keep the augmentation process itself balanced is to define class-specific augmentation rates.

In [None]:
# Augmentation pipeline applied to every generated copy: resize to 128x128,
# random horizontal flip, small shift/scale (no rotation) padded with black,
# and a contrast-only jitter.
data_transforms = aug.Compose(
    [
        aug.Resize(height=128, width=128),
        aug.HorizontalFlip(p=0.5),
        aug.ShiftScaleRotate(
            shift_limit=0.05,
            scale_limit=0.05,
            rotate_limit=0,
            border_mode=cv2.BORDER_CONSTANT,
            value=0,
            p=1,
        ),
        aug.RandomBrightnessContrast(brightness_limit=0.0, contrast_limit=0.4, p=1),
    ]
)

# Number of augmented copies per training image, indexed by encoded class label.
rates = [1, 1, 2, 3]


# Function to augment
def augment(image, transform):
    """Run `transform` on `image` and return the augmented image.

    The image is copied into a NumPy array, passed to the transform as the
    `image` keyword, and the "image" entry of the returned mapping is handed back.
    """
    result = transform(image=np.array(image))
    return result["image"]


X_aug, y_aug = [], []
for image, label in zip(X_train, y_train):
    copies = rates[label]
    # One freshly randomised augmentation per requested copy.
    X_aug.extend(augment(image, data_transforms) for _ in range(copies))
    y_aug.extend([label] * copies)

X_aug = np.array(X_aug)
y_aug = np.array(y_aug)

In [None]:
# Stack the augmented samples after the originals along the sample axis
# (axis 0 is np.concatenate's default).
# NOTE: this overwrites X_train / y_train, so running the cell a second time
# appends X_aug / y_aug again.
X_train = np.concatenate([X_train, X_aug])
y_train = np.concatenate([y_train, y_aug])

Class Distribution after Data Augmentation

In [None]:
# (labels, counts) pair for the training labels, augmented copies included.
unique = np.unique(y_train, return_counts=True)

print(encoded_classes)
print(unique)

# Pair the class names with their counts and draw the pie chart.
title = "AD Classification Distribution after Data Augmentation"
size = unique[1].tolist()
sample_dist = (classes, size)
Statistics.pieChartClassificationPlot(sample_dist, title)

In [None]:
print("Shape Before:", X_train.shape)

# Preprocess data (normalize pixel values)
# Each image is flattened to one row so the scaler sees a 2D
# (samples, pixels) matrix. MinMaxScaler rescales every column, i.e. every
# pixel position, to [0, 1] using that column's own min and max.
# NOTE: X_train is overwritten with the flat 2D matrix, so this cell cannot be
# re-run without first rebuilding the image-shaped X_train.
pixels_per_image = X_train.shape[1] * X_train.shape[2]
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train.reshape(-1, pixels_per_image))

print("Shape After: ", X_train.shape)

The dataset is still imbalanced. To fix this, we need to increase the minority classes' representation (oversampling), which gives us a more balanced dataset.

We will be using Adaptive Synthetic Sampling (ADASYN) to oversample the minority classes.

Optimal Results: ~25% distribution across all AD classifications.

In [None]:
# Tally of training samples per encoded class label.
class_counts = Counter(y_train)

print("AD Classification Distribution before Oversampling")
print(class_counts)

# Visualize class imbalance before oversampling
x_label = "CDR Rating"
y_label = "Number of Images"
title = "Class Distribution before Oversampling"
Statistics.barClassificationPlot(
    sample=class_counts,
    title=title,
    x_label=x_label,
    y_label=y_label,
)

In [None]:
k = 5  # This is the k-neighbors which will be used for ADASYN

# random_state pins the synthetic samples so the oversampled training set is
# the same on every run.
adasyn = ADASYN(n_neighbors=k, random_state=42)

X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# These are the counts AFTER resampling; the previous message called them the
# original counts.
print("Resampled dataset shape counter: %s" % Counter(y_resampled))

In [None]:
print(X_resampled.shape)

# Invert normalization with the fitted scaler. MinMaxScaler rescaled every
# pixel position by its own min/max, so a blanket "* 255" does not restore the
# original intensities; inverse_transform undoes the per-pixel scaling. It
# expects the flat 2D (samples, pixels) layout, so it runs before the reshape.
X_resampled = scaler.inverse_transform(X_resampled)

# Reshape the 2d np array back to a 3d np array
size = X_resampled.shape[0]
X_resampled = X_resampled.reshape(size, 128, 128)

# Round rather than truncate, and clip so float error cannot wrap around in the
# uint8 cast (uint8 is the format wanted for PyTorch).
# assumes the source images use the 0-255 range — TODO confirm
X_resampled = np.clip(np.rint(X_resampled), 0, 255).astype(np.uint8)

print(X_resampled.shape)
Dataset.showImage(X_resampled, len(X_resampled)-1)

In [None]:
# Print class distribution after oversampling
class_counts_balanced = Counter(y_resampled)
print("AD Classification Distribution after Oversampling")
print(class_counts_balanced)

Dataset.printShapes(X_resampled, y_resampled)

# Visualize class distribution after oversampling
# (x_label / y_label are the axis captions defined for the "before" chart).
title = "Class Distribution after Oversampling"
Statistics.barClassificationPlot(
    sample=class_counts_balanced,
    title=title,
    x_label=x_label,
    y_label=y_label,
)

Save the training data onto a compressed Numpy Archive (*.npz)

Image is saved for viewing purposes

In [None]:
# os.path.join instead of a hard-coded "\" so the directory is valid on any OS.
train_dir = os.path.join(kaggle_dataset_path, "Train_Data")
train_npz = "augmented_adasyn_train_data.npz"
Dataset.save_images_npz(X_resampled, y_resampled, classes, train_dir, train_npz)

------
<p style="text-align: center;"> Made with ❤️ </p>
<p style="text-align: center;"> Darwin Xue </p>