In [None]:
# Standard library imports
import os

# Third-party libraries
import numpy as np
import pandas as pd
from skimage.transform import rotate
from sklearn.model_selection import train_test_split

# Keras and TensorFlow imports
import keras
from keras.models import Model, Sequential
from keras.layers import Dense, Concatenate, Flatten, GlobalAveragePooling2D, GlobalMaxPooling2D, Dropout, Input
from keras.optimizers import Adam
from keras.metrics import Precision, Recall
from keras.utils import Sequence
from keras.preprocessing.image import load_img, img_to_array

# Pre-trained models from Keras and TensorFlow
from keras.applications.vgg19 import VGG19
from keras.applications.densenet import DenseNet121
from tensorflow.keras.applications import InceptionV3, ResNet50, EfficientNetV2S
from tensorflow.keras.applications.resnet50 import preprocess_input

1 GLOBAL VARIABLES:

In [None]:
# ---- Training schedule ----
EPOCHS = 20        # full passes over the training data
BATCH_SIZE = 32    # samples per batch

# ---- Image geometry ----
IMG_HEIGHT, IMG_WIDTH = 224, 224                   # input image size in pixels
MODELS_INPUT_SHAPE = (IMG_HEIGHT, IMG_WIDTH, 3)    # (height, width, channels)

# ---- Dataset ----
VALIDATION_DATASET_SIZE = 0.2   # fraction of rows held out for validation (20%)
FILENAME_COLUMN = 'id'          # column holding the unique file identifier
TARGET_VARIABLE = 'T1'          # column holding the classification label
# Prefix for the CSV and image paths. Paths are built by plain string concatenation,
# so this must be empty or end with a path separator.
DATA_LOCATION = ""

# ---- Optimizer ----
ADAM_COEFFICIENT = 0.001   # learning rate passed to the Adam optimizer

2 DATA LOADING:

In [None]:
# Read the label table, then expand every bare image id into the path of its
# JPEG file under the images/ folder.
original_data = pd.read_csv(DATA_LOCATION + 'binary_dataset.csv')
original_data[FILENAME_COLUMN] = original_data[FILENAME_COLUMN].map(
    lambda image_id: DATA_LOCATION + 'images/' + image_id + '.jpg'
)

2.2 DEALING WITH IMBALANCED DATA

In [None]:

def return_datasets(data_df, random_state=42):
    """
    Splits the data into training and validation sets and balances the training set by oversampling.

    The split happens *before* oversampling so that duplicated rows can never end up in both
    sets (that would leak training images into validation and inflate its metrics). Only the
    training set is balanced; the validation set keeps the original class distribution.

    Classes are treated as "target == 1" versus "everything else". Whichever of the two groups
    is smaller in the training set is resampled with replacement until both have the same size.

    Args:
        data_df (pd.DataFrame): The input DataFrame containing features and the target column
            named by TARGET_VARIABLE. It is not modified.
        random_state (int, optional): Seed used for the split, the oversampling and the final
            shuffle. Pass None for non-reproducible results. Defaults to 42.

    Returns:
        tuple: Two DataFrames (training, validation) with features and target variable included,
            both re-indexed from 0.

    Raises:
        ValueError: If the training split contains only one of the two classes, so there is
            nothing to oversample from.
    """
    is_positive = data_df[TARGET_VARIABLE] == 1

    # Stratifying keeps the class ratio identical in both splits, but scikit-learn rejects
    # it when a class has fewer than two rows; fall back to a plain random split in that case.
    smallest_class = min(is_positive.sum(), (~is_positive).sum())
    stratify = is_positive if smallest_class >= 2 else None

    # Split the untouched data first; oversampling is applied to the training part only
    train_part, val_part = train_test_split(
        data_df, test_size=VALIDATION_DATASET_SIZE, random_state=random_state, stratify=stratify
    )

    # Work out which group is the minority instead of assuming it is the positive class
    train_positive = train_part[TARGET_VARIABLE] == 1
    positives, negatives = train_part[train_positive], train_part[~train_positive]
    if len(positives) <= len(negatives):
        minority, majority = positives, negatives
    else:
        minority, majority = negatives, positives

    # Number of extra minority rows needed to match the majority
    gap = len(majority) - len(minority)
    if gap > 0:
        if minority.empty:
            raise ValueError(
                "Cannot balance the training set: it contains only one class of "
                f"'{TARGET_VARIABLE}'."
            )
        extra_rows = minority.sample(n=gap, replace=True, random_state=random_state)
        train_part = pd.concat([train_part, extra_rows], ignore_index=True)

    # Shuffle so the appended duplicates are not clustered at the end of the training set
    train_part = train_part.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Return the processed training and validation datasets
    return train_part, val_part.reset_index(drop=True)

# Build the training and validation DataFrames from the loaded label table
train_df, val_df = return_datasets(original_data)

2.3 DATAGENERATORKERAS CLASS:

In [None]:
class DataGeneratorKeras(Sequence):
    """
    Keras Data Generator for loading and preprocessing image datasets.

    This generator loads images from a dataset, applies optional augmentation and preprocessing,
    and returns batches of images along with their corresponding labels. Every row of the
    dataset is served once per epoch; the last batch is smaller when the dataset size is not
    a multiple of the batch size.

    Attributes:
        dataset (pd.DataFrame): DataFrame containing image file paths and target labels.
        batch_size (int): Maximum number of samples per batch.
        image_shape (tuple): Shape of the input images (height, width, channels).
        filename_column (str): Column name containing image file paths.
        target_column (list): Column(s) containing target labels.
        augmentation (bool): Whether to apply data augmentation.
        preprocessing_fn (callable): Function to apply preprocessing to images.
    """

    def __init__(self, dataset, augmentation=False, preprocessing_fn=None,
                 filename_column='id', target_column=['T1'], image_shape=(224, 224, 3), batch_size=16):
        """
        Initializes the data generator.

        Args:
            dataset (pd.DataFrame): DataFrame containing image paths and target labels.
            augmentation (bool, optional): Whether to apply data augmentation. Defaults to False.
            preprocessing_fn (callable, optional): Function for additional image preprocessing. Defaults to None.
            filename_column (str, optional): Column name with image file paths. Defaults to 'id'.
            target_column (list or str, optional): Column name(s) containing labels. A single
                name may be given as a plain string. Defaults to ['T1'].
            image_shape (tuple, optional): Shape of the input images. Defaults to (224, 224, 3).
            batch_size (int, optional): Number of samples per batch. Defaults to 16.

        Raises:
            ValueError: If batch_size is not positive.
        """
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        self.dataset = dataset
        self.batch_size = batch_size
        self.image_shape = image_shape
        self.filename_column = filename_column
        # Always store a private list: avoids sharing the mutable default between instances
        # and keeps column selection 2-D even when a single name is passed as a string.
        self.target_column = [target_column] if isinstance(target_column, str) else list(target_column)
        self.augmentation = augmentation
        self.preprocessing_fn = preprocessing_fn

    def __len__(self):
        """
        Computes the number of batches per epoch.

        Rounds up so that the trailing rows that do not fill a whole batch are still served.

        Returns:
            int: Number of batches per epoch.
        """
        return int(np.ceil(len(self.dataset) / self.batch_size))

    def on_epoch_end(self):
        """
        Shuffles the dataset at the end of each epoch to improve training variability.
        """
        self.dataset = self.dataset.sample(frac=1).reset_index(drop=True)

    def __getitem__(self, idx):
        """
        Generates a batch of data.

        Args:
            idx (int): Index of the batch.

        Returns:
            tuple: (images, labels), where images is a float32 array of shape
                   (n, *image_shape) holding the preprocessed input data and labels is a
                   float32 array of shape (n, number of target columns). n equals batch_size
                   except possibly for the last batch.

        Raises:
            IndexError: If idx is outside the range [0, len(self)).
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"Batch index {idx} out of range for {len(self)} batches")

        # Slice the rows of this batch once; the slice is shorter for the final partial batch
        start = idx * self.batch_size
        batch = self.dataset.iloc[start:start + self.batch_size]

        images = np.empty((len(batch), *self.image_shape), dtype=np.float32)

        for i, image_path in enumerate(batch[self.filename_column]):
            image = img_to_array(load_img(image_path, target_size=self.image_shape[:2]))

            # Apply optional augmentation
            if self.augmentation:
                # preserve_range keeps the original pixel value range instead of rescaling it
                image = rotate(image, np.random.uniform(-30, 30), preserve_range=True)
                if np.random.choice([True, False]):  # Random horizontal flip
                    image = np.flip(image, axis=1)

            # Apply optional preprocessing function
            if self.preprocessing_fn:
                image = self.preprocessing_fn(image)

            images[i] = image

        labels = batch[self.target_column].to_numpy(dtype=np.float32)

        return images, labels


2.4 CREATE THE DATA GENERATORS USED FOR MODEL TRAINING:

In [None]:
# Build both generators from the shared configuration constants so that batch size,
# column names and image shape stay in sync with the values defined at the top of the notebook.
# Augmentation is enabled for training only.
train_datagen = DataGeneratorKeras(
    dataset=train_df,
    augmentation=True,
    preprocessing_fn=preprocess_input,
    filename_column=FILENAME_COLUMN,
    target_column=[TARGET_VARIABLE],
    image_shape=MODELS_INPUT_SHAPE,
    batch_size=BATCH_SIZE,
)
valid_datagen = DataGeneratorKeras(
    dataset=val_df,
    augmentation=False,
    preprocessing_fn=preprocess_input,
    filename_column=FILENAME_COLUMN,
    target_column=[TARGET_VARIABLE],
    image_shape=MODELS_INPUT_SHAPE,
    batch_size=BATCH_SIZE,
)

3 MODEL ARCHITECTURE:

In [None]:
# Define the input
inputs = Input(shape=MODELS_INPUT_SHAPE)

# Load ResNet50 as a feature extractor on top of the input tensor.
# include_top=False drops the 1000-class ImageNet classifier, so base_model.output is the
# last convolutional feature map rather than ImageNet class probabilities.
# The other imported backbones (VGG19, InceptionV3, EfficientNetV2S) can be swapped in with
# the same arguments.
# NOTE(review): the generators use the ResNet50 preprocess_input; a different backbone
# presumably needs its own preprocessing function — confirm before switching.
base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=inputs)

# Freeze the pre-trained weights; only the new classification head below is trained
base_model.trainable = False

# Add global average pooling layer: collapses the spatial feature map to one vector per image
x = GlobalAveragePooling2D()(base_model.output)

# Classification head: a funnel of fully connected layers ending in a single sigmoid unit
for units in [256, 128, 64, 32, 16]:
    x = Dense(units=units, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)

# Create the model
model = Model(inputs, outputs)

# Compile the model
model.compile(optimizer = Adam(ADAM_COEFFICIENT), loss = 'binary_crossentropy', metrics = ['accuracy', Precision(), Recall()])

4 MODEL TRAINING:

In [None]:
# Train on the training generator for EPOCHS epochs, using the validation generator
# as validation data.
model.fit(train_datagen, validation_data=valid_datagen, epochs=EPOCHS)