# Imports

In [None]:
!pip install tensorflow==2.10.1  -q gwpy

In [None]:
!pip uninstall matplotlib
!pip install matplotlib==3.1.3

In [None]:
!pip install albumentations

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
from google.colab import drive
import tensorflow as tf
import numpy as np

import random
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator,load_img,img_to_array,array_to_img
from sklearn.metrics import confusion_matrix
from keras.callbacks import *
import random
from keras import backend as K
from PIL import Image
import cv2
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator
tfk = tf.keras
tfkl = tf.keras.layers
print(tf.__version__)
import albumentations as A
from itertools import combinations
import json
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from typing import Tuple
import scipy
from skimage.measure import label as label_fn
from skimage import filters
from sklearn.model_selection import StratifiedKFold
import albumentations as A

# Mount the My Drive folder

In [None]:
# Mount Google Drive under /gdrive; force_remount re-mounts even if a mount already exists
from google.colab import drive
drive.mount('/gdrive', force_remount = True)

# Env setup

In [None]:
%cd /gdrive/MyDrive/tuberculosis-pneumonia-classification

In [None]:
# Seed every random number generator used in the notebook for reproducibility
SEED = 4224
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Dataset locations, relative to the working directory chosen above
labels_path = 'data/labels_train_clean.csv'
all_data_no_duplicates_path = 'data/train_all_no_duplicates'
clean_data_path = 'data/train_clean/'
noisy_data_path = 'data/train_noisy/'

# Split sizes. They are applied hierarchically and therefore do not sum to 1:
# test_percentage is taken from the whole dataset, validation_percentage from
# what is left. train_percentage is not referenced by the cells shown here.
train_percentage = 0.8
validation_percentage = 0.15
test_percentage = 0.2
img_size = (224,224) # passed as out_shape to the generators
batch_size = 16

# Data generator

In [None]:
class CustomGenerator(tf.keras.utils.Sequence):
  """
    Keras Sequence that serves batches of grayscale images, replicated to
    three channels, together with their labels.

    Main methods:
      - __init__: save dataset params like directories, filenames, labels, etc.
      - __len__: return the number of full batches per epoch
      - __getitem__: return a single batch of paired images and labels
      - on_epoch_end: reshuffle the sample order between epochs (if enabled)
  """

  def __init__(self, 
               dataframe, # dataframe of the dataset (`file` and `label` columns are read)
               base_path, # image folder, or a tuple of folders to search
               preprocessing_function=None, # Preprocessing function (e.g., the one used for transfer learning)
               batch_size=16, # Batch size
               out_shape = (100,100), # size handed to cv2.resize
               shuffle=False, # reshuffle the sample order after every epoch
               categorical = True, # one-hot encode the labels
               augment = False, # apply the random albumentations pipeline
               seed = SEED, # stored only; not used by the generator itself
               flow_from_directory = True, # read images lazily instead of caching them
               preprocess_input = False): # run the image-cleaning pipeline
    
    file_column = list(dataframe.file)
    label_column = list(dataframe.label)

    if isinstance(base_path, tuple):
      # Several folders: keep the files that are listed in the dataframe and
      # collect their labels in the same order, so filenames[i] and labels[i]
      # always describe the same image.
      label_by_file = dict(zip(file_column, label_column))
      self.filenames = []
      raw_labels = []
      for folder in base_path:
        for name in self.folderToPaths(folder, full_path = False):
          if name in label_by_file:
            self.filenames.append(os.path.join(folder, name))
            raw_labels.append(label_by_file[name])
    else:
      self.filenames = [os.path.join(base_path, name) for name in file_column]
      raw_labels = label_column

    if categorical and len(label_column) > 0:
      # Derive the one-hot width from the whole dataframe so that it does not
      # shrink when only part of the files is found on disk.
      num_classes = int(np.max(label_column)) + 1
      self.labels = tfk.utils.to_categorical(raw_labels, num_classes = num_classes)
    else:
      self.labels = raw_labels

    # Set indices list in [0, len(filenames))
    self.indices = np.arange(len(self.filenames))

    # Save dataset parameters as class attributes
    self.base_path = base_path
    self.preprocessing_function = preprocessing_function
    self.out_shape = out_shape
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.augment = augment
    self.seed = seed
    self.flow_from_directory = flow_from_directory
    self.data_augmentation = A.Compose([
    A.RandomBrightnessContrast(brightness_limit = 0.05, contrast_limit=0.05, p=0.5),
    A.ShiftScaleRotate(p = 0.8, rotate_limit = 20, scale_limit = 0.3, border_mode =  cv2.BORDER_CONSTANT, value = 0),
    A.CLAHE(p=0.2)
    ])
    self.preprocess_input = preprocess_input

    # Cache every image in memory when lazy loading is disabled
    if not self.flow_from_directory:
      self.images = self.load_all_imgs()

  def augmentation(self, images):
    # Returns the albumentations result dict; the image is under the 'image' key
    return self.data_augmentation(image = images)


  def __filterNoisyOnClahe(self, image):
    # Denoise the image when a CLAHE-enhanced Laplacian-of-Gaussian response
    # has a variance above 800
    clahe = cv2.createCLAHE(clipLimit = 300, tileGridSize = (50, 50))
    im1 = cv2.resize(image, (400, 400))
    im1 = scipy.ndimage.gaussian_laplace(im1, sigma = 6)
    im1 = clahe.apply(im1)
    var1 = np.var(im1)
    if var1 > 800:
      image= cv2.medianBlur(image, ksize=5)
      return scipy.ndimage.uniform_filter(image, size=3)
    else:
      return image


  def __sharpenImage(self, image):
    # 3x3 sharpening kernel (weights sum to 1)
    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])

    sharpened = cv2.filter2D(image, -1, sharpen_kernel)

    return sharpened

  def __invert_image(self, img):
    # Return 255 - img when, after Otsu thresholding, the four corner regions
    # are mostly above the threshold; otherwise return img unchanged.
    otsu_thresh = filters.threshold_otsu(img)
    masked_image = (img > otsu_thresh) * 1.0
    # NOTE(review): the 220:244 bounds look tuned for images of about 244 px;
    # on smaller images numpy clips the slice - confirm this is intended.
    valsROI1, _ = np.histogram(masked_image[220:244, 220:244], bins=2, range=(0, 1))
    valsROI2, _ = np.histogram(masked_image[0:25, 0:25], bins=2, range=(0, 1))
    valsROI3, _ = np.histogram(masked_image[0:25, 220:244], bins=2, range=(0, 1))
    valsROI4, _ = np.histogram(masked_image[220:244, 0:25], bins=2, range=(0, 1))
    
    # valsTot[0] counts below-threshold corner pixels, valsTot[1] the others
    valsTot = valsROI1 + valsROI2 + valsROI3 + valsROI4
    
    labels = label_fn(masked_image)

    # Masks that split into 100 or more distinct labels are never inverted
    if len(np.unique(labels)) < 100:
        if valsTot[0] > valsTot[1]:
            return img
        else:
            return 255 - img
    return img


  def __filterBlurred(self, image):
    # Sharpen the image when its thresholded Laplacian-of-Gaussian response
    # has a variance below 5
    clahe = cv2.createCLAHE(clipLimit = 1.8, tileGridSize = (4, 4))
    minThresh = 2
    im1 = cv2.resize(image, (400, 400))
    im1 = scipy.ndimage.gaussian_laplace(im1, sigma = 2)
    im1 = clahe.apply(im1)
    _, im1 = cv2.threshold(im1, minThresh, 255, cv2.THRESH_BINARY)
    var1 = np.var(im1)
    if var1 < 5:
        return self.__sharpenImage(image)
    else:
        return image 

  def __filterUnderexposed(self, image):
    # Apply CLAHE to images whose mean intensity is below 71
    im1 = cv2.resize(image, (400, 400))
    mean1 = np.mean(im1)
    if mean1 < 71:
      clahe = cv2.createCLAHE(clipLimit = 2, tileGridSize = (2, 2))
      image = clahe.apply(image)
      return image
    else:
      return image


  def preprocess(self, image):
    # Cleaning pipeline; each step only changes images that trigger its test
    image = self.__invert_image(image)
    image = self.__filterUnderexposed(image)
    image = self.__filterNoisyOnClahe(image)
    image = self.__filterBlurred(image)

    return image

  def __len__(self):
    # Number of full batches; a trailing partial batch is dropped by the
    # floor division, so up to batch_size - 1 samples are not served per epoch
    return len(self.filenames) // self.batch_size

  def on_epoch_start(self):
    # Reshuffle the sample order. Keras never calls this name itself; it is
    # kept so existing callers keep working.
    if self.shuffle:
        np.random.shuffle(self.indices)

  def on_epoch_end(self):
    # Hook that Keras calls after every epoch on a Sequence
    self.on_epoch_start()

  def _load_image(self, filename):
    # Read one image as grayscale, resize it and optionally clean it
    image = cv2.imread(filename, 0)
    if image is None:
      # cv2.imread returns None instead of raising when a file cannot be read
      raise FileNotFoundError(f"Could not read image: {filename}")
    image = cv2.resize(image, (self.out_shape))
    if self.preprocess_input:
      image = self.preprocess(image)
    return image

  def load_all_imgs(self):
      # Load every image of the dataset into a single array
      return np.array([self._load_image(f) for f in self.filenames])

  def get_image_and_label(self, index):
    # Return the three-channel image and the label stored at `index`
    curr_label = self.labels[index]

    if self.flow_from_directory:
      image = self._load_image(self.filenames[index])
    else:
      image = self.images[index]

    if self.augment:
      # The pipeline returns a dict; the augmented image is under 'image'
      image = self.augmentation(image)['image']

    if not self.flow_from_directory:
      image = np.squeeze(image)

    # Replicate the single gray channel three times along the last axis
    image = np.stack([image, image, image], axis = -1)

    return image, curr_label

  def __getitem__(self, index):
    # In this function we generate a batch (of size self.batch_size) of images and corresponding labels
    
    # Get 'self.batch_size' indices
    start = index*self.batch_size
    current_indices = self.indices[start:start+self.batch_size]

    # Init lists that will contain images and labels
    batch_images = []
    batch_labels = []

    # Cycle over the indices
    for idx in current_indices:
      # Get single image/label at index 'idx'
      image, label = self.get_image_and_label(idx)

      # Apply the preprocessing function
      if self.preprocessing_function is not None:
        image = self.preprocessing_function(image)

      # Append both image (with added batch dimension) and label to the batch lists
      batch_images.append(np.expand_dims(image, 0))
      batch_labels.append(label)
     
    # Finally, obtain a final batch by concatenating all the images over the batch dimension
    batch_images = np.concatenate(batch_images, axis=0)
    batch_labels = np.array(batch_labels)

    return batch_images, batch_labels


  def folderToPaths(
        self,
        full_img_dir,
        full_path = True
):
    # List the entries of `full_img_dir` in sorted order, either as bare names
    # or joined with the directory
    if full_path:
      return sorted(os.path.join(full_img_dir, name) for name in os.listdir(full_img_dir))
    return sorted(os.listdir(full_img_dir))

# Cyclical LR

In [None]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super().__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here instead of with an AttributeError on the first clr() call
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or "
                    "'exp_range' when scale_fn is None, got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Returns the learning rate for the current cycle iteration."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x is the normalised distance from the peak of the current cycle
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        amplitude = (self.max_lr-self.base_lr)*np.maximum(0, (1-x))
        if self.scale_mode == 'cycle':
            return self.base_lr + amplitude*self.scale_fn(cycle)
        else:
            return self.base_lr + amplitude*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Sets the optimizer learning rate when training starts."""
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Records the current lr and batch logs, then sets the next lr.

        The first argument receives the batch index; its name is kept
        for backward compatibility.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

# ConvNext Experiment 1: train with all data (no duplicates)

## Data loading

In [None]:
def encode(x):
  """Map a class letter to its integer id: 'N' -> 0, 'P' -> 1, anything else -> 2."""
  for class_id, letter in enumerate(('N', 'P')):
    if x == letter:
      return class_id
  return 2

In [None]:
def folderToPaths(
        full_img_dir,
        full_path = True
):
    """Return the sorted entries of `full_img_dir`.

    With `full_path` set, every entry is joined with the directory;
    otherwise only the bare entry names are returned.
    """
    entries = os.listdir(full_img_dir)
    if full_path:
        entries = [os.path.join(full_img_dir, entry) for entry in entries]
    return sorted(entries)

In [None]:
# Load the label table (the `file` and `label` columns are used below) and preview it
labelsDF = pd.read_csv(labels_path)
display(labelsDF.head(20))

In [None]:
# Replace the class letters with their integer ids and preview the result
labelsDF.label = labelsDF.label.apply(encode)
display(labelsDF.head(20))

In [None]:
len(set(labelsDF.file)) # 1 acquisition per patient

In [None]:
# Sorted full paths of every entry in the de-duplicated image folder
all_data_no_duplicates_path_list = folderToPaths(full_img_dir = all_data_no_duplicates_path)

In [None]:
# Stratified hold-out split: first carve out the test set, then take the
# validation set from the remainder (so validation_percentage is a fraction of
# train_val, not of the full dataset).
train_val, test = train_test_split(labelsDF, test_size = test_percentage, shuffle = True, stratify = labelsDF.label, random_state = SEED)
train, val = train_test_split(train_val, test_size = validation_percentage, shuffle = True, stratify = train_val.label, random_state = SEED)

In [None]:
# Lazy-loading generators for experiment 1 (one-hot labels, no augmentation).
# The test generator is not shuffled: the evaluation cell compares the
# predictions with `test.label` row by row, so the order must stay fixed.
train_gen = CustomGenerator(dataframe = train, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, flow_from_directory=True)
valid_gen = CustomGenerator(dataframe = val, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, flow_from_directory=True)
test_gen = CustomGenerator(dataframe = test, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = False, flow_from_directory=True)

In [None]:
# Sorted unique class ids. np.unique guarantees ascending order, which a set
# does not, and the class-weight vector computed from this array is indexed by
# class id.
dataset_labels = np.unique(labelsDF.label).astype(int)

In [None]:
# Preview the first training batch on a 4x4 grid
iterator = iter(train_gen)
images, labels = next(iterator)
fig, axis = plt.subplots(4, 4, figsize = (20, 20))

axis = axis.flatten()

# Hide the axes of every cell, including the ones that stay empty
for ax in axis:
  ax.set_axis_off()

# zip stops at the shorter input, so a batch larger than the grid no longer
# raises an IndexError
for ax, img in zip(axis, images):
  ax.imshow(img.squeeze(), cmap='gray')

plt.show()

## Transfer learning model - no augmentation - no class weights

In [None]:
# ImageNet-pretrained ConvNeXt-Tiny backbone without its classification head
supernet1 = tf.keras.applications.convnext.ConvNeXtTiny(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

print(len(supernet1.layers))

# Freeze the first 79 layers and fine-tune everything from layer 80 onwards
for layer_number, layer in enumerate(supernet1.layers, start=1):
    layer.trainable = layer_number >= 80

In [None]:

# NOTE: the names imported on the next two lines are not referenced in this cell
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

inputs = tfk.Input((224,224,3))

# Backbone features
y = supernet1(inputs)

# Pool the feature map into one vector per image
y = tf.keras.layers.GlobalAveragePooling2D()(y)

# Classification head: one hidden layer followed by a 3-way softmax
y = tf.keras.layers.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y)

outputs2 = tf.keras.layers.Dense(3, activation='softmax', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y)

tl_model_exp1 = tfk.Model(inputs=inputs, outputs=outputs2, name='model')

# One-hot labels are used in this experiment, hence CategoricalCrossentropy
tl_model_exp1.compile(loss=tfk.losses.CategoricalCrossentropy(), optimizer=tfk.optimizers.Adam(learning_rate = 1e-4), metrics='accuracy')
tl_model_exp1.summary()

In [None]:
# len(train_gen) is the number of batches per epoch, so step_size works out to
# 6 * len(train_gen): the learning rate needs six epochs of batches to go from
# base_lr to max_lr (half a cycle).
training_samples = int(len(train_gen)*batch_size)
step_size = 6*training_samples // batch_size

clr = CyclicLR(
    mode='triangular',
    base_lr=1e-5, 
    max_lr=1e-4,
    step_size= step_size)

# Stop after 10 epochs without a better val_accuracy and restore the best weights;
# only the history dict of the returned History object is kept
history = tl_model_exp1.fit(train_gen,
    epochs = 40,
    use_multiprocessing = True,
    workers = 8,
    validation_data = valid_gen,
    callbacks = [tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True), clr],
).history

### Training plots

In [None]:
# Plot the training: one figure per metric, training vs validation curve
for metric_key, figure_title in (('loss', 'Category Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    for prefix, curve_label, curve_color in (('', 'Training', '#ff7f0e'), ('val_', 'Validation', '#4D61E2')):
        plt.plot(history[prefix + metric_key], label=curve_label, alpha=.8, color=curve_color)
    plt.legend(loc='upper left')
    plt.title(figure_title)
    plt.grid(alpha=.3)

plt.show()

In [None]:
# Persist the per-epoch training history of experiment 1 as JSON
with open('data_for_report/cnext_exp1_historyj.json' , 'w') as fp:
    json.dump(history, fp)

In [None]:
# Save the trained experiment-1 model to model_exp1.h5 in the working directory
tl_model_exp1.save("model_exp1.h5")

### Model testing

In [None]:
# Predict the test set with the CNN.
# CustomGenerator.__len__ uses floor division, so a trailing partial batch of
# test images gets no prediction.
predictions = tl_model_exp1.predict(test_gen)

In [None]:
# The generator only yields full batches, so `predictions` can be shorter than
# the test dataframe. Trim the ground truth to the number of predictions
# instead of hard-coding the size of the dropped remainder.
pred = np.argmax(predictions, axis=-1)
y = np.asarray(list(test.label), dtype=int)[:len(pred)]
target_names = ['N', 'P', 'T']
# Row-normalised: each row (true class) sums to 1
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics (built-in round works for both Python
# and numpy floats)
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',round(accuracy, 4))
print('Precision:',round(precision, 4))
print('Recall:',round(recall, 4))
print('F1:',round(f1, 4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix (transposed so that true labels run along x)
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=[0,1,2], yticklabels=[0,1,2])
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

## Transfer learning model - no augmentation - class weights

In [None]:
# In-memory generators with integer (sparse) labels for the focal-loss experiment.
# The test generator is not shuffled: the evaluation cell compares the
# predictions with `test.label` row by row, so the order must stay fixed.
train_gen = CustomGenerator(dataframe = train, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, categorical = False, flow_from_directory=False)
valid_gen = CustomGenerator(dataframe = val, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, categorical = False, flow_from_directory=False)
test_gen = CustomGenerator(dataframe = test, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = False, categorical = False, flow_from_directory=False)

In [None]:
# 'balanced' class weights computed from the training labels; the returned
# array follows the order of `dataset_labels`
class_weights = class_weight.compute_class_weight('balanced',
                                                 classes = dataset_labels,
                                                 y = train.label)
class_weights

In [None]:
# Fresh ImageNet-pretrained ConvNeXt-Tiny backbone without its classification head
supernet1 = tf.keras.applications.convnext.ConvNeXtTiny(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

print(len(supernet1.layers))

# Freeze the first 79 layers and fine-tune everything from layer 80 onwards
for layer_number, layer in enumerate(supernet1.layers, start=1):
    layer.trainable = layer_number >= 80

In [None]:
!pip install focal-loss

In [None]:
from focal_loss import SparseCategoricalFocalLoss

In [None]:

# NOTE: the names imported on the next two lines are not referenced in this cell
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

inputs = tfk.Input((224,224,3))


# Backbone features
y = supernet1(inputs)

# Pool the feature map into one vector per image
y = tf.keras.layers.GlobalAveragePooling2D()(y)

# Classification head: one hidden layer followed by a 3-way softmax
y = tf.keras.layers.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y)

outputs2 = tf.keras.layers.Dense(3, activation='softmax', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y)

tl_model_exp2 = tfk.Model(inputs=inputs, outputs=outputs2, name='model')

# Integer labels plus per-class weights: sparse focal loss with gamma = 2
tl_model_exp2.compile(loss=SparseCategoricalFocalLoss(class_weight = class_weights, gamma=2), optimizer=tfk.optimizers.Adam(learning_rate = 1e-4), metrics='accuracy')

tl_model_exp2.summary()

In [None]:
# len(train_gen) is the number of batches per epoch, so step_size works out to
# 6 * len(train_gen): the learning rate needs six epochs of batches to go from
# base_lr to max_lr (half a cycle).
training_samples = int(len(train_gen)*batch_size)
step_size = 6*training_samples // batch_size

clr = CyclicLR(
    mode='triangular',
    base_lr=1e-5, 
    max_lr=1e-4,
    step_size= step_size)

# Stop after 15 epochs without a better val_accuracy and restore the best weights;
# only the history dict of the returned History object is kept
history = tl_model_exp2.fit(train_gen,
    epochs = 50,
    use_multiprocessing = True,
    workers = 8,
    validation_data = valid_gen,
    callbacks = [tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=15, restore_best_weights=True), clr],
).history

### Training plots

In [None]:
# Plot the training. This model is compiled with SparseCategoricalFocalLoss,
# so the loss figure is titled accordingly rather than as a cross-entropy.
plt.figure(figsize=(20,5))
plt.plot(history['loss'], label='Training', alpha=.8, color='#ff7f0e')
plt.plot(history['val_loss'], label='Validation', alpha=.8, color='#4D61E2')
plt.legend(loc='upper left')
plt.title('Sparse Categorical Focal Loss')
plt.grid(alpha=.3)

plt.figure(figsize=(20,5))
plt.plot(history['accuracy'], label='Training', alpha=.8, color='#ff7f0e')
plt.plot(history['val_accuracy'], label='Validation', alpha=.8, color='#4D61E2')
plt.legend(loc='upper left')
plt.title('Accuracy')
plt.grid(alpha=.3)

plt.show()

In [None]:
# Persist the per-epoch training history of experiment 2 as JSON
with open('data_for_report/cnext_exp2_historyj.json' , 'w') as fp:
    json.dump(history, fp)

### Model testing

In [None]:
# Predict the test set with the CNN.
# CustomGenerator.__len__ uses floor division, so a trailing partial batch of
# test images gets no prediction.
predictions = tl_model_exp2.predict(test_gen)

In [None]:
# The generator only yields full batches, so `predictions` can be shorter than
# the test dataframe. Trim the ground truth to the number of predictions
# instead of hard-coding the size of the dropped remainder.
pred = np.argmax(predictions, axis=-1)
y = np.asarray(list(test.label), dtype=int)[:len(pred)]
target_names = ['N', 'P', 'T']
# Row-normalised: each row (true class) sums to 1
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics (built-in round works for both Python
# and numpy floats)
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',round(accuracy, 4))
print('Precision:',round(precision, 4))
print('Recall:',round(recall, 4))
print('F1:',round(f1, 4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix (transposed so that true labels run along x)
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=[0,1,2], yticklabels=[0,1,2])
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

## Transfer learning model - augmentation - no class weights

In [None]:
# Only the training data is augmented. Validation and test images are served
# untouched so that model selection and the reported metrics are not affected
# by random transforms. The test generator is also not shuffled, because the
# evaluation cell compares the predictions with `test.label` row by row.
train_gen = CustomGenerator(dataframe = train, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, categorical = True, augment = True)
valid_gen = CustomGenerator(dataframe = val, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = True, categorical = True, augment = False)
test_gen = CustomGenerator(dataframe = test, base_path = all_data_no_duplicates_path, batch_size = batch_size, out_shape = img_size, shuffle = False, categorical = True, augment = False)

In [None]:
# Fresh ImageNet-pretrained ConvNeXt-Tiny backbone without its classification head
supernet1 = tf.keras.applications.convnext.ConvNeXtTiny(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

print(len(supernet1.layers))

# Freeze the first 79 layers and fine-tune everything from layer 80 onwards
for layer_number, layer in enumerate(supernet1.layers, start=1):
    layer.trainable = layer_number >= 80

In [None]:

# NOTE: the names imported on the next two lines are not referenced in this cell
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

inputs = tfk.Input(shape = (224,224,3))

# Backbone features
y = supernet1(inputs)



# Pool the feature map into one vector per image
y = tf.keras.layers.GlobalAveragePooling2D()(y)

# Classification head: one hidden layer followed by a 3-way softmax
y = tf.keras.layers.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y)

outputs2 = tf.keras.layers.Dense(3, activation='softmax', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y)

tl_model = tfk.Model(inputs=inputs, outputs=outputs2, name='model')

# One-hot labels are used in this experiment, hence CategoricalCrossentropy
tl_model.compile(loss=tfk.losses.CategoricalCrossentropy(), optimizer=tfk.optimizers.Adam(learning_rate = 1e-4), metrics='accuracy')

tl_model.summary()

In [None]:
# Only let TensorFlow log messages of level ERROR and above
import logging
tf.get_logger().setLevel(logging.ERROR)

In [None]:
# len(train_gen) is the number of batches per epoch, so step_size works out to
# 6 * len(train_gen): the learning rate needs six epochs of batches to go from
# base_lr to max_lr (half a cycle).
training_samples = int(len(train_gen)*batch_size)
step_size = 6*training_samples // batch_size

clr = CyclicLR(
    mode='triangular',
    base_lr=1e-5, 
    max_lr=1e-4,
    step_size= step_size)

# Stop after 20 epochs without a better val_accuracy and restore the best weights;
# only the history dict of the returned History object is kept
history = tl_model.fit(train_gen,
    epochs = 55,
    workers = 8,
    use_multiprocessing = True,
    validation_data = valid_gen,
    callbacks = [tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=20, restore_best_weights=True), clr],
).history

### Training plots

In [None]:
# Plot the training: one figure per metric, training vs validation curve
for metric_key, figure_title in (('loss', 'Category Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    for prefix, curve_label, curve_color in (('', 'Training', '#ff7f0e'), ('val_', 'Validation', '#4D61E2')):
        plt.plot(history[prefix + metric_key], label=curve_label, alpha=.8, color=curve_color)
    plt.legend(loc='upper left')
    plt.title(figure_title)
    plt.grid(alpha=.3)

plt.show()

In [None]:
# Persist the per-epoch training history of experiment 3 as JSON
with open('data_for_report/cnext_exp3_historyj.json' , 'w') as fp:
    json.dump(history, fp)

### Model testing

In [None]:
# Predict the test set with the CNN.
# CustomGenerator.__len__ uses floor division, so a trailing partial batch of
# test images gets no prediction.
predictions = tl_model.predict(test_gen)

In [None]:
# The generator only yields full batches, so `predictions` can be shorter than
# the test dataframe. Trim the ground truth to the number of predictions
# instead of hard-coding the size of the dropped remainder.
pred = np.argmax(predictions, axis=-1)
y = np.asarray(list(test.label), dtype=int)[:len(pred)]
target_names = ['N', 'P', 'T']
# Raw counts (not normalised) in this experiment
cm = confusion_matrix(y, pred)

# Compute the classification metrics (built-in round works for both Python
# and numpy floats)
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',round(accuracy, 4))
print('Precision:',round(precision, 4))
print('Recall:',round(recall, 4))
print('F1:',round(f1, 4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix (transposed so that true labels run along x)
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=[0,1,2], yticklabels=[0,1,2])
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

# EffNet Experiment: train with data preprocessing + 1vsAll



## Common steps

In [None]:
def encode(x):
  """Map a class letter to its integer id: 'T' -> 0, 'P' -> 1, anything else -> 2."""
  for class_id, letter in enumerate(('T', 'P')):
    if x == letter:
      return class_id
  return 2

def encode2(x, target_label):
  """One-vs-all encoding: 1 when `x` equals `target_label`, otherwise 0."""
  return 1 if x == target_label else 0

In [None]:
def folderToPaths(
        full_img_dir,
        full_path = True
):
    """Return the sorted entries of `full_img_dir`.

    With `full_path` set, every entry is joined with the directory;
    otherwise only the bare entry names are returned.
    """
    entries = os.listdir(full_img_dir)
    if full_path:
        entries = [os.path.join(full_img_dir, entry) for entry in entries]
    return sorted(entries)

In [None]:
# Reload the label table; the labels are left as read from the CSV here
# (the one-vs-all frames below compare them with 'N', 'T' and 'P')
labelsDF = pd.read_csv(labels_path)
display(labelsDF.head(20))

In [None]:
# Number of distinct file names in the label table
len(set(labelsDF.file)) # 1 acquisition per patient

In [None]:
# Stratified hold-out split: first carve out the test set, then take the
# validation set from the remainder (so validation_percentage is a fraction of
# train_val, not of the full dataset).
train_val, test = train_test_split(labelsDF, test_size = test_percentage, shuffle = True, stratify = labelsDF.label, random_state = SEED)
train, val = train_test_split(train_val, test_size = validation_percentage, shuffle = True, stratify = train_val.label, random_state = SEED)

In [None]:
def _one_vs_all(frame, positive_label):
  """Return a copy of `frame` whose label column is 1 for `positive_label` and 0 otherwise."""
  return frame.assign(label = frame.label.apply(lambda x: encode2(x, positive_label)))


# N vs rest
train_ovaN = _one_vs_all(train, 'N')
val_ovaN = _one_vs_all(val, 'N')
test_ovaN = _one_vs_all(test, 'N')

# T vs rest
train_ovaT = _one_vs_all(train, 'T')
val_ovaT = _one_vs_all(val, 'T')
test_ovaT = _one_vs_all(test, 'T')

# P vs rest
train_ovaP = _one_vs_all(train, 'P')
val_ovaP = _one_vs_all(val, 'P')
test_ovaP = _one_vs_all(test, 'P')

In [None]:
# Check the binary labels of the T-vs-rest test frame
display(test_ovaT.head(20))

In [None]:
# Number of training samples per class, in the sorted order of the class values
unique, counts = np.unique(train.label, return_counts=True)
print(counts)

In [None]:
# Batch size for the cells below; replaces the value of 16 set in the env-setup cell
batch_size = 64

In [None]:
# Settings shared by every one-vs-all generator: lazy loading, image cleaning
# enabled, integer (binary) labels
ova_gen_kwargs = dict(
    base_path = 'data/train_all_no_duplicates',
    batch_size = batch_size,
    out_shape = img_size,
    flow_from_directory = True,
    preprocess_input = True,
    categorical = False,
)

# T vs rest (augmentation explicitly off)
train_gen_ovaT = CustomGenerator(dataframe = train_ovaT, shuffle = True, augment = False, **ova_gen_kwargs)
valid_gen_ovaT = CustomGenerator(dataframe = val_ovaT, shuffle = True, augment = False, **ova_gen_kwargs)
test_gen_ovaT = CustomGenerator(dataframe = test_ovaT, shuffle = False, augment = False, **ova_gen_kwargs)

# N vs rest
train_gen_ovaN = CustomGenerator(dataframe = train_ovaN, shuffle = True, **ova_gen_kwargs)
valid_gen_ovaN = CustomGenerator(dataframe = val_ovaN, shuffle = True, **ova_gen_kwargs)
test_gen_ovaN = CustomGenerator(dataframe = test_ovaN, shuffle = False, **ova_gen_kwargs)

# P vs rest
train_gen_ovaP = CustomGenerator(dataframe = train_ovaP, shuffle = True, **ova_gen_kwargs)
valid_gen_ovaP = CustomGenerator(dataframe = val_ovaP, shuffle = True, **ova_gen_kwargs)
test_gen_ovaP = CustomGenerator(dataframe = test_ovaP, shuffle = False, **ova_gen_kwargs)

In [None]:
# Preview the first (cleaned) training batch on a 4x4 grid
iterator = iter(train_gen_ovaT)
images, labels = next(iterator)
fig, axis = plt.subplots(4, 4, figsize = (20, 20))

axis = axis.flatten()

# Hide the axes of every cell, including the ones that stay empty
for ax in axis:
  ax.set_axis_off()

# zip stops at the shorter input, so a batch with fewer than 16 images no
# longer raises an IndexError
for ax, img in zip(axis, images):
  ax.imshow(img.squeeze(), cmap='gray')

plt.show()

In [None]:
# Preview the first (cleaned) test batch on a 4x4 grid
iterator = iter(test_gen_ovaT)
images, labels = next(iterator)
fig, axis = plt.subplots(4, 4, figsize = (20, 20))

axis = axis.flatten()

# Hide the axes of every cell, including the ones that stay empty
for ax in axis:
  ax.set_axis_off()

# zip stops at the shorter input, so a batch with fewer than 16 images no
# longer raises an IndexError
for ax, img in zip(axis, images):
  ax.imshow(img.squeeze(), cmap='gray')

plt.show()

## First binary classifier: N and focal loss

In [None]:
# ImageNet-pretrained EfficientNetV2-B3 backbone without its classification head
supernet2 = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

# Rename the backbone by writing the private `_name` attribute directly
supernet2._name = "effnet2"



print(len(supernet2.layers))

# Freeze the first 79 layers and fine-tune everything from layer 80 onwards
for layer_number, layer in enumerate(supernet2.layers, start=1):
    layer.trainable = layer_number >= 80

In [None]:

from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Binary head: backbone features -> global average pooling ->
# Dense(256, ReLU) -> one sigmoid unit.
inputs = tfk.Input((224,224,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet2(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp3 = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# NOTE(review): the positional True binds to the first constructor parameter of
# BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
tl_model_exp3.compile(
    loss=tfk.losses.BinaryFocalCrossentropy(True),
    optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
    metrics='accuracy',
)

tl_model_exp3.summary()

In [None]:
# Cyclic-LR step size: six times the number of batches per epoch.
# NOTE(review): presumably step_size is the half-cycle length in iterations - confirm against CyclicLR.
batches_per_epoch = len(train_gen_ovaN)
training_samples = int(batches_per_epoch*batch_size)
step_size = (6*training_samples) // batch_size

clr = CyclicLR(mode='triangular', base_lr=1e-5, max_lr=1e-4, step_size=step_size)

# Stop once validation accuracy has not improved for 10 epochs and roll back to the best weights.
early_stopping = tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True)

history = tl_model_exp3.fit(
    train_gen_ovaN,
    validation_data = valid_gen_ovaN,
    epochs = 50,
    callbacks = [early_stopping, clr],
    workers=8,
    use_multiprocessing=True,
).history

### Training plots

In [None]:
# Plot training vs. validation curves, one figure per metric.
# The loss figure is titled after the loss the model was compiled with
# (binary focal crossentropy) rather than "Category Crossentropy".
for metric, title in (('loss', 'Binary Focal Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    plt.plot(history[metric], label='Training', alpha=.8, color='#ff7f0e')
    plt.plot(history['val_' + metric], label='Validation', alpha=.8, color='#4D61E2')
    plt.legend(loc='upper left')
    plt.title(title)
    plt.grid(alpha=.3)

plt.show()

### Model testing

In [None]:
# Score the N-vs-rest test generator with the fine-tuned model
# (8 worker processes feed the batches).
predictions = tl_model_exp3.predict(test_gen_ovaN, use_multiprocessing = True, workers = 8)

In [None]:
# Inspect the raw outputs of the model's sigmoid unit.
print(predictions)

In [None]:
# Binarise the sigmoid outputs at 0.5.
pred = (predictions > 0.5).astype(int)
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test_ovaN.label)[:len(pred)]
target_names = ['O', 'N']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix; it is 2x2, so the ticks are the two class names
# (cm is transposed so that columns are the true labels).
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

In [None]:
tl_model_exp3.save("model_exp3_N2.h5") # "N2" = the N-vs-rest classifier trained with focal loss

## Second binary classifier: T and focal loss

In [None]:
# Fresh ImageNet-pretrained EfficientNetV2-B3 backbone for the T-vs-rest model.
supernet2 = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

supernet2._name = "effnet2"

print(len(supernet2.layers))

# Layers 1..79 stay frozen; layer 80 and everything after it is fine-tuned.
for position, layer in enumerate(supernet2.layers, start=1):
    layer.trainable = position >= 80

In [None]:

from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Binary head: backbone features -> global average pooling ->
# Dense(256, ReLU) -> one sigmoid unit.
inputs = tfk.Input((224,224,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet2(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp3 = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# Adam is left at its default learning rate here.
# NOTE(review): the positional True binds to the first constructor parameter of
# BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
tl_model_exp3.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(True),
    optimizer=tfk.optimizers.Adam(),
    metrics='accuracy',
)

tl_model_exp3.summary()

In [None]:
# Initialise the freshly built model from the saved N-vs-rest weights before training on T.
tl_model_exp3.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_N2.h5")

In [None]:
# Cyclic-LR step size: six times the number of batches per epoch.
# NOTE(review): presumably step_size is the half-cycle length in iterations - confirm against CyclicLR.
batches_per_epoch = len(train_gen_ovaT)
training_samples = int(batches_per_epoch*batch_size)
step_size = (6*training_samples) // batch_size

clr = CyclicLR(mode='triangular', base_lr=1e-5, max_lr=1e-4, step_size=step_size)

# Stop once validation accuracy has not improved for 10 epochs and roll back to the best weights.
early_stopping = tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True)

history = tl_model_exp3.fit(
    train_gen_ovaT,
    validation_data = valid_gen_ovaT,
    epochs = 50,
    callbacks = [early_stopping, clr],
    workers=8,
    use_multiprocessing=True,
).history

### Training plots

In [None]:
# Plot training vs. validation curves, one figure per metric.
# The loss figure is titled after the loss the model was compiled with
# (binary focal crossentropy) rather than "Category Crossentropy".
for metric, title in (('loss', 'Binary Focal Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    plt.plot(history[metric], label='Training', alpha=.8, color='#ff7f0e')
    plt.plot(history['val_' + metric], label='Validation', alpha=.8, color='#4D61E2')
    plt.legend(loc='upper left')
    plt.title(title)
    plt.grid(alpha=.3)

plt.show()

### Model testing

In [None]:
# Score the T-vs-rest test generator with the fine-tuned model.
predictions = tl_model_exp3.predict(test_gen_ovaT, workers=8, use_multiprocessing=True)

In [None]:
# Inspect the raw outputs of the model's sigmoid unit.
print(predictions)

In [None]:
# Binarise the sigmoid outputs at 0.5.
pred = (predictions > 0.5).astype(int)
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test_ovaT.label)[:len(pred)]
target_names = ['O', 'T']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix; it is 2x2, so the ticks are the two class names
# (cm is transposed so that columns are the true labels).
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

In [None]:
# Persist the T-vs-rest classifier (architecture + weights) in HDF5 format.
tl_model_exp3.save("model_exp3_T2.h5")

## Third binary classifier: P and focal loss

In [None]:
# Fresh ImageNet-pretrained EfficientNetV2-B3 backbone for the P-vs-rest model.
supernet2 = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

supernet2._name = "effnet2"

print(len(supernet2.layers))

# Layers 1..79 stay frozen; layer 80 and everything after it is fine-tuned.
for position, layer in enumerate(supernet2.layers, start=1):
    layer.trainable = position >= 80

In [None]:
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Binary head: backbone features -> global average pooling ->
# Dense(256, ReLU) -> one sigmoid unit.
inputs = tfk.Input((224,224,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet2(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp3 = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# NOTE(review): the positional True binds to the first constructor parameter of
# BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
tl_model_exp3.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(True),
    optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
    metrics='accuracy',
)

tl_model_exp3.summary()

In [None]:
# Initialise the freshly built model from the saved N-vs-rest weights before training on P.
tl_model_exp3.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_N2.h5")

In [None]:
# Cyclic-LR step size: six times the number of batches per epoch. It is sized
# from the P-vs-rest generator that is actually trained on below (this
# previously read the N-vs-rest generator).
# NOTE(review): presumably step_size is the half-cycle length in iterations - confirm against CyclicLR.
training_samples = int(len(train_gen_ovaP)*batch_size)
step_size = 6*training_samples // batch_size

clr = CyclicLR(
    mode='triangular',
    base_lr=1e-5,
    max_lr=1e-4,
    step_size= step_size
    )

# Stop once validation accuracy has not improved for 10 epochs and roll back to the best weights.
early_stopping = tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True)

history = tl_model_exp3.fit(
    train_gen_ovaP,
    workers=8,
    use_multiprocessing=True,
    epochs = 50,
    validation_data = valid_gen_ovaP,
    callbacks = [early_stopping, clr],
).history

### Training plots

In [None]:
# Plot training vs. validation curves, one figure per metric.
# The loss figure is titled after the loss the model was compiled with
# (binary focal crossentropy) rather than "Category Crossentropy".
for metric, title in (('loss', 'Binary Focal Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    plt.plot(history[metric], label='Training', alpha=.8, color='#ff7f0e')
    plt.plot(history['val_' + metric], label='Validation', alpha=.8, color='#4D61E2')
    plt.legend(loc='upper left')
    plt.title(title)
    plt.grid(alpha=.3)

plt.show()

### Model testing

In [None]:
# Score the P-vs-rest test generator with the fine-tuned model.
predictions = tl_model_exp3.predict(
    test_gen_ovaP,
    workers=8,
    use_multiprocessing=True,
)

In [None]:
# Inspect the raw outputs of the model's sigmoid unit.
print(predictions)

In [None]:
# Binarise the sigmoid outputs at 0.5.
pred = (predictions > 0.5).astype(int)
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test_ovaP.label)[:len(pred)]
target_names = ['O', '  P']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix; it is 2x2, so the ticks are the two class names
# (cm is transposed so that columns are the true labels).
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

In [None]:
# Persist the P-vs-rest classifier (architecture + weights) in HDF5 format.
tl_model_exp3.save("model_exp3_P2.h5")

# EffNet OvA Ensemble inference


In [None]:
# Backbone for the inference-only ensemble model; its weights are replaced by
# load_weights() further down.
supernet = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

print(len(supernet.layers))

# Layers 1..79 are marked frozen; layer 80 and everything after it trainable.
for position, layer in enumerate(supernet.layers, start=1):
    layer.trainable = position >= 80

In [None]:

from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Same architecture as the three one-vs-all classifiers, so that each of their
# saved weight files can be loaded into this single model.
inputs = tfk.Input((224,224,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# Compiled with plain binary crossentropy.
tl_model_exp.compile(
    loss=tfk.losses.BinaryCrossentropy(),
    optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
    metrics='accuracy',
)

tl_model_exp.summary()

In [None]:
def encode(x):
  """Map a class label to its integer id: 'N' -> 0, 'P' -> 1, anything else -> 2."""
  return 0 if x == 'N' else (1 if x == 'P' else 2)

In [None]:
def folderToPaths(
        full_img_dir,
        full_path = True
):
    """List the entries of a directory in sorted order.

    full_img_dir: directory to list.
    full_path: when truthy each entry is joined with ``full_img_dir``,
        otherwise the bare entry names are returned.

    Returns a new sorted list of strings.
    """
    entries = os.listdir(full_img_dir)
    if full_path:
        entries = [os.path.join(full_img_dir, entry) for entry in entries]
    return sorted(entries)

In [None]:
# Load the label table from labels_path and preview its first 20 rows.
labelsDF = pd.read_csv(labels_path)
display(labelsDF.head(20))

In [None]:
# Replace the label column with integer class ids.
# NOTE: encode() returns 2 for any value other than 'N'/'P', so re-running this
# cell on already-encoded labels turns every label into 2.
labelsDF.label = labelsDF.label.apply(encode)
display(labelsDF.head(20))

In [None]:
# Number of distinct file names in the label table.
len(set(labelsDF.file)) # 1 acquisition per patient

In [None]:
# Stratified hold-out split: first carve out the test set, then split the
# remainder into training and validation sets (same seed for both).
train_val, test = train_test_split(
    labelsDF,
    test_size = test_percentage,
    shuffle = True,
    stratify = labelsDF.label,
    random_state = SEED,
)
train, val = train_test_split(
    train_val,
    test_size = validation_percentage,
    shuffle = True,
    stratify = train_val.label,
    random_state = SEED,
)

In [None]:
# Override the notebook-wide batch size for the cells that follow.
batch_size = 64

In [None]:
# Arguments shared by the three 3-class generators.
_gen_kwargs = dict(base_path = 'data/train_all_no_duplicates', batch_size = batch_size, out_shape = img_size, flow_from_directory=True, preprocess_input = True, categorical = False)

train_gen = CustomGenerator(dataframe = train, shuffle = True, **_gen_kwargs)
valid_gen = CustomGenerator(dataframe = val, shuffle = True, **_gen_kwargs)
# The test generator must NOT shuffle: its predictions are compared with
# `test.label` in dataframe order, and the three one-vs-all prediction passes
# have to see the samples in the same order to be combined per sample.
test_gen = CustomGenerator(dataframe = test, shuffle = False, **_gen_kwargs)

In [None]:
# Load the P-vs-rest weights into the shared model and score the test generator.
tl_model_exp.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_P2.h5")
predictionsP = tl_model_exp.predict(test_gen, workers=8, use_multiprocessing=True)

In [None]:
# Load the N-vs-rest weights into the shared model and score the test generator.
tl_model_exp.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_N2.h5")
predictionsN = tl_model_exp.predict(test_gen, workers=8, use_multiprocessing=True)

In [None]:
# Load the T-vs-rest weights into the shared model and score the test generator.
tl_model_exp.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_T2.h5")
predictionsT = tl_model_exp.predict(test_gen, workers=8, use_multiprocessing=True)

In [None]:
# Stack the three score arrays in N, P, T order - the same order as the ids
# produced by encode() in this section (N=0, P=1, T=2) - and drop size-1 axes.
pred = np.squeeze(np.array([predictionsN, predictionsP, predictionsT]))

In [None]:
# For each sample pick the index of the classifier with the highest score.
pred = np.argmax(pred, axis = 0)

In [None]:
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test.label)[:len(pred)]
target_names = ['N', 'P', 'T']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix (transposed so that columns are the true labels)
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=[0,1,2], yticklabels=[0,1,2])
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

# EffNet Experiment: train with data preprocessing + 1vsAll (KFOLD)



## Common steps

In [None]:
def encode(x):
  """Map a class label to its integer id: 'T' -> 0, 'P' -> 1, anything else -> 2."""
  return 0 if x == 'T' else (1 if x == 'P' else 2)

def encode2(x, target_label):
  """One-vs-all encoding: 1 when ``x`` equals ``target_label``, otherwise 0."""
  return 1 if x == target_label else 0

In [None]:
def folderToPaths(
        full_img_dir,
        full_path = True
):
    """List the entries of a directory in sorted order.

    full_img_dir: directory to list.
    full_path: when truthy each entry is joined with ``full_img_dir``,
        otherwise the bare entry names are returned.

    Returns a new sorted list of strings.
    """
    entries = os.listdir(full_img_dir)
    if full_path:
        entries = [os.path.join(full_img_dir, entry) for entry in entries]
    return sorted(entries)

In [None]:
# Reload the label table from labels_path (labels are the raw values from the
# CSV again) and preview its first 20 rows.
labelsDF = pd.read_csv(labels_path)
display(labelsDF.head(20))

In [None]:
# Number of distinct file names in the label table.
len(set(labelsDF.file)) # 1 acquisition per patient

In [None]:
# Stratified hold-out split; the train_val part is later divided into folds.
train_val, test = train_test_split(labelsDF, test_size = test_percentage, shuffle = True, stratify = labelsDF.label, random_state = SEED)

In [None]:
# Batch size used by the generators built in the following cells.
batch_size = 64

In [None]:
# Arguments shared by every one-vs-all generator.
_ova_gen_kwargs = dict(base_path = 'data/train_all_no_duplicates', batch_size = batch_size, out_shape = img_size, flow_from_directory=True, preprocess_input = True, categorical = False)

# T-vs-rest generators (augment=False is passed explicitly for these three only).
train_gen_ovaT = CustomGenerator(dataframe = train_ovaT, shuffle = True, augment = False, **_ova_gen_kwargs)
valid_gen_ovaT = CustomGenerator(dataframe = val_ovaT, shuffle = True, augment = False, **_ova_gen_kwargs)
test_gen_ovaT = CustomGenerator(dataframe = test_ovaT, shuffle = False, augment = False, **_ova_gen_kwargs)

# N-vs-rest generators.
train_gen_ovaN = CustomGenerator(dataframe = train_ovaN, shuffle = True, **_ova_gen_kwargs)
valid_gen_ovaN = CustomGenerator(dataframe = val_ovaN, shuffle = True, **_ova_gen_kwargs)
test_gen_ovaN = CustomGenerator(dataframe = test_ovaN, shuffle = False, **_ova_gen_kwargs)

# P-vs-rest generators.
train_gen_ovaP = CustomGenerator(dataframe = train_ovaP, shuffle = True, **_ova_gen_kwargs)
valid_gen_ovaP = CustomGenerator(dataframe = val_ovaP, shuffle = True, **_ova_gen_kwargs)
test_gen_ovaP = CustomGenerator(dataframe = test_ovaP, shuffle = False, **_ova_gen_kwargs)

## First binary classifier: N and focal loss

In [None]:
def get_supernet():
  """Build a new ImageNet-pretrained EfficientNetV2-B3 backbone (no top).

  The first 79 layers are frozen; layer 80 and all later layers are trainable.
  """
  backbone = tf.keras.applications.efficientnet_v2.EfficientNetV2B3(
      include_top=False,
      weights="imagenet",
      input_shape=(224,224,3),
  )

  for position, layer in enumerate(backbone.layers, start=1):
      layer.trainable = position >= 80

  return backbone

In [None]:

from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

def build_model():
  """Build and compile a new binary classifier on top of get_supernet().

  Architecture: backbone -> global average pooling -> Dense(256, ReLU) ->
  one sigmoid unit. Compiled with binary focal crossentropy, Adam (lr 1e-4)
  and the accuracy metric.
  """
  inputs = tfk.Input((224,224,3))

  features = tfkl.GlobalAveragePooling2D()(get_supernet()(inputs))
  features = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(features)
  outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(features)

  tl_model = tfk.Model(inputs=inputs, outputs=outputs, name='model')

  # NOTE(review): the positional True binds to the first constructor parameter of
  # BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
  tl_model.compile(
      loss=tfk.losses.BinaryFocalCrossentropy(True),
      optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
      metrics='accuracy',
  )

  return tl_model

In [None]:
# N-vs-rest copies of the two splits: label is 1 for 'N' and 0 otherwise.
train_val_ovaN = train_val.assign(label = train_val.label.apply(lambda x: encode2(x, 'N')))
test_ovaN = test.assign(label = test.label.apply(lambda x: encode2(x, 'N')))

# Unshuffled, non-augmented generator over the N-vs-rest test split.
test_gen_ovaN = CustomGenerator(
    dataframe = test_ovaN,
    base_path = 'data/train_all_no_duplicates',
    batch_size = batch_size,
    out_shape = img_size,
    shuffle = False,
    flow_from_directory=True,
    preprocess_input = True,
    categorical = False,
    augment = False,
)

In [None]:
# Per-fold learning curves: one list of per-epoch values for each trained fold.
acc_per_fold = []
loss_per_fold = []
val_acc_per_fold = []
val_loss_per_fold = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state = SEED)

for i, (train_index, valid_index) in enumerate(skf.split(train_val_ovaN.file.to_numpy(), train_val_ovaN.label.to_numpy())):
  # NOTE(review): folds 0-2 are skipped, presumably to resume an interrupted
  # run - confirm; the model-loading cell further down expects all five files.
  if i < 3:
    continue
  print(i)

  # skf.split yields POSITIONAL indices, so rows are selected with iloc.
  # (DataFrame.filter(items=...) matches index labels, and train_val_ovaN keeps
  # the original, non-contiguous labels of labelsDF.)
  trainDF = train_val_ovaN.iloc[train_index]
  validDF = train_val_ovaN.iloc[valid_index]

  train_gen = CustomGenerator(dataframe = trainDF, base_path = 'data/train_all_no_duplicates', batch_size = batch_size, out_shape = img_size, shuffle = True, flow_from_directory=True, preprocess_input = True, categorical = False, augment = False)
  valid_gen = CustomGenerator(dataframe = validDF, base_path = 'data/train_all_no_duplicates', batch_size = batch_size, out_shape = img_size, shuffle = True, flow_from_directory=True, preprocess_input = True, categorical = False, augment = False)

  # Cyclic-LR step size: six times the number of batches per epoch.
  training_samples = int(len(train_gen)*batch_size)
  step_size = 6*training_samples // batch_size

  clr = CyclicLR(
      mode='triangular',
      base_lr=1e-5,
      max_lr=1e-4,
      step_size= step_size
      )

  # A fresh model (new backbone and head) for every fold.
  model = build_model()

  epochs = 100
  # Train
  history = model.fit(train_gen,
                          epochs=epochs,  validation_data = valid_gen,
                          callbacks = [tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True), clr], workers =8 , use_multiprocessing = True
                          )

  # Accuracies are stored as percentages. The values are scaled element-wise:
  # `list * 100` would repeat the list 100 times instead.
  val_acc_per_fold.append([value * 100 for value in history.history["val_accuracy"]])
  val_loss_per_fold.append(history.history["val_loss"])
  acc_per_fold.append([value * 100 for value in history.history["accuracy"]])
  loss_per_fold.append(history.history["loss"])

  # Plot the training
  plt.figure(figsize=(20,5))
  plt.plot(history.history['loss'], label='Training', alpha=.8, color='#ff7f0e')
  plt.plot(history.history['val_loss'], label='Validation', alpha=.8, color='#4D61E2')
  plt.legend(loc='upper left')
  plt.title('Binary Focal Crossentropy')
  plt.grid(alpha=.3)

  plt.figure(figsize=(20,5))
  plt.plot(history.history['accuracy'], label='Training', alpha=.8, color='#ff7f0e')
  plt.plot(history.history['val_accuracy'], label='Validation', alpha=.8, color='#4D61E2')
  plt.legend(loc='upper left')
  plt.title('Accuracy')
  plt.grid(alpha=.3)

  plt.show()

  #prediction on test set
  predictions = model.predict(test_gen_ovaN)

  # Binarise at 0.5 and align the ground truth with the predictions by length
  # instead of a hard-coded 2368 (presumably the generator yields fewer samples
  # than the dataframe - confirm).
  pred = (predictions > 0.5).astype(int)
  y = list(test_ovaN.label)[:len(pred)]
  target_names = ['O', 'N']
  cm = confusion_matrix(y, pred, normalize="true")


  # Compute the classification metrics
  accuracy = accuracy_score(y, pred)
  precision = precision_score(y, pred, average='macro')
  recall = recall_score(y, pred, average='macro')
  f1 = f1_score(y, pred, average='macro')
  print('Accuracy:',accuracy.round(4))
  print('Precision:',precision.round(4))
  print('Recall:',recall.round(4))
  print('F1:',f1.round(4))
  print(classification_report(y, pred, target_names=target_names, digits=4))
  # Plot the confusion matrix; it is 2x2, so the ticks are the two class names
  plt.figure(figsize=(10,10))
  sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
  plt.xlabel('True labels')
  plt.ylabel('Predicted labels')
  plt.show()

  # Model saving (files are numbered from 1: cnn_N_kFold_<fold>.h5)
  model_directory = './'
  filename = 'cnn_N_kFold_' + str(i+1)
  filename_chosen = os.path.join(model_directory, filename)
  model.save(filename_chosen + '.h5')

### Load trained models

In [None]:
from keras.models import load_model

# Reload the per-fold N-vs-rest models cnn_N_kFold_1.h5 ... cnn_N_kFold_5.h5
# and recompile each with the training configuration.
total_model = 5
model_directory = './'
trained_models = []
for fold_number in range(1, total_model + 1):
  filename = 'cnn_N_kFold_' + str(fold_number)
  filename_chosen = os.path.join(model_directory, filename)
  tl_model = load_model(filename_chosen + '.h5')
  tl_model.compile(
      loss=tfk.losses.BinaryFocalCrossentropy(True),
      optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
      metrics='accuracy',
  )
  trained_models.append(tl_model)

### Evaluate trained models (for storing statistics)

In [None]:
# Evaluate every per-fold N-vs-rest model on the N-vs-rest test split and store
# the statistics (JSON) and confusion-matrix heatmaps (PDF) for the report.
validation_stats = {}
plt.get_current_fig_manager().full_screen_toggle() # toggle fullscreen mode

target_names = ['O', 'N']

for idx, fold_model in enumerate(trained_models):
  # The models are binary (one sigmoid unit), so they are scored on the binary
  # test generator and thresholded at 0.5; argmax over the single output column
  # would always return class 0.
  predictions = fold_model.predict(test_gen_ovaN)
  pred = (predictions > 0.5).astype(int).ravel()
  # Align the ground truth with the predictions by length (presumably the
  # generator yields fewer samples than the dataframe - confirm).
  y = list(test_ovaN.label)[:len(pred)]
  cm = confusion_matrix(y, pred, normalize="true")


  # Compute the classification metrics
  accuracy = accuracy_score(y, pred)
  precision = precision_score(y, pred, average='macro')
  recall = recall_score(y, pred, average='macro')
  f1 = f1_score(y, pred, average='macro')


  validation_stats['fold_' + str(idx+1) ] = {
      'accuracy' : accuracy,
      'precision' :precision,
      'recall' :recall,
      'f1' :f1,
      'classification_report' :classification_report(y, pred, target_names=target_names, digits=4, output_dict = True)}


  print('Accuracy:',accuracy.round(4))
  print('Precision:',precision.round(4))
  print('Recall:',recall.round(4))
  print('F1:',f1.round(4))
  print(classification_report(y, pred, target_names=target_names, digits=4))
  # Plot the confusion matrix; it is 2x2, so the ticks are the two class names
  plt.figure(figsize=(10,10))
  hm = sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
  plt.xlabel('True labels')
  plt.ylabel('Predicted labels')

  # Save before plt.show(), so the file is written while the figure is still open.
  hm.get_figure().savefig("alice/data_for_report/heatmap_fold_"+str(idx+1)+".pdf")
  plt.show()

with open('alice/data_for_report/test_stats.json' , 'w') as fp:
    json.dump(validation_stats, fp)



## Second binary classifier: T and focal loss

In [None]:
# ImageNet-pretrained EfficientNetV2-B3 backbone built for 480x480 RGB input.
# NOTE(review): the other backbones in this notebook take 224x224 input -
# confirm that the generators used with this model yield 480x480 images.
supernet2 = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(480,480,3)
)

supernet2._name = "effnet2"

print(len(supernet2.layers))

# Layers 1..79 stay frozen; layer 80 and everything after it is fine-tuned.
for position, layer in enumerate(supernet2.layers, start=1):
    layer.trainable = position >= 80

In [None]:
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Binary head on the 480x480 backbone: global average pooling ->
# Dense(256, ReLU) -> one sigmoid unit.
inputs = tfk.Input((480,480,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet2(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp3 = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# Optimised with the experimental AdamW at its default settings.
# NOTE(review): the positional True binds to the first constructor parameter of
# BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
tl_model_exp3.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(True),
    optimizer=tfk.optimizers.experimental.AdamW(),
    metrics='accuracy',
)

tl_model_exp3.summary()

In [None]:
# Initialise the freshly built model from the saved N-vs-rest weights before training on T.
tl_model_exp3.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_N2.h5")

In [None]:
# Cyclic-LR step size: six times the number of batches per epoch.
# NOTE(review): presumably step_size is the half-cycle length in iterations - confirm against CyclicLR.
batches_per_epoch = len(train_gen_ovaT)
training_samples = int(batches_per_epoch*batch_size)
step_size = (6*training_samples) // batch_size

clr = CyclicLR(mode='triangular', base_lr=1e-5, max_lr=1e-4, step_size=step_size)

# Stop once validation accuracy has not improved for 10 epochs and roll back to the best weights.
early_stopping = tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=10, restore_best_weights=True)

history = tl_model_exp3.fit(
    train_gen_ovaT,
    validation_data = valid_gen_ovaT,
    epochs = 35,
    callbacks = [early_stopping, clr],
    workers=8,
    use_multiprocessing=True,
).history


### Training plots

In [None]:
# Plot training vs. validation curves, one figure per metric.
# The loss figure is titled after the loss the model was compiled with
# (binary focal crossentropy) rather than "Category Crossentropy".
for metric, title in (('loss', 'Binary Focal Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    plt.plot(history[metric], label='Training', alpha=.8, color='#ff7f0e')
    plt.plot(history['val_' + metric], label='Validation', alpha=.8, color='#4D61E2')
    plt.legend(loc='upper left')
    plt.title(title)
    plt.grid(alpha=.3)

plt.show()

### Model testing

In [None]:
# Score the T-vs-rest test generator with the fine-tuned model.
predictions = tl_model_exp3.predict(test_gen_ovaT, workers=8, use_multiprocessing=True)

In [None]:
# Inspect the raw outputs of the model's sigmoid unit.
print(predictions)

In [None]:
# Binarise the sigmoid outputs at 0.5.
pred = (predictions > 0.5).astype(int)
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test_ovaT.label)[:len(pred)]
target_names = ['O', 'T']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix; it is 2x2, so the ticks are the two class names
# (cm is transposed so that columns are the true labels).
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

In [None]:
# Persist the T-vs-rest classifier (architecture + weights) in HDF5 format.
tl_model_exp3.save("model_exp3_T2.h5")

## Third binary classifier: P and focal loss

In [None]:
# Fresh ImageNet-pretrained EfficientNetV2-B3 backbone for the P-vs-rest model.
supernet2 = tf.keras.applications.EfficientNetV2B3(
    include_top=False,
    weights="imagenet",
    input_shape=(224,224,3)
)

supernet2._name = "effnet2"

print(len(supernet2.layers))

# Layers 1..79 stay frozen; layer 80 and everything after it is fine-tuned.
for position, layer in enumerate(supernet2.layers, start=1):
    layer.trainable = position >= 80

In [None]:
# supernet.trainable = False
from keras.layers import Dense,Flatten,GlobalAveragePooling2D, MaxPooling2D, BatchNormalization,Concatenate, Resizing
from keras import regularizers

# Binary head: backbone features -> global average pooling ->
# Dense(256, ReLU) -> one sigmoid unit.
inputs = tfk.Input((224,224,3))

y2 = tfkl.GlobalAveragePooling2D()(supernet2(inputs))
y2 = tfkl.Dense(256, activation='relu', kernel_initializer = tfk.initializers.HeUniform(SEED))(y2)
outputs = tfkl.Dense(1, activation='sigmoid', kernel_initializer = tfk.initializers.GlorotUniform(SEED))(y2)

# Wrap the graph into a Model
tl_model_exp3 = tfk.Model(inputs=inputs, outputs=outputs, name='model')

# NOTE(review): the positional True binds to the first constructor parameter of
# BinaryFocalCrossentropy - confirm which one that is for the installed TF version.
tl_model_exp3.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(True),
    optimizer=tfk.optimizers.Adam(learning_rate = 1e-4),
    metrics='accuracy',
)

tl_model_exp3.summary()

In [None]:
# Initialise the freshly built model from the saved N-vs-rest weights before training on P.
tl_model_exp3.load_weights("/gdrive/MyDrive/tuberculosis-pneumonia-classification/model_exp3_N2.h5")

In [None]:
# Cyclic-LR step size: six times the number of batches per epoch. It is sized
# from the P-vs-rest generator that is actually trained on below (this
# previously read the N-vs-rest generator).
# NOTE(review): presumably step_size is the half-cycle length in iterations - confirm against CyclicLR.
training_samples = int(len(train_gen_ovaP)*batch_size)
step_size = 6*training_samples // batch_size

clr = CyclicLR(
    mode='triangular',
    base_lr=1e-5,
    max_lr=1e-4,
    step_size= step_size
    )

# Stop once validation accuracy has not improved for 5 epochs and roll back to the best weights.
early_stopping = tfk.callbacks.EarlyStopping(monitor= 'val_accuracy', mode='max', patience=5, restore_best_weights=True)

history = tl_model_exp3.fit(
    train_gen_ovaP,
    workers=8,
    use_multiprocessing=True,
    epochs = 35,
    validation_data = valid_gen_ovaP,
    callbacks = [early_stopping, clr],
).history


### Training plots

In [None]:
# Plot training vs. validation curves, one figure per metric.
# The loss figure is titled after the loss the model was compiled with
# (binary focal crossentropy) rather than "Category Crossentropy".
for metric, title in (('loss', 'Binary Focal Crossentropy'), ('accuracy', 'Accuracy')):
    plt.figure(figsize=(20,5))
    plt.plot(history[metric], label='Training', alpha=.8, color='#ff7f0e')
    plt.plot(history['val_' + metric], label='Validation', alpha=.8, color='#4D61E2')
    plt.legend(loc='upper left')
    plt.title(title)
    plt.grid(alpha=.3)

plt.show()

### Model testing

In [None]:
# Score the P-vs-rest test generator with the fine-tuned model.
predictions = tl_model_exp3.predict(test_gen_ovaP, workers=8, use_multiprocessing=True)

In [None]:
# Inspect the raw outputs of the model's sigmoid unit.
print(predictions)

In [None]:
# Binarise the sigmoid outputs at 0.5.
pred = (predictions > 0.5).astype(int)
# Align the ground truth with the predictions by length instead of a hard-coded
# 2368 (presumably the generator yields fewer samples than the dataframe - confirm).
y = list(test_ovaP.label)[:len(pred)]
target_names = ['O', '  P']
cm = confusion_matrix(y, pred, normalize="true")

# Compute the classification metrics
accuracy = accuracy_score(y, pred)
precision = precision_score(y, pred, average='macro')
recall = recall_score(y, pred, average='macro')
f1 = f1_score(y, pred, average='macro')
print('Accuracy:',accuracy.round(4))
print('Precision:',precision.round(4))
print('Recall:',recall.round(4))
print('F1:',f1.round(4))
print(classification_report(y, pred, target_names=target_names, digits=4))
# Plot the confusion matrix; it is 2x2, so the ticks are the two class names
# (cm is transposed so that columns are the true labels).
plt.figure(figsize=(10,10))
sns.heatmap(cm.T, xticklabels=target_names, yticklabels=target_names)
plt.xlabel('True labels')
plt.ylabel('Predicted labels')
plt.show()

In [None]:
# Persist the P-vs-rest classifier (architecture + weights) in HDF5 format.
tl_model_exp3.save("model_exp3_P2.h5")