In [56]:
import sys

# NOTE(review): machine-specific absolute path; prefer launching the notebook from the
# intended environment's kernel so this workaround is not needed.
EXTRA_SITE_PACKAGES = "/opt/homebrew/Caskroom/miniforge/base/lib/python3.9/site-packages"

# Guarded so that re-running the cell does not keep appending duplicate entries.
if EXTRA_SITE_PACKAGES not in sys.path:
    sys.path.append(EXTRA_SITE_PACKAGES)

In [57]:
import pandas as pd
import data_constants
import os
import tensorflow as tf
import numpy as np

In [79]:
# Root of the dataset, taken from the project's constants module.
data_folder = data_constants.DATA_FOLDER

# Directory holding the image files referenced by the split listings.
images_folder = os.path.join(data_folder, '2A_images')

# One listing file per split, all following the same naming scheme.
train_file, test_file, val_file = (
    os.path.join(data_folder, f'{split}_COVIDx_CT-2A.txt')
    for split in ('train', 'test', 'val')
)

In [72]:
# Fail fast, before any loading starts, if the dataset is not on disk.
assert os.path.exists(data_folder), 'Data folder must be downloaded first'
assert os.path.exists(images_folder), 'Images was not downloaded properly'

# Name the missing listing in the failure so it is clear which file to fetch again.
for split_file in (train_file, test_file, val_file):
    assert os.path.exists(split_file), f'Split file not found: {split_file}'

In [80]:
# Column layout of the space-separated listing files (they carry no header row).
col_names = ["filename", "class", "xmin", "ymin", "xmax", "ymax"]


def _read_split(listing_path):
    """Read one split listing and turn its bare file names into paths under images_folder."""
    def _to_image_path(name):
        return os.path.join(images_folder, name)

    frame = pd.read_csv(listing_path, sep=' ', names=col_names)
    frame['filename'] = frame['filename'].apply(_to_image_path)
    return frame


train_ds = _read_split(train_file)
test_ds = _read_split(test_file)
val_ds = _read_split(val_file)

In [81]:
class DataGenerator(tf.keras.utils.Sequence):
    """
    Keras Sequence that serves batches of images together with their class label
    and bounding-box targets, built from consecutive rows of a dataframe.
    """

    def __init__(self, df, X_col, y_col,
                 batch_size,
                 input_size=(512, 512, 3),
                 shuffle=True):
        """
        df : Dataset dataframe (a copy is stored, the caller's frame is not modified)
        X_col : a dictionary whose 'path' key maps to the dataframe column holding image paths
        y_col : a dictionary whose 'class', 'xmin', 'ymin', 'xmax', 'ymax' keys map to the
                dataframe columns holding the targets
        batch_size : Number of dataframe rows served per batch (must be positive)
        input_size : Shape of the image.
                     NOTE(review): stored but not applied when loading; images are read at
                     their native size. Resizing here would also require rescaling the boxes.
        shuffle : When True, the rows are reshuffled at the end of every epoch

        Raises ValueError if batch_size is not positive.
        """
        # Safe on plain Sequence; newer Keras versions expect the base initialiser to run.
        super().__init__()
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')

        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        self.n = len(self.df)

    def __len__(self):
        """
        Number of full batches per epoch; a trailing partial batch is not served.
        """
        return self.n // self.batch_size

    def on_epoch_end(self):
        """
        Reshuffles the rows between epochs when shuffle is enabled, so that batches
        are composed differently from one epoch to the next.
        """
        if self.shuffle:
            # reset_index keeps the frame tidy; batches are sliced by position anyway.
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path):
        """
        A helper function which returns the image array from image path,
        divided by 255 (i.e. scaled to [0, 1] for 8-bit images).
        """
        image = tf.keras.preprocessing.image.load_img(path)
        image_arr = tf.keras.preprocessing.image.img_to_array(image)
        return image_arr / 255

    def __get_output(self, label, num_classes):
        """
        A helper function which converts numerical class labels to one hot encoded vectors.
        NOTE(review): not called by the batch pipeline; labels are returned as raw class ids.
        """
        return tf.keras.utils.to_categorical(label, num_classes=num_classes)

    def __get_data(self, batches):
        """
        A helper function which returns a tuple of X and y for a slice of the dataframe
        X -> Image array of shape (Batch_size, H, W, C), provided all images share one size
        y -> A tuple of (class labels column, dataframe of xmin/ymin/xmax/ymax columns)
        """
        path_batch = batches[self.X_col['path']]
        class_batch = batches[self.y_col['class']]
        bb_columns = [self.y_col[key] for key in ('xmin', 'ymin', 'xmax', 'ymax')]
        bb_batch = batches[bb_columns]

        X_batch = np.asarray([self.__get_input(x_path) for x_path in path_batch])
        return X_batch, (class_batch, bb_batch)

    def __getitem__(self, index):
        """
        Returns batch number `index` of X and y data
        X -> Image array of shape (Batch_size, H, W, C)
        y -> A tuple of class labels and bounding box coordinates
        """
        start = index * self.batch_size
        batches = self.df.iloc[start:start + self.batch_size]
        return self.__get_data(batches)

In [82]:
batch_size = 64

# Mapping from the generator's logical keys to the dataframe column names.
x_map = {'path': 'filename'}
y_map = {'class': 'class', 'xmin': 'xmin', 'ymin': 'ymin', 'xmax': 'xmax', 'ymax': 'ymax'}

train_gen = DataGenerator(train_ds,
                          X_col=x_map,
                          y_col=y_map,
                          batch_size=batch_size)

# Each generator must read its own split: validating or testing on the training
# frame would report scores for data the model has already seen.
# Evaluation splits are not shuffled so predictions can be matched back to rows.
val_gen = DataGenerator(val_ds,
                        X_col=x_map,
                        y_col=y_map,
                        batch_size=batch_size,
                        shuffle=False)

test_gen = DataGenerator(test_ds,
                         X_col=x_map,
                         y_col=y_map,
                         batch_size=batch_size,
                         shuffle=False)

Usage of the train data generator

In [83]:
# Pull a single batch from the training generator and report the shape of each part.
for X, y in train_gen:
    y_class, y_bbox = y
    for caption, batch_part in (('Image shape', X),
                                ('Label shape', y_class),
                                ('Bounding box shape', y_bbox)):
        print(caption, batch_part.shape)
    break  # one batch is enough for a sanity check

Image shape (64, 512, 512, 3)
Label shape (64,)
Bounding box shape (64, 4)


In [84]:
# Pull a single batch from the validation generator and report the shape of each part.
for X, y in val_gen:
    y_class, y_bbox = y
    for caption, batch_part in (('Image shape', X),
                                ('Label shape', y_class),
                                ('Bounding box shape', y_bbox)):
        print(caption, batch_part.shape)
    break  # one batch is enough for a sanity check

Image shape (64, 512, 512, 3)
Label shape (64,)
Bounding box shape (64, 4)
