In [3]:
# Import the required libraries

import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
import random
import itertools

import tensorflow as tf
from PIL import Image
from keras.preprocessing import image
from keras.utils import to_categorical


In [4]:
# Load every .png image in a directory, resize it to 224x224, and collect the results in a list
def load_images(directory):
    """Load all ``.png`` files found directly inside ``directory``.

    Files are visited in sorted filename order so that the returned list (and
    any seeded split derived from it) is identical on every run;
    ``os.listdir`` on its own makes no ordering guarantee.

    Files that ``cv2.imread`` cannot decode are skipped with a warning rather
    than being passed on to ``cv2.resize``, which would fail on them.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        list: One 224x224 resized image per readable ``.png`` file, in sorted
        filename order. Empty when the folder holds no ``.png`` files.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from
            ``os.listdir``).
    """
    import warnings  # local import so this cell does not depend on an edited import cell

    image_array = []

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".png"):
            continue

        filepath = os.path.join(directory, filename)

        # cv2.imread reports failure by returning None instead of raising
        img = cv2.imread(filepath)
        if img is None:
            warnings.warn("Skipping unreadable image: {}".format(filepath))
            continue

        image_array.append(cv2.resize(img, (224, 224)))

    return image_array


In [5]:
# Source folders of the three CXR classes; each folder's images are loaded through load_images
normal_dir = 'D:/Latihan_Python/Datasets/Split_Data/Normal'  # Normal CXR images
viral_dir = 'D:/Latihan_Python/Datasets/Split_Data/Viral Pneumonia'  # Viral Pneumonia CXR images
covid_dir = 'D:/Latihan_Python/Datasets/Split_Data/COVID'  # COVID-19 CXR images

# Load the classes in a fixed order: Normal, Viral Pneumonia, COVID-19
normal_dataset, viral_dataset, covid_dataset = (
    load_images(class_dir) for class_dir in (normal_dir, viral_dir, covid_dir)
)

In [6]:
# Report the total number of loaded image arrays followed by the count per class
print(
    "The Dataset which is made up of {} Image Arrays has: ".format(
        sum(len(class_images) for class_images in (normal_dataset, viral_dataset, covid_dataset))
    ),
    '{} Normal CXR Image Arrays'.format(len(normal_dataset)),
    '{} Viral Pneumonia CXR Image Arrays'.format(len(viral_dataset)),
    '{} COVID-19 CXR Image Arrays'.format(len(covid_dataset)),
    sep="\n",
)

The Dataset which is made up of 2952 Image Arrays has: 
988 Normal CXR Image Arrays
989 Viral Pneumonia CXR Image Arrays
975 COVID-19 CXR Image Arrays


In [7]:
# Function that rescales image arrays by 1/255 and returns them in a new list
def normalizer(image_arrays):
    """Divide every element of ``image_arrays`` by 255.0.

    Args:
        image_arrays: Iterable of image arrays (anything supporting ``/ 255.0``).

    Returns:
        list: The rescaled items, in the same order as the input. The input
        items themselves are not modified.
    """
    return [pixels / 255.0 for pixels in image_arrays]

In [8]:
# Normalize each class with normalizer and keep the results under new names
normal_dataset_normalized, viral_dataset_normalized, covid_dataset_normalized = (
    normalizer(class_images)
    for class_images in (normal_dataset, viral_dataset, covid_dataset)
)

In [9]:
#Splitting the dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

def split_and_merge_function(image_arrays, split_factor=(0.7, 0.15, 0.15), random_state=42):
    """Split each class into train/validation/test parts and merge the classes.

    Every entry of ``image_arrays`` is treated as one class whose integer
    label is its position in the list. Each class is split on its own, so all
    three resulting sets contain the classes in roughly the requested
    proportions. The merged sets are ordered class by class (not shuffled).

    Args:
        image_arrays: Sequence of per-class image collections.
        split_factor: ``(train, validation, test)`` fractions. The first value
            is handed to ``train_test_split`` as ``train_size``; the remaining
            samples are divided according to the ratio of the second and third
            values. Entries beyond the third are ignored.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        dict: ``'train_dataset'``, ``'validation_dataset'`` and
        ``'test_dataset'`` hold the merged images as NumPy arrays;
        ``'train_labels'``, ``'validation_labels'`` and ``'test_labels'`` hold
        the matching one-hot labels with one column per class.

    Raises:
        ValueError: If ``image_arrays`` is empty, ``split_factor`` has fewer
            than three entries, or the validation/test fraction is not
            positive.
    """
    if len(image_arrays) == 0:
        raise ValueError("image_arrays must contain at least one class")
    if len(split_factor) < 3:
        raise ValueError("split_factor needs three entries: (train, validation, test)")

    train_fraction, validation_fraction, test_fraction = split_factor[:3]
    if validation_fraction <= 0 or test_fraction <= 0:
        raise ValueError("validation and test fractions must both be positive")

    # Share of the non-training remainder that goes to the validation set
    validation_share = validation_fraction / (validation_fraction + test_fraction)

    # Fixed one-hot width so every split gets the same number of label columns
    num_classes = len(image_arrays)

    datasets = {}
    train_data, validation_data, test_data = [], [], []
    train_labels, validation_labels, test_labels = [], [], []

    for class_id, class_images in enumerate(image_arrays):
        class_labels = [class_id] * len(class_images)

        # First cut: training part versus everything held out
        train_part, holdout_part, train_part_labels, holdout_labels = train_test_split(
            class_images,
            class_labels,
            train_size=train_fraction,
            random_state=random_state,
        )
        # Second cut: divide the held-out part into validation and test
        validation_part, test_part, validation_part_labels, test_part_labels = train_test_split(
            holdout_part,
            holdout_labels,
            train_size=validation_share,
            random_state=random_state,
        )

        # Merge this class into the combined sets
        train_data.extend(train_part)
        train_labels.extend(train_part_labels)
        validation_data.extend(validation_part)
        validation_labels.extend(validation_part_labels)
        test_data.extend(test_part)
        test_labels.extend(test_part_labels)

    # Store the train, validation, and test datasets into the datasets dictionary
    datasets['train_dataset'] = np.array(train_data)
    datasets['validation_dataset'] = np.array(validation_data)
    datasets['test_dataset'] = np.array(test_data)
    # Convert labels from label-encoding to one-hot encoding and store in the datasets dictionary
    datasets['train_labels'] = to_categorical(np.array(train_labels), num_classes=num_classes)
    datasets['validation_labels'] = to_categorical(np.array(validation_labels), num_classes=num_classes)
    datasets['test_labels'] = to_categorical(np.array(test_labels), num_classes=num_classes)
    return datasets


In [10]:
# Split every class 70% / 15% / 15% into train / validation / test and merge the classes
image_arrays = [
    normal_dataset_normalized,  # list position becomes the class label: 0
    viral_dataset_normalized,   # 1
    covid_dataset_normalized,   # 2
]
datasets = split_and_merge_function(image_arrays, [0.7, 0.15, 0.15])

In [11]:
# Pull the merged image arrays and their labels out of the datasets dictionary
train_dataset, validation_dataset, test_dataset = (
    datasets[key] for key in ('train_dataset', 'validation_dataset', 'test_dataset')
)
train_labels, validation_labels, test_labels = (
    datasets[key] for key in ('train_labels', 'validation_labels', 'test_labels')
)

In [12]:
# Report the overall number of image arrays and the size of each split
print(
    "The Dataset which is made up of {} Image Arrays has been splitted into:".format(
        sum(len(split) for split in (train_dataset, validation_dataset, test_dataset))
    ),
    '{} Training Image Arrays'.format(len(train_dataset)),
    '{} Validation Image Arrays'.format(len(validation_dataset)),
    '{} Test Image Arrays'.format(len(test_dataset)),
    sep="\n",
)

The Dataset which is made up of 2952 Image Arrays has been splitted into:
2065 Training Image Arrays
442 Validation Image Arrays
445 Test Image Arrays


In [13]:
# Persist the split datasets and labels as .npy files

# Target folder for the saved arrays; created if it does not exist yet
save_dir = 'D:/Latihan_Python/Notebooks/Tugas Akhir/Splited_dataset'
os.makedirs(save_dir, exist_ok=True)

# Image arrays are written first, then their labels - one .npy file per array
for _file_name, _array in (
    ('train_dataset.npy', train_dataset),
    ('validation_dataset.npy', validation_dataset),
    ('test_dataset.npy', test_dataset),
    ('train_labels.npy', train_labels),
    ('validation_labels.npy', validation_labels),
    ('test_labels.npy', test_labels),
):
    np.save(os.path.join(save_dir, _file_name), _array)
