### Check whether the four image classes are present in the specified directories

In [None]:
import os
import shutil

# Define paths for training and testing datasets
source_train_dir = "BrainTumor_MRI/Training"
source_test_dir = "BrainTumor_MRI/Testing"

# Define class names
classes = ["glioma", "meningioma", "notumor", "pituitary"]


def check_class_folders(train_root, test_root, class_name):
    """Return the status messages for one class's training/testing folders.

    Args:
        train_root: Directory holding one sub-folder per class of training images.
        test_root: Directory holding one sub-folder per class of testing images.
        class_name: Name of the class sub-folder to look for under both roots.

    Returns:
        A list of messages: one "does not exist" message per missing folder,
        or, when both folders are present, a single message reporting how many
        directory entries each of them contains.
    """
    class_train_folder = os.path.join(train_root, class_name)
    class_test_folder = os.path.join(test_root, class_name)

    messages = []
    # isdir (rather than exists) so that a stray file with the class name is
    # reported as a missing folder instead of crashing os.listdir below.
    if not os.path.isdir(class_train_folder):
        messages.append(f"Source training folder for {class_name} does not exist!")
    if not os.path.isdir(class_test_folder):
        messages.append(f"Source testing folder for {class_name} does not exist!")

    # Count entries only when BOTH folders are present. Listing a folder that
    # was just reported missing would raise FileNotFoundError.
    if not messages:
        messages.append(f"Source folders exist and contain {len(os.listdir(class_train_folder))} training images and {len(os.listdir(class_test_folder))} testing images for {class_name}.")
    return messages


# Check if images are in the correct source directory
for class_name in classes:
    for message in check_class_folders(source_train_dir, source_test_dir, class_name):
        print(message)


### Split the IID dataset into a non-IID format to simulate a real-world federated learning (FL) environment

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

import random  # stdlib; imported here so this cell's seeded shuffle is self-contained

# Paths to the dataset
train_dir = 'BrainTumor_MRI/Training'
test_dir = 'BrainTumor_MRI/Testing'
output_dir = 'NON_IID/'

# Class names
classes = ['glioma', 'meningioma', 'notumor', 'pituitary']

# Number of (train, test) samples taken for each model, per class.
# These values simulate a situation in which each client has sufficient data
# for at least one type of brain tumor MRI scan.
samples = {
    'Model_1': {'glioma': (1021, 210), 'meningioma': (100, 32), 'notumor': (100, 50), 'pituitary': (100, 50)},
    'Model_2': {'glioma': (100, 30), 'meningioma': (1039, 210), 'notumor': (100, 50), 'pituitary': (100, 50)},
    'Model_3': {'glioma': (100, 30), 'meningioma': (100, 32), 'notumor': (1295, 255), 'pituitary': (100, 50)},
    'Model_4': {'glioma': (100, 30), 'meningioma': (100, 32), 'notumor': (100, 50), 'pituitary': (1157, 150)}
}


def copy_files(file_list, dest_dir):
    """Copy every file in ``file_list`` into the directory ``dest_dir``."""
    for file_path in file_list:
        shutil.copy(file_path, dest_dir)


def list_class_files(class_dir):
    """Return the sorted paths of the regular files directly inside ``class_dir``.

    Sub-directories are skipped because ``shutil.copy`` cannot copy them, and
    the result is sorted because ``os.listdir`` returns entries in arbitrary
    order, which would make a seeded shuffle non-reproducible.
    """
    paths = (os.path.join(class_dir, name) for name in os.listdir(class_dir))
    return sorted(path for path in paths if os.path.isfile(path))


def partition_files(file_list, sizes, seed=42):
    """Split ``file_list`` into disjoint random chunks of the requested sizes.

    Args:
        file_list: Paths to draw from.
        sizes: Number of files wanted in each chunk, in order.
        seed: Seed for the shuffle; the same inputs always give the same chunks.

    Returns:
        A list with one list of paths per entry of ``sizes``. No path appears
        in more than one chunk.

    Raises:
        ValueError: If ``sizes`` asks for more files than ``file_list`` holds.
    """
    requested = sum(sizes)
    if requested > len(file_list):
        raise ValueError(
            f"Requested {requested} files but only {len(file_list)} are available."
        )

    shuffled = sorted(file_list)
    random.Random(seed).shuffle(shuffled)

    # Consecutive slices of one shuffled list are disjoint by construction.
    chunks = []
    start = 0
    for size in sizes:
        chunks.append(shuffled[start:start + size])
        start += size
    return chunks


def split_dataset(train_root, test_root, out_root, plan, seed=42):
    """Copy a non-overlapping share of every class to each model's folder.

    Files are written to ``<out_root>/<model>/train/<class>`` and
    ``<out_root>/<model>/test/<class>``. Each class is shuffled once and dealt
    out to the models, so no image is given to two models. Sampling each model
    independently with the same seed would hand the same images to several
    models.

    Args:
        train_root: Directory with one sub-folder per class of training images.
        test_root: Directory with one sub-folder per class of testing images.
        out_root: Directory under which the per-model folders are created.
        plan: Mapping ``model -> {class: (train_size, test_size)}``.
        seed: Seed for the per-class shuffle.

    Raises:
        ValueError: If the models together request more files of a class than exist.
        FileNotFoundError: If a class folder is missing under either root.
    """
    for model in plan:
        os.makedirs(os.path.join(out_root, model), exist_ok=True)

    # Every class mentioned by any model, in first-seen order.
    class_names = list(dict.fromkeys(cls for splits in plan.values() for cls in splits))

    for cls in class_names:
        models = [model for model, splits in plan.items() if cls in splits]
        for subset_name, source_root, index in (('train', train_root, 0), ('test', test_root, 1)):
            files = list_class_files(os.path.join(source_root, cls))
            sizes = [plan[model][cls][index] for model in models]
            for model, chunk in zip(models, partition_files(files, sizes, seed)):
                dest = os.path.join(out_root, model, subset_name, cls)
                os.makedirs(dest, exist_ok=True)
                copy_files(chunk, dest)


# Splitting the dataset
if os.path.isdir(train_dir) and os.path.isdir(test_dir):
    split_dataset(train_dir, test_dir, output_dir, samples)
    print("Dataset split completed successfully.")
else:
    print(f"Dataset not found: expected '{train_dir}' and '{test_dir}' to exist; nothing was copied.")


### Plot the non-IID data distribution before augmentation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Categories (Tumor types)
categories = ["Glioma", "Meningioma", "No Tumor", "Pituitary"]
x = np.arange(len(categories))  # X positions


# Non-IID sample counts for each model, listed in the order of `categories`
model_1_train = [1021, 100, 100, 100]
model_1_test = [210, 32, 50, 50]

model_2_train = [100, 1039, 100, 100]
model_2_test = [30, 210, 50, 50]

model_3_train = [100, 100, 1295, 100]
model_3_test = [30, 32, 255, 50]

model_4_train = [100, 100, 100, 1157]
model_4_test = [30, 32, 50, 150]

# Bar width
bar_width = 0.2

# One entry per model:
# (legend name, x offset in bar widths, train counts, test counts, train color, test color)
model_series = [
    ("Model 1", -1.5, model_1_train, model_1_test, '#1f77b4', '#aec7e8'),
    ("Model 2", -0.5, model_2_train, model_2_test, '#9467bd', '#c5b0d5'),
    ("Model 3", 0.5, model_3_train, model_3_test, '#2ca02c', '#98df8a'),
    ("Model 4", 1.5, model_4_train, model_4_test, '#d62728', '#ff9896'),
]

# Creating the plot
fig, ax = plt.subplots(figsize=(16, 10))

# For every model draw the training bar and stack the testing bar on top of it
for name, offset, train_counts, test_counts, train_color, test_color in model_series:
    positions = x + offset * bar_width
    ax.bar(positions, train_counts, bar_width, label=f"{name} - Training", color=train_color)
    ax.bar(positions, test_counts, bar_width, bottom=train_counts, label=f"{name} - Testing", color=test_color)

# Labels and title with increased font size
ax.set_xlabel("Tumor Type", fontsize=20)
ax.set_ylabel("Number of Samples", fontsize=20)
ax.set_title("Combined Sample Distribution for Models 1, 2, 3, and 4", fontsize=24)
ax.set_xticks(x)
ax.set_xticklabels(categories, fontsize=19)
# Fix the tick POSITIONS explicitly and let matplotlib derive the labels from
# them. Setting only the label texts would attach them to whatever ticks the
# automatic locator happened to choose and could mislabel the axis.
ax.set_yticks(np.arange(0, 1601, 200))
ax.tick_params(axis='y', labelsize=19)

# Adjust legend to be at the bottom center
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.25),  # Place it outside the plot area
    ncol=4,  # Number of columns for legend
    fontsize=20,  # Adjust font size for legend items
    title="Training and Testing Models",
    title_fontsize=21  # Font size for legend title
)

# Adjust layout to ensure the legend fits within the figure
plt.tight_layout()

# Save the figure as a high-resolution PDF. This is done before plt.show()
# because some backends release the figure once it has been shown.
fig.savefig("NON_IID_DATA_REPRESENTATION.pdf", format="pdf", dpi=900)

# Show the plot
plt.show()
