#train _test split(85% -15% )

In [5]:
import os
import shutil
from pathlib import Path
import random

random.seed(42)  # Set seed here for reproducibility
source_dir = Path("../data/raw/COVID-19_Radiography_Dataset")
dest_dir = Path("../data/processed")
train_dir = dest_dir / "train_data"
test_dir = dest_dir / "test_data"

classes = ['Normal', 'COVID', 'Lung_Opacity', 'Viral Pneumonia']
split_ratio = 0.85

for cls in classes:
    print(f"Processing class: {cls}")
    cls_path = source_dir / cls / "images"  # <-- reading images from the correct place 

    all_images = [img for img in os.listdir(cls_path) if img.lower().endswith(('.png', '.jpg', '.jpeg'))]   #Reads all image filenames (ignoring other file types like .xml, .txt, etc.).
    #Randomizes order so split is not biased.
    random.shuffle(all_images)

    split_index = int(len(all_images) * split_ratio)      #First 85% into train_imgs,Remaining 15% into test_imgs
    train_imgs = all_images[:split_index]
    test_imgs = all_images[split_index:]

    # Create destination directories
    train_cls_path = train_dir / cls
    test_cls_path = test_dir / cls
    train_cls_path.mkdir(parents=True, exist_ok=True)
    test_cls_path.mkdir(parents=True, exist_ok=True)

    # Copy files
    for img in train_imgs:
        shutil.copy(cls_path / img, train_cls_path / img)
    for img in test_imgs:
        shutil.copy(cls_path / img, test_cls_path / img)

    print(f"  Copied {len(train_imgs)} images to {train_cls_path}")
    print(f"  Copied {len(test_imgs)} images to {test_cls_path}")

#RESULT  OF SPLIT:
    #Processing class: Normal
  #Copied 8663 images to ..\data\processed\train_data\Normal
  #Copied 1529 images to ..\data\processed\test_data\Normal
#Processing class: COVID
  #Copied 3073 images to ..\data\processed\train_data\COVID
  #Copied 543 images to ..\data\processed\test_data\COVID
#Processing class: Lung_Opacity
  #Copied 5110 images to ..\data\processed\train_data\Lung_Opacity
  #Copied 902 images to ..\data\processed\test_data\Lung_Opacity
#Processing class: Viral Pneumonia
  #Copied 1143 images to ..\data\processed\train_data\Viral Pneumonia
  #Copied 202 images to ..\data\processed\test_data\Viral Pneumonia

Processing class: Normal
  Copied 8663 images to ..\data\processed\train_data\Normal
  Copied 1529 images to ..\data\processed\test_data\Normal
Processing class: COVID
  Copied 3073 images to ..\data\processed\train_data\COVID
  Copied 543 images to ..\data\processed\test_data\COVID
Processing class: Lung_Opacity
  Copied 5110 images to ..\data\processed\train_data\Lung_Opacity
  Copied 902 images to ..\data\processed\test_data\Lung_Opacity
Processing class: Viral Pneumonia
  Copied 1143 images to ..\data\processed\train_data\Viral Pneumonia
  Copied 202 images to ..\data\processed\test_data\Viral Pneumonia


#augment minority and validation split, save train and val. npy file

In [None]:
import numpy as np
from pathlib import Path
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from sklearn.model_selection import train_test_split

# --- Settings ---
data_dir = Path("../data/processed/train_data")
classes = ['Normal', 'COVID', 'Lung_Opacity', 'Viral Pneumonia']
img_size = (64, 64)

# --- Load images and labels ---
X, y = [], []

print("Loading images and labels...")
for class_index, class_name in enumerate(classes):
    class_path = data_dir / class_name
    img_files = list(class_path.glob("*"))
    print(f"  Loading {len(img_files)} images from '{class_name}'")

    for img_path in img_files:
        img = load_img(img_path, target_size=img_size, color_mode='grayscale')     #Load each image, resize to (64,64), convert to grayscale.

        img_array = img_to_array(img)                        #Convert image to numpy array and append to list X.
        X.append(img_array)      
                                                            #Create one-hot encoded label vector and append to y.
        label = np.zeros(len(classes))
        label[class_index] = 1
        y.append(label)

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)                               #Convert Lists to Numpy Arrays
print(f"Total images loaded: {len(X)}")
print(f"X shape: {X.shape}, y shape: {y.shape}")

# --- Normalize ---
X /= 255.0

# --- Identify minority class samples (all except 'Normal') ---
normal_mask = np.argmax(y, axis=1) == 0  # Index 0 is Normal
minority_mask = ~normal_mask

#The Normal class is majority.
#Create masks to separate minority class samples (all classes except Normal).
#Extract minority samples from X and y for augmentation.

X_minority = X[minority_mask]
y_minority = y[minority_mask]
print(f"Minority class samples: {len(X_minority)}")

# --- Data Augmentation for minority classes ---
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)

datagen.fit(X_minority)

#Generate Augmented Minority Samples
batch_size = len(X_minority)

augmented_iter = datagen.flow(
    X_minority,
    y_minority,
    batch_size=batch_size,
    shuffle=False
)

augmented_minority = next(augmented_iter)
print(f"Augmented minority samples generated: {len(augmented_minority[0])}")

# --- Combine original and augmented data ---
X_balanced = np.concatenate([X, augmented_minority[0]], axis=0)
y_balanced = np.concatenate([y, augmented_minority[1]], axis=0)
print(f"Total samples after augmentation: {len(X_balanced)}")

# --- Split into train and validation sets with stratification ---
X_train, X_val, y_train, y_val = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
    stratify=y_balanced.argmax(axis=1)
)

print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# --- Save arrays as .npy files ---
np.save('X_traindl.npy', X_train)
np.save('y_traindl.npy', y_train)
np.save('X_valdl.npy', X_val)
np.save('y_valdl.npy', y_val)

print("Saved train and validation datasets as .npy files.")
#X_train = np.load('X_traindl.npy.npy')
#y_train = np.load('y_traindl.npy')
#X_val = np.load('X_valdl.npy')
#y_val = np.load('y_valdl.npy')


Loading images and labels...
  Loading 8663 images from 'Normal'
  Loading 3073 images from 'COVID'
  Loading 5110 images from 'Lung_Opacity'
  Loading 1143 images from 'Viral Pneumonia'
Total images loaded: 17989
X shape: (17989, 64, 64, 1), y shape: (17989, 4)
Minority class samples: 9326
Augmented minority samples generated: 9326
Total samples after augmentation: 27315
Training samples: 21852, Validation samples: 5463
Saved train and validation datasets as .npy files.


#test npy file

In [None]:

# Define path to test data
test_data_dir = Path("../data/processed/test_data")

# Define classes 
classes = ['Normal', 'COVID', 'Lung_Opacity', 'Viral Pneumonia']

# Image size expected by the model 
img_size = (64, 64)

# Prepare lists for images and labels
X_test = []
y_test = []

print("Loading test images and labels...")
for class_index, class_name in enumerate(classes):
    class_path = test_data_dir / class_name
    img_files = list(class_path.glob("*"))
    print(f"  Loading {len(img_files)} images from test class '{class_name}'")

    for img_path in img_files:
        img = load_img(img_path, target_size=img_size, color_mode='grayscale')
        img_array = img_to_array(img)
        X_test.append(img_array)

        label = np.zeros(len(classes))
        label[class_index] = 1
        y_test.append(label)

# Convert lists to numpy arrays
X_test = np.array(X_test, dtype=np.float32) / 255.0  # Normalize
y_test = np.array(y_test, dtype=np.float32)

print(f"Total test images loaded: {len(X_test)}")
print(f"Shape of X_test: {X_test.shape}, Shape of y_test: {y_test.shape}")

# Save test data as .npy files
np.save('X_testdl.npy', X_test)
np.save('y_testdl.npy', y_test)

print("Saved test datasets as .npy files.")
#Load them as:
#X_test = np.load('X_testdl.npy')
#y_test = np.load('y_testdl.npy')


Loading test images and labels...
  Loading 1529 images from test class 'Normal'
  Loading 543 images from test class 'COVID'
  Loading 902 images from test class 'Lung_Opacity'
  Loading 202 images from test class 'Viral Pneumonia'
Total test images loaded: 3176
Shape of X_test: (3176, 64, 64, 1), Shape of y_test: (3176, 4)
Saved test datasets as .npy files.


'#model architecture

In [None]:
#LENET, CNN

#train and evaluate


In [None]:
# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    shuffle=True,
    verbose=2  # Cleaner output per epoch
)

# Evaluate on validation data
val_loss, val_acc = model.evaluate(X_val, y_val, batch_size=32, verbose=2)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")

# Evaluate on test data
test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=32, verbose=2)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

