In [1]:
import os

# Root folder of the raw dataset; images are looked up under "<class>/images".
dataset_path = "/content/drive/MyDrive/Dataset Covid19/Covid-19"

categories = ["COVID", "Lung_Opacity", "Normal", "Viral Pneumonia"]

# Report how many image files each class folder holds.
for category in categories:
    folder_path = os.path.join(dataset_path, category, "images")

    # Report a missing class folder and move on instead of failing in os.listdir.
    if not os.path.exists(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    # lower() makes the extension match case-insensitive; endswith accepts a tuple.
    num_images = sum(
        1
        for entry in os.listdir(folder_path)
        if entry.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    print(f"{category}: {num_images} images")


COVID: 3616 images
Lung_Opacity: 6012 images
Normal: 10192 images
Viral Pneumonia: 1345 images


In [2]:
import os
import cv2
import shutil
import random
from tqdm import tqdm

# Raw dataset (input) and the folder that receives the preprocessed, split copy.
dataset_dir = "/content/drive/MyDrive/Dataset Covid19/Covid-19"
output_dir = "/content/drive/MyDrive/Dataset Covid19/Covid-19_processed"

splits = ['train', 'val', 'test']
categories = ["COVID", "Lung_Opacity", "Normal", "Viral Pneumonia"]

# Make sure <output_dir>/<split>/<category> exists for every combination.
for split in splits:
    split_root = os.path.join(output_dir, split)
    for category in categories:
        os.makedirs(os.path.join(split_root, category), exist_ok=True)


def preprocess_image(img):
    """Convert an image to grayscale, apply CLAHE, then denoise it.

    Args:
        img: Image array such as the one returned by ``cv2.imread``. An array
            with three dimensions is converted with ``COLOR_BGR2GRAY``; any
            other rank is passed through unchanged.

    Returns:
        The output of ``cv2.bilateralFilter`` applied to the CLAHE result.
    """
    if len(img.shape) == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img

    # Contrast-limited adaptive histogram equalisation on an 8x8 tile grid.
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_boosted = equalizer.apply(gray)

    # Denoise with a bilateral filter.
    return cv2.bilateralFilter(contrast_boosted, d=9, sigmaColor=75, sigmaSpace=75)


def split_dataset(seed=42):
    """Split every class 80/10/10 into train/val/test and write preprocessed copies.

    For each entry of the module-level ``categories`` list, the image files in
    ``<dataset_dir>/<category>/images`` are shuffled, partitioned, passed
    through ``preprocess_image`` and written to
    ``<output_dir>/<split>/<category>/`` under their original file name.

    Args:
        seed: Seed for the shuffle. The file listing is sorted first, so the
            same seed over the same set of files always yields the same
            assignment. This matters because ``output_dir`` persists between
            runs: with an unseeded shuffle, a re-run writes an image into a
            different split while its earlier copy stays behind, so the same
            image ends up in more than one split.

    Raises:
        FileNotFoundError: If a class's ``images`` folder does not exist
            (propagated from ``os.listdir``).
    """
    rng = random.Random(seed)
    image_exts = ('.png', '.jpg', '.jpeg')

    for category in categories:
        img_folder = os.path.join(dataset_dir, category, "images")
        # os.listdir order is arbitrary; sort so the seeded shuffle is reproducible.
        images = sorted(f for f in os.listdir(img_folder) if f.lower().endswith(image_exts))
        rng.shuffle(images)

        total = len(images)
        train_split = int(total * 0.8)
        val_split = int(total * 0.1)

        # The test split takes the remainder, so rounding never drops an image.
        splits_dict = {
            "train": images[:train_split],
            "val": images[train_split:train_split + val_split],
            "test": images[train_split + val_split:],
        }

        failed = []  # files that could not be read or written

        for split, img_list in splits_dict.items():
            for img_name in tqdm(img_list, desc=f"{category} → {split}"):
                img = cv2.imread(os.path.join(img_folder, img_name))

                if img is None:
                    # cv2.imread returns None instead of raising on unreadable files.
                    failed.append(img_name)
                    continue

                save_path = os.path.join(output_dir, split, category, img_name)
                # cv2.imwrite reports failure through its return value, not an exception.
                if not cv2.imwrite(save_path, preprocess_image(img)):
                    failed.append(img_name)

        if failed:
            # Surface skipped files so the on-disk counts are not silently short.
            print(f"{category}: {len(failed)} image(s) could not be processed, e.g. {failed[:5]}")

# Run the split + preprocessing pass; results are written under output_dir.
# The progress bars recorded with this cell add up to roughly 25 minutes.
split_dataset()
print("Dataset split and preprocessing complete!")


COVID → train: 100%|██████████| 2892/2892 [02:37<00:00, 18.38it/s]
COVID → val: 100%|██████████| 361/361 [00:12<00:00, 28.69it/s]
COVID → test: 100%|██████████| 363/363 [00:12<00:00, 28.86it/s]
Lung_Opacity → train: 100%|██████████| 4809/4809 [06:15<00:00, 12.79it/s]
Lung_Opacity → val: 100%|██████████| 601/601 [00:21<00:00, 27.70it/s]
Lung_Opacity → test: 100%|██████████| 602/602 [00:21<00:00, 28.30it/s]
Normal → train: 100%|██████████| 8153/8153 [12:56<00:00, 10.50it/s]
Normal → val: 100%|██████████| 1019/1019 [00:36<00:00, 27.91it/s]
Normal → test: 100%|██████████| 1020/1020 [00:37<00:00, 27.46it/s]
Viral Pneumonia → train: 100%|██████████| 1076/1076 [01:00<00:00, 17.91it/s]
Viral Pneumonia → val: 100%|██████████| 134/134 [00:05<00:00, 24.47it/s]
Viral Pneumonia → test: 100%|██████████| 135/135 [00:04<00:00, 27.04it/s]

Dataset split and preprocessing complete!





In [8]:

def count_images(output_dir, splits, categories):
    """Print the number of image files in every ``<output_dir>/<split>/<category>`` folder.

    Args:
        output_dir: Root directory that contains one sub-folder per split.
        splits: Split names to report, in the order given.
        categories: Class folder names looked up inside each split.

    A file counts as an image when its name ends in .png, .jpg or .jpeg
    (case-insensitive). A missing folder is reported instead of raising.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    print("\nImage count per split and class:")
    for split in splits:
        print(f"\n{split.upper()}: ")
        for category in categories:
            class_dir = os.path.join(output_dir, split, category)

            if not os.path.exists(class_dir):
                print(f"{category}: Folder not found")
                continue

            num_images = sum(
                1
                for name in os.listdir(class_dir)
                if name.lower().endswith(image_suffixes)
            )
            print(f"{category}: {num_images} images")

# Report per-split / per-class image counts after preprocessing.
# NOTE: this uses output_dir, splits and categories left in the kernel by an
# earlier cell. The subset-creation cell later rebinds output_dir, so running
# this call after that cell counts the subset folder instead.
count_images(output_dir, splits, categories)



Image count per split and class:

TRAIN: 
COVID: 2892 images
Lung_Opacity: 4809 images
Normal: 8153 images
Viral Pneumonia: 1076 images

VAL: 
COVID: 361 images
Lung_Opacity: 601 images
Normal: 1019 images
Viral Pneumonia: 134 images

TEST: 
COVID: 363 images
Lung_Opacity: 602 images
Normal: 1020 images
Viral Pneumonia: 135 images


In [9]:
import os
import random
import shutil
from tqdm import tqdm

source_dir = "/content/drive/MyDrive/Dataset Covid19/Covid-19_processed"  # your preprocessed dataset
output_dir = "/content/drive/MyDrive/Dataset Covid19/Covid-19_small"      # new smaller dataset

categories = ["COVID", "Lung_Opacity", "Normal", "Viral Pneumonia"]
splits = ["train", "val", "test"]

# ==== PARAMETERS ====
images_per_class = 500  # total images per class (across all splits)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1  # informational: test receives whatever train and val leave over
subset_seed = 42  # fixed seed so a re-run selects the same files instead of adding new ones

# Per-class target size of each split (400 / 50 / 50 with the defaults above).
train_count = int(images_per_class * train_ratio)
val_count = int(images_per_class * val_ratio)
split_targets = {
    "train": train_count,
    "val": val_count,
    "test": images_per_class - train_count - val_count,
}

image_exts = ('.png', '.jpg', '.jpeg')
rng = random.Random(subset_seed)

# ==== CREATE OUTPUT FOLDERS ====
for split in splits:
    for category in categories:
        os.makedirs(os.path.join(output_dir, split, category), exist_ok=True)

# ==== COPY RANDOMLY SELECTED IMAGES ====
# Each subset split is drawn from the matching split of the processed dataset.
# The earlier version pooled all splits, re-split the sample, and then copied
# every file to a path that still embedded its ORIGINAL split, so the new
# 80:10:10 assignment was never applied and the progress-bar counts did not
# match what ended up on disk. Sampling per split gives exact sizes and keeps
# an image from moving between train/val/test relative to the full dataset.
for category in categories:
    for split in splits:
        src_folder = os.path.join(source_dir, split, category)
        dest_folder = os.path.join(output_dir, split, category)

        # os.listdir order is arbitrary; sort so the seeded sample is reproducible.
        available = sorted(f for f in os.listdir(src_folder) if f.lower().endswith(image_exts))
        wanted = split_targets[split]

        if len(available) < wanted:
            print(f"⚠️ Warning: {category}/{split} has only {len(available)} available (less than {wanted}). Using all.")
            selected = available
        else:
            selected = rng.sample(available, wanted)

        # Files left over from an earlier run would silently inflate the subset.
        leftovers = {f for f in os.listdir(dest_folder) if f.lower().endswith(image_exts)} - set(selected)
        if leftovers:
            print(f"⚠️ Warning: {dest_folder} already holds {len(leftovers)} other image(s) from an earlier run; "
                  "remove them to keep the subset at the intended size.")

        for img_name in tqdm(selected, desc=f"{category} → {split}", ncols=80):
            shutil.copy2(os.path.join(src_folder, img_name), os.path.join(dest_folder, img_name))

print("\n✅ Subset dataset creation complete!")
print(f"Saved at: {output_dir}")


COVID → train: 100%|██████████████████████████| 400/400 [00:09<00:00, 41.44it/s]
COVID → val: 100%|██████████████████████████████| 50/50 [00:01<00:00, 41.12it/s]
COVID → test: 100%|█████████████████████████████| 50/50 [00:01<00:00, 49.74it/s]
Lung_Opacity → train: 100%|███████████████████| 400/400 [00:59<00:00,  6.73it/s]
Lung_Opacity → val: 100%|███████████████████████| 50/50 [00:00<00:00, 55.42it/s]
Lung_Opacity → test: 100%|██████████████████████| 50/50 [00:00<00:00, 57.24it/s]
Normal → train: 100%|█████████████████████████| 400/400 [00:14<00:00, 27.25it/s]
Normal → val: 100%|█████████████████████████████| 50/50 [00:00<00:00, 50.60it/s]
Normal → test: 100%|████████████████████████████| 50/50 [00:00<00:00, 52.52it/s]
Viral Pneumonia → train: 100%|████████████████| 400/400 [00:15<00:00, 25.46it/s]
Viral Pneumonia → val: 100%|████████████████████| 50/50 [00:01<00:00, 49.63it/s]
Viral Pneumonia → test: 100%|███████████████████| 50/50 [00:00<00:00, 54.42it/s]


✅ Subset dataset creation complete!
Saved at: /content/drive/MyDrive/Dataset Covid19/Covid-19_small



