### Classify the raw dataset into two folders
The positive folder holds images that are positive for a tumor.  
The negative folder holds images that are negative for a tumor.

Preprocessing: 
1. Converting all the images into grayscale
2. Resizing all the images to maintain uniformity

In [2]:
import os
from PIL import Image
from tqdm import tqdm

In [None]:
# Folder paths
parent_folder = "../dataset"
raw_data = os.path.join(parent_folder, "raw_dataset")
processed_data = os.path.join(parent_folder, "processed_dataset")
image_size = 512  # Resize dimension (output images are image_size x image_size)


def list_class_folders(root):
    """Return the names of the sub-directories of ``root`` in sorted order.

    Plain files that sit next to the class folders are skipped, because
    listing them as if they were directories raises ``NotADirectoryError``.
    Sorting makes the processing order independent of the filesystem.
    """
    return sorted(
        name for name in os.listdir(root)
        if os.path.isdir(os.path.join(root, name))
    )


def list_image_files(folder, extensions):
    """Return the sorted file names in ``folder`` ending with one of ``extensions``.

    ``extensions`` is a tuple of lowercase suffixes; the comparison is
    case-insensitive with respect to the file name.
    """
    return sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith(extensions)
    )


# os.path.isdir already returns False for a missing path.
if os.path.isdir(raw_data):
    os.makedirs(processed_data, exist_ok=True)

    # Create output folders
    negative_path = os.path.join(processed_data, 'negative')
    positive_path = os.path.join(processed_data, 'positive')
    os.makedirs(negative_path, exist_ok=True)
    os.makedirs(positive_path, exist_ok=True)

    valid_extensions = ('.jpg', '.jpeg', '.png')

    print("Processing raw dataset...")

    # Separate counters for naming: each holds the number the NEXT saved
    # image of that class will receive.
    pos_count = 1
    neg_count = 1

    for folder_name in tqdm(list_class_folders(raw_data), desc="Folders"):
        folder_path = os.path.join(raw_data, folder_name)
        # Folders whose name starts with '_' go to the negative class;
        # every other folder goes to the positive class.
        is_normal = folder_name.startswith('_')
        target_folder = negative_path if is_normal else positive_path

        image_files = list_image_files(folder_path, valid_extensions)

        for file_name in tqdm(image_files, desc=f"🖼️ {folder_name}", leave=True):
            file_path = os.path.join(folder_path, file_name)
            try:
                # Convert to grayscale and resize. The context manager closes
                # the source file as soon as the pixel data has been copied.
                with Image.open(file_path) as source_img:
                    img = source_img.convert('L').resize((image_size, image_size))

                current_count = neg_count if is_normal else pos_count
                new_img_path = os.path.join(target_folder, f"{current_count}.jpg")
                img.save(new_img_path)

                # Advance the counter only after a successful save so a
                # failed image does not leave a gap in the numbering.
                if is_normal:
                    neg_count += 1
                else:
                    pos_count += 1

            except Exception as e:
                # Best effort: report the failing file and keep going.
                print(f"Error processing {file_path}: {e}")
else:
    print("Raw data folder not found or invalid.")

Processing raw dataset...


🖼️ Astrocitoma T1: 100%|██████████| 176/176 [00:00<00:00, 176.63it/s]
🖼️ Astrocitoma T1C+: 100%|██████████| 232/232 [00:01<00:00, 164.34it/s]
🖼️ Astrocitoma T2: 100%|██████████| 171/171 [00:00<00:00, 171.11it/s]
🖼️ Carcinoma T1: 100%|██████████| 66/66 [00:00<00:00, 184.43it/s]
🖼️ Carcinoma T1C+: 100%|██████████| 112/112 [00:00<00:00, 183.29it/s]
🖼️ Carcinoma T2: 100%|██████████| 73/73 [00:00<00:00, 153.94it/s]
🖼️ Ependimoma T1: 100%|██████████| 45/45 [00:00<00:00, 175.96it/s]
🖼️ Ependimoma T1C+: 100%|██████████| 48/48 [00:00<00:00, 172.35it/s]
🖼️ Ependimoma T2: 100%|██████████| 57/57 [00:00<00:00, 156.82it/s]
🖼️ Ganglioglioma T1: 100%|██████████| 20/20 [00:00<00:00, 182.23it/s]
🖼️ Ganglioglioma T1C+: 100%|██████████| 18/18 [00:00<00:00, 162.51it/s]
🖼️ Ganglioglioma T2: 100%|██████████| 23/23 [00:00<00:00, 184.76it/s]
🖼️ Germinoma T1: 100%|██████████| 27/27 [00:00<00:00, 175.88it/s]
🖼️ Germinoma T1C+: 100%|██████████| 40/40 [00:00<00:00, 159.65it/s]
🖼️ Germinoma T2: 100%|██████████| 33/