In [2]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img, array_to_img, save_img
from sklearn.model_selection import train_test_split
import glob

# Dataset layout: source images are read from base_dir/<class folder> and the
# processed copies are written to output_dir/<split>/<class folder>.
# NOTE(review): 'Bengin' (sic) presumably matches the on-disk folder name;
# confirm before correcting the spelling here.
base_dir = 'CT_Scans'
output_dir = 'data'
folders = ['Bengin cases', 'Malignant cases', 'Normal cases']
target_size = (224, 224)  # resize target handed to load_img

# Create every <split>/<class> output directory up front
for split in ('train', 'val', 'test'):
    split_root = os.path.join(output_dir, split)
    for folder in folders:
        os.makedirs(os.path.join(split_root, folder), exist_ok=True)

# Random-augmentation settings, gathered in one mapping so they are easy to tune
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Shared generator used to produce augmented variants of an image
datagen = ImageDataGenerator(**augmentation_options)

# Function to preprocess and save images
def preprocess_and_save_image(image_path, save_dir, augment=True, num_augmented=5):
    """Resize one image and write it, or augmented variants of it, to save_dir.

    The image is loaded at the module-level ``target_size``. With
    ``augment=True`` the module-level ``datagen`` produces ``num_augmented``
    random variants, saved as ``<stem>_aug_<i>.jpg``; the un-augmented image
    is not written in that mode. With ``augment=False`` the resized image is
    saved once under its original file name.

    Any exception is reported with ``print`` and not re-raised, so one bad
    file does not abort a batch run.

    Args:
        image_path: Path of the source image.
        save_dir: Directory to write into; created if it does not exist.
        augment: Whether to write augmented variants instead of the resized
            original.
        num_augmented: Number of augmented variants to write when ``augment``
            is true. Defaults to 5; values <= 0 write nothing.

    Returns:
        The number of files successfully written.
    """
    written = 0
    try:
        os.makedirs(save_dir, exist_ok=True)
        img = load_img(image_path, target_size=target_size)
        file_name = os.path.basename(image_path)

        if augment:
            stem = os.path.splitext(file_name)[0]
            # datagen.flow expects a batch, so add a leading batch dimension
            img_batch = np.expand_dims(img_to_array(img), 0)
            flow = datagen.flow(img_batch, batch_size=1)
            # flow yields batches endlessly; range comes first in zip so
            # iteration stops after num_augmented batches without drawing
            # an extra one from the generator.
            for i, batch in zip(range(num_augmented), flow):
                save_path = os.path.join(save_dir, f"{stem}_aug_{i}.jpg")
                # save_img converts (and rescales) the array itself, so a
                # separate array_to_img round-trip is unnecessary.
                save_img(save_path, batch[0], scale=True)
                written += 1
        else:
            # Save original resized image if augmentation is disabled
            save_img(os.path.join(save_dir, file_name), img)
            written += 1
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
    return written

# Process dataset with checks
for folder in folders:
    # glob returns matches in arbitrary (filesystem-dependent) order; sorting
    # makes the seeded splits below pick the same files on every run/machine.
    images = sorted(glob.glob(os.path.join(base_dir, folder, '*.jpg')))

    # Check if images are loaded from the folder
    if not images:
        print(f"No images found in folder: {folder}")
        continue

    # With fewer than 3 images one of the train/val/test subsets would be
    # empty and train_test_split would raise, so skip the folder instead.
    if len(images) < 3:
        print(f"Not enough images to split in folder: {folder} ({len(images)} found)")
        continue

    print(f"Processing {len(images)} images in folder: {folder}")

    # Split dataset: 20% test first, then 25% of the remainder as val,
    # giving roughly 60% train, 20% val, 20% test.
    # NOTE(review): the split is per image file; if several images belong to
    # the same patient/scan they can end up in different splits - confirm
    # whether a grouped split is required.
    train_imgs, test_imgs = train_test_split(images, test_size=0.2, random_state=42)
    train_imgs, val_imgs = train_test_split(train_imgs, test_size=0.25, random_state=42)

    for split, imgs in (('train', train_imgs), ('val', val_imgs), ('test', test_imgs)):
        split_dir = os.path.join(output_dir, split, folder)
        augment = split == 'train'  # Only augment training images
        for img_path in imgs:
            preprocess_and_save_image(img_path, split_dir, augment)

print("Data preprocessing and augmentation complete.")

Processing 120 images in folder: Bengin cases
Processing 561 images in folder: Malignant cases
Processing 416 images in folder: Normal cases
Data preprocessing and augmentation complete.
