# *Create DataSet Pipline**

In [2]:
def create_dataset_pipeline(source_dir, base_dir, image_size=(224, 224), val_ratio=0.20):
    import os
    import shutil
    import cv2
    from sklearn.model_selection import train_test_split
    """
    Augments train & test images separately.
    Splits only the train set into train and validation sets.
    """
    print("Starting Data Engineering Pipeline...")

    # Directories
    augmented_train_dir = os.path.join(base_dir, '01_augmented_train')
    augmented_test_dir = os.path.join(base_dir, '02_augmented_test')
    train_dir = os.path.join(base_dir, '03_train')
    validation_dir = os.path.join(base_dir, '04_validation')
    test_dir = os.path.join(base_dir, '05_test')

    # Remove old output
    if os.path.exists(base_dir):
        print(f"Removing old base directory: {base_dir}")
        shutil.rmtree(base_dir)
    os.makedirs(augmented_train_dir, exist_ok=True)
    os.makedirs(augmented_test_dir, exist_ok=True)

    def augment_and_save(src_split_dir, out_split_dir):
        """Resize + rotate + flip images and save them."""
        class_labels = [d for d in os.listdir(src_split_dir) if os.path.isdir(os.path.join(src_split_dir, d))]
        print(f"Found classes in {src_split_dir}: {class_labels}")

        for class_name in class_labels:
            src_class_dir = os.path.join(src_split_dir, class_name)
            out_class_dir = os.path.join(out_split_dir, class_name)
            os.makedirs(out_class_dir, exist_ok=True)

            for filename in os.listdir(src_class_dir):
                img_path = os.path.join(src_class_dir, filename)
                img = cv2.imread(img_path)
                if img is None:
                    print(f"Could not read image: {img_path}, skipping")
                    continue

                resized = cv2.resize(img, (image_size[1], image_size[0]))
                rotated = cv2.rotate(resized, cv2.ROTATE_90_CLOCKWISE)
                flipped = cv2.flip(resized, 0)

                base_name, ext = os.path.splitext(filename)
                cv2.imwrite(os.path.join(out_class_dir, f"{base_name}_orig{ext}"), resized)
                cv2.imwrite(os.path.join(out_class_dir, f"{base_name}_rot{ext}"), rotated)
                cv2.imwrite(os.path.join(out_class_dir, f"{base_name}_flip{ext}"), flipped)

        return class_labels

    # Augment train and test sets separately
    print("Augmenting TRAIN set...")
    train_labels = augment_and_save(os.path.join(source_dir, 'train'), augmented_train_dir)
    print("Augmenting TEST set...")
    _ = augment_and_save(os.path.join(source_dir, 'test'), augmented_test_dir)

    # Collect augmented train files for splitting
    all_train_files = []
    all_train_labels = []
    for class_name in train_labels:
        class_dir = os.path.join(augmented_train_dir, class_name)
        for fname in os.listdir(class_dir):
            all_train_files.append(os.path.join(class_dir, fname))
            all_train_labels.append(class_name)

    # Split train into train/validation
    train_files, val_files, train_labels_split, val_labels_split = train_test_split(
        all_train_files, all_train_labels,
        test_size=val_ratio,
        random_state=42,
        stratify=all_train_labels
    )

    def copy_files(file_list, destination_dir):
        for file_path in file_list:
            cls = os.path.basename(os.path.dirname(file_path))
            dest_class_dir = os.path.join(destination_dir, cls)
            os.makedirs(dest_class_dir, exist_ok=True)
            shutil.copy(file_path, dest_class_dir)

    # Create final dirs
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(validation_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    print("Copying TRAIN files...")
    copy_files(train_files, train_dir)
    print("Copying VALIDATION files...")
    copy_files(val_files, validation_dir)
    print("Copying TEST files...")
    for class_name in os.listdir(augmented_test_dir):
        class_src_dir = os.path.join(augmented_test_dir, class_name)
        for fname in os.listdir(class_src_dir):
            dest_class_dir = os.path.join(test_dir, class_name)
            os.makedirs(dest_class_dir, exist_ok=True)
            shutil.copy(os.path.join(class_src_dir, fname), dest_class_dir)

    print("Pipeline finished successfully.")
    print(f"Train images: {len(train_files)}")
    print(f"Validation images: {len(val_files)}")
    print(f"Test images: sum([len(files) for files in test_dir])")

    return {
        'augmented_train_dir': augmented_train_dir,
        'augmented_test_dir': augmented_test_dir,
        'train_dir': train_dir,
        'validation_dir': validation_dir,
        'test_dir': test_dir,
        'class_labels': train_labels
    }

In [4]:
source_dir, base_dir = r"F:\brain tumor classification final year project\MRI_Orignal Data",r"F:\Rough preprocess image"
create_dataset_pipeline(source_dir,base_dir)

Starting Data Engineering Pipeline...
Removing old base directory: F:\Rough preprocess image
Augmenting TRAIN set...
Found classes in F:\brain tumor classification final year project\MRI_Orignal Data\train: ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']
Augmenting TEST set...
Found classes in F:\brain tumor classification final year project\MRI_Orignal Data\test: ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']
Copying TRAIN files...
Copying VALIDATION files...
Copying TEST files...
Pipeline finished successfully.
Train images: 6888
Validation images: 1722
Test images: sum([len(files) for files in test_dir])


{'augmented_train_dir': 'F:\\Rough preprocess image\\01_augmented_train',
 'augmented_test_dir': 'F:\\Rough preprocess image\\02_augmented_test',
 'train_dir': 'F:\\Rough preprocess image\\03_train',
 'validation_dir': 'F:\\Rough preprocess image\\04_validation',
 'test_dir': 'F:\\Rough preprocess image\\05_test',
 'class_labels': ['glioma_tumor',
  'meningioma_tumor',
  'no_tumor',
  'pituitary_tumor']}

In [None]:
1. Tqdm when the datset pipline is executed it show the progreess which loop is executed and which loop is not executed aslo show the total number of image after the augmentatio and total number of image before the augmentaton also print some other necessary infomation if needed , in this function another problem is that it duplicate the data set first it store the augment train image , and than it seprate this augment train into three directry train ,test and validation , remove this duplication adjust the function that only store the the train ,validation and final  test set not train_augment,test_augment, train,test and validation you see in the screenshut 