In [2]:
import os
import shutil
import random

from google.colab import drive

In [4]:
## Drive mount and path configuration
drive.mount('/content/drive')

# Root of the capstone folder on Google Drive.
# Replace it with your own path to the capstone drive that was shared with you.
CAPSTONE_PATH = "/content/drive/MyDrive/CMU/Fall 2023/Capstone Project - Team Cylab/CapStone/"
DATA_ROOT_PATH = os.path.join(CAPSTONE_PATH, "data")
DATA_UNSEEN_PATH = os.path.join(DATA_ROOT_PATH, "UNSEEN_DATA")

# Source datasets.
# data_dir is the final dataset holding both the original images and the
# generated low-quality images. To work on the low-quality dataset without the
# original images instead, change the folder name to 'LOW_QUALITY_WITH_NO_OG'
# here and 'SPLIT_DATA_WITH_OG' to 'SPLIT_DATA_WITH_NO_OG' below.
data_dir = os.path.join(DATA_ROOT_PATH, "LOW_QUALITY_WITH_OG")
data_unseen_all = os.path.join(DATA_UNSEEN_PATH, "ALL")

# Destination folders of the split, one per subset.
train_dir, val_dir, test_dir, feature_extractor_dir = [
    os.path.join(DATA_ROOT_PATH, "SPLIT_DATA_WITH_OG", subset_name)
    for subset_name in ("Train", "Valid", "Test", "Feature_Extractor")
]

# Destination folder for the test portion of the unseen data.
test_unseen_dir = os.path.join(DATA_UNSEEN_PATH, "TEST")
# Alternative location for the feature-extractor subset (disabled):
# feature_extractor_dir = os.path.join(DATA_UNSEEN_PATH, "FEATURE_EXTRACTOR")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def split_unseen_dataset(seed=None):
    """
    Copy part of every unseen-data class into the feature-extractor and
    unseen-test folders.

    Every sub-directory of ``data_unseen_all`` is treated as one class. The
    regular files of a class are shuffled and then sliced:

    * the first 70% are skipped (no train folder is populated for unseen data),
    * the next 15% are copied to ``feature_extractor_dir/<class>``,
    * everything that is left (about 15%) is copied to ``test_unseen_dir/<class>``.

    Source files are copied, never moved. Destination folders are created when
    missing and are not emptied first, so running again with a different
    shuffle adds files next to the earlier copies; pass a fixed ``seed`` to get
    the same selection on every run.

    NOTE(review): ``feature_extractor_dir`` is shared with ``split_dataset``,
    so with the current configuration unseen images may land in the same
    folder as the regular feature-extractor subset -- confirm this is intended.

    Args:
        seed: Optional seed for the shuffle. ``None`` (default) uses the
            global ``random`` state, as before.
    """
    os.makedirs(test_unseen_dir, exist_ok=True)
    os.makedirs(feature_extractor_dir, exist_ok=True)
    # Kept from the original: a missing source folder is created (empty), which
    # makes the function a no-op instead of raising.
    os.makedirs(data_unseen_all, exist_ok=True)

    # Split ratios; the test subset receives the remainder.
    skipped_ratio = 0.7             # leading share that is not copied anywhere
    feature_extractor_ratio = 0.15  # share copied to the feature-extractor folder

    # A private generator keeps a seeded run independent of the global state.
    rng = random if seed is None else random.Random(seed)

    # Each sub-directory is one class. os.listdir order is arbitrary, so sort
    # to make a seeded run repeatable.
    class_dirs = sorted(
        entry for entry in os.listdir(data_unseen_all)
        if os.path.isdir(os.path.join(data_unseen_all, entry))
    )

    for class_dir in class_dirs:
        class_data_dir = os.path.join(data_unseen_all, class_dir)
        class_feature_extractor_dir = os.path.join(feature_extractor_dir, class_dir)
        class_test_dir = os.path.join(test_unseen_dir, class_dir)

        os.makedirs(class_feature_extractor_dir, exist_ok=True)
        os.makedirs(class_test_dir, exist_ok=True)

        # Only regular files: shutil.copy raises on a directory entry.
        class_files = sorted(
            entry for entry in os.listdir(class_data_dir)
            if os.path.isfile(os.path.join(class_data_dir, entry))
        )
        rng.shuffle(class_files)

        num_files = len(class_files)
        num_skipped = int(skipped_ratio * num_files)
        num_feature_extractor = int(feature_extractor_ratio * num_files)
        feature_extractor_end = num_skipped + num_feature_extractor

        subsets = (
            (class_feature_extractor_dir, class_files[num_skipped:feature_extractor_end]),
            (class_test_dir, class_files[feature_extractor_end:]),
        )
        for subset_dir, subset_files in subsets:
            for file_name in subset_files:
                shutil.copy(
                    os.path.join(class_data_dir, file_name),
                    os.path.join(subset_dir, file_name),
                )


In [6]:
def split_dataset(data_dir, seed=None):
    """
    Split a class-per-folder dataset into train, validation, feature-extractor
    and test folders.

    Every sub-directory of ``data_dir`` is treated as one class. The regular
    files of a class are shuffled and copied to ``<subset>/<class>`` under
    ``train_dir``, ``val_dir``, ``feature_extractor_dir`` and ``test_dir``:

    * train: ``int(0.6 * n)`` files,
    * validation: ``round(0.15 * n)`` files,
    * feature extractor: ``round(0.15 * n)`` files,
    * test: whatever is left (about 10%; it can be empty for very small classes).

    Source files are copied, never moved. Destination folders are created when
    missing and are not emptied first, so running again with a different
    shuffle adds files next to the earlier copies and the same image can end up
    in several subsets; pass a fixed ``seed`` to get the same split every run.

    Args:
        data_dir: Folder that contains one sub-directory per class.
        seed: Optional seed for the shuffle. ``None`` (default) uses the
            global ``random`` state, as before.
    """
    # Split ratios; the test subset receives the remainder.
    train_ratio = 0.6
    val_ratio = 0.15
    feature_extractor_ratio = 0.15

    # Create the subset root folders if they don't exist.
    for subset_root in (train_dir, val_dir, feature_extractor_dir, test_dir):
        os.makedirs(subset_root, exist_ok=True)

    # A private generator keeps a seeded run independent of the global state.
    rng = random if seed is None else random.Random(seed)

    # Each sub-directory is one class. os.listdir order is arbitrary, so sort
    # to make a seeded run repeatable.
    class_dirs = sorted(
        entry for entry in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, entry))
    )

    for class_dir in class_dirs:
        class_data_dir = os.path.join(data_dir, class_dir)

        # Only regular files: shutil.copy raises on a directory entry.
        class_files = sorted(
            entry for entry in os.listdir(class_data_dir)
            if os.path.isfile(os.path.join(class_data_dir, entry))
        )
        rng.shuffle(class_files)

        num_files = len(class_files)
        num_train = int(train_ratio * num_files)
        num_val = round(val_ratio * num_files)
        num_feature_extractor = round(feature_extractor_ratio * num_files)
        val_end = num_train + num_val
        feature_extractor_end = val_end + num_feature_extractor

        subsets = (
            (train_dir, class_files[:num_train]),
            (val_dir, class_files[num_train:val_end]),
            (feature_extractor_dir, class_files[val_end:feature_extractor_end]),
            (test_dir, class_files[feature_extractor_end:]),
        )
        for subset_root, subset_files in subsets:
            class_subset_dir = os.path.join(subset_root, class_dir)
            os.makedirs(class_subset_dir, exist_ok=True)
            for file_name in subset_files:
                shutil.copy(
                    os.path.join(class_data_dir, file_name),
                    os.path.join(class_subset_dir, file_name),
                )


In [None]:
# Copy files from each class folder under data_unseen_all into the
# feature-extractor folder and the unseen-test folder.
split_unseen_dataset()

In [7]:
# Copy every class folder under data_dir into the train, validation,
# feature-extractor and test split folders.
split_dataset(data_dir)