In [None]:
# 1. Mount Google Drive (if your data is on Drive).
# The Drive contents become reachable under /content/drive; every dataset path
# used in this notebook starts with /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 2. Import necessary libraries
import os
import re
import shutil

# Define source directory containing your images
source_dir = '/content/drive/MyDrive/UTKFace/55_75'  # Change this path to your images folder

# Define the base directory where classified images will be stored
dest_base_dir = '/content/drive/MyDrive/UTKFace/classified_images_55_75'  # Change as needed

# Define mappings for gender and race based on expected values in file names
gender_map = {'0': "Male", '1': "Female"}
race_map = {'0': "White", '1': "Black", '2': "Asian", '3': "Indian", '4': "Others"}

# Create the base destination folder; exist_ok avoids the check-then-create race.
os.makedirs(dest_base_dir, exist_ok=True)

# Regex pattern to extract age, gender, and race from filenames.
# This pattern matches filenames in the format: <age>_<gender>_<race>_<other_info>.jpg
# (optionally followed by ".chip.jpg").
pattern = r"(\d+)_([01])_([0-4])_.*\.jpg(?:\.chip\.jpg)?$"

# .jpg files whose names do not carry all three labels are collected here so
# they can be reported instead of disappearing silently.
skipped_files = []

# 3. Process each file: extract classification info and copy file to respective folder.
# os.listdir returns entries in arbitrary order; sort for a stable, readable log.
for file_name in sorted(os.listdir(source_dir)):
    # Only process .jpg files (modify if needed for other file types)
    if not file_name.lower().endswith(".jpg"):
        continue

    # The extension filter above is case-insensitive, so the pattern must be
    # too; otherwise e.g. "....JPG" passes the filter but never matches.
    match = re.match(pattern, file_name, re.IGNORECASE)
    if not match:
        skipped_files.append(file_name)
        continue

    age, gender_val, race_val = match.groups()
    # Map the extracted string labels to descriptive text using our maps
    gender_class = gender_map[gender_val]
    race_class = race_map[race_val]

    # Create a destination folder for this classification:
    # e.g., /.../classified_images/Male/White/
    dest_folder = os.path.join(dest_base_dir, gender_class, race_class)
    os.makedirs(dest_folder, exist_ok=True)

    # Define source and destination file paths
    src_file = os.path.join(source_dir, file_name)
    dest_file = os.path.join(dest_folder, file_name)

    # Copy the file from source to destination folder
    shutil.copy(src_file, dest_file)

    print(f"Copied {file_name} to {dest_folder}")

if skipped_files:
    print(f"Skipped {len(skipped_files)} .jpg file(s) with unrecognized names: {skipped_files}")


Copied 65_0_2_20170120223049075.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/Asian
Copied 65_0_3_20161220221926818.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/Indian
Copied 65_0_0_20170120222749697.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/White
Copied 65_0_1_20170113174447314.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/Black
Copied 65_0_0_20170120223704629.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/White
Copied 65_0_0_20170120221357579.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/White
Copied 65_0_1_20170120222944643.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/Black
Copied 65_0_1_20170117165348576.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classified_images_55_75/Male/Black
Copied 65_0_1_20170120223219484.jpg.chip.jpg to /content/drive/MyDrive/UTKFace/classifi

In [None]:
# Mount Google Drive to access the dataset.
# When Drive is already mounted, this call only reports that fact (see the
# "Drive already mounted" line in this cell's output).
from google.colab import drive
drive.mount('/content/drive')

import os
import random
import shutil

def split_dataset(source_dir: str, train_dir: str, test_dir: str, train_ratio: float = 0.8, seed: int = 42):
    """
    Splits the images of each class subfolder of 'source_dir' into train and
    test sets according to 'train_ratio', copying (not moving) the files into
    'train_dir/<class>' and 'test_dir/<class>'.

    Parameters:
    - source_dir: str, directory whose immediate subfolders are the classes
                  (e.g. the "Female" folder containing "White", "Black", ...).
    - train_dir: str, path where the training set will be saved.
    - test_dir: str, path where the testing set will be saved.
    - train_ratio: float in [0, 1], fraction of each class placed in the training set. (Default 0.8)
    - seed: int, random seed for shuffling.

    Raises:
    - ValueError: if 'train_ratio' is not within [0, 1].
    - OSError: propagated from os.listdir / shutil.copy2 (e.g. 'source_dir' missing).

    Note: existing files in 'train_dir' / 'test_dir' are not removed, so
    re-running with a different seed or ratio into the same folders can leave
    the same image in both splits.
    """
    # A ratio outside [0, 1] would silently produce a nonsensical slice index.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Private generator: reproducible without reseeding the global `random` state.
    rng = random.Random(seed)

    # List the source first so a bad path fails before any output folder is
    # created. os.listdir order is arbitrary, and all classes share one random
    # stream, so sort to make the split depend on the seed alone.
    class_names = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, name))
    )

    # Create train and test directories if they do not exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for class_name in class_names:
        class_path = os.path.join(source_dir, class_name)

        # Sorted for the same reason as the class names: a fixed starting
        # order is required for a seeded shuffle to be reproducible.
        images = sorted(
            f for f in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, f)) and f.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
        rng.shuffle(images)

        # Determine the split index for train/test based on train_ratio
        split_idx = int(len(images) * train_ratio)
        train_images = images[:split_idx]
        test_images = images[split_idx:]

        # Copy each partition into its per-class folder of the matching split.
        partitions = (
            (os.path.join(train_dir, class_name), train_images),
            (os.path.join(test_dir, class_name), test_images),
        )
        for split_class_dir, split_images in partitions:
            os.makedirs(split_class_dir, exist_ok=True)
            for img in split_images:
                shutil.copy2(os.path.join(class_path, img), os.path.join(split_class_dir, img))

        print(f"Class '{class_name}': {len(train_images)} images in train, {len(test_images)} images in test.")

# Example usage: 80/20 split of the Female subset.
if __name__ == "__main__":
    # Adjust the three paths below to your own Google Drive layout:
    # dataset folder, train output folder, test output folder.
    source_dir, train_dir, test_dir = (
        '/content/drive/MyDrive/UTKFace/classified_images_55_75/Female',
        '/content/drive/MyDrive/UTKFace/55_75_train_female',
        '/content/drive/MyDrive/UTKFace/55_75_test_female',
    )

    split_dataset(
        source_dir,
        train_dir,
        test_dir,
        train_ratio=0.8,
        seed=42,
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Class 'White': 548 images in train, 137 images in test.
Class 'Black': 90 images in train, 23 images in test.
Class 'Asian': 26 images in train, 7 images in test.
Class 'Indian': 65 images in train, 17 images in test.
Class 'Others': 5 images in train, 2 images in test.


In [None]:
# Mount Google Drive to access the dataset.
# When Drive is already mounted, this call only reports that fact (see the
# "Drive already mounted" line in this cell's output).
from google.colab import drive
drive.mount('/content/drive')

import os
import random
import shutil

def split_dataset(source_dir: str, train_dir: str, test_dir: str, train_ratio: float = 0.8, seed: int = 42):
    """
    Splits the images of each class subfolder of 'source_dir' into train and
    test sets according to 'train_ratio', copying (not moving) the files into
    'train_dir/<class>' and 'test_dir/<class>'.

    Parameters:
    - source_dir: str, directory whose immediate subfolders are the classes
                  (e.g. the "Male" folder containing "White", "Black", ...).
    - train_dir: str, path where the training set will be saved.
    - test_dir: str, path where the testing set will be saved.
    - train_ratio: float in [0, 1], fraction of each class placed in the training set. (Default 0.8)
    - seed: int, random seed for shuffling.

    Raises:
    - ValueError: if 'train_ratio' is not within [0, 1].
    - OSError: propagated from os.listdir / shutil.copy2 (e.g. 'source_dir' missing).

    Note: existing files in 'train_dir' / 'test_dir' are not removed, so
    re-running with a different seed or ratio into the same folders can leave
    the same image in both splits.
    """
    # A ratio outside [0, 1] would silently produce a nonsensical slice index.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Private generator: reproducible without reseeding the global `random` state.
    rng = random.Random(seed)

    # List the source first so a bad path fails before any output folder is
    # created. os.listdir order is arbitrary, and all classes share one random
    # stream, so sort to make the split depend on the seed alone.
    class_names = sorted(
        name for name in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, name))
    )

    # Create train and test directories if they do not exist
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    for class_name in class_names:
        class_path = os.path.join(source_dir, class_name)

        # Sorted for the same reason as the class names: a fixed starting
        # order is required for a seeded shuffle to be reproducible.
        images = sorted(
            f for f in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, f)) and f.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
        rng.shuffle(images)

        # Determine the split index for train/test based on train_ratio
        split_idx = int(len(images) * train_ratio)
        train_images = images[:split_idx]
        test_images = images[split_idx:]

        # Copy each partition into its per-class folder of the matching split.
        partitions = (
            (os.path.join(train_dir, class_name), train_images),
            (os.path.join(test_dir, class_name), test_images),
        )
        for split_class_dir, split_images in partitions:
            os.makedirs(split_class_dir, exist_ok=True)
            for img in split_images:
                shutil.copy2(os.path.join(class_path, img), os.path.join(split_class_dir, img))

        print(f"Class '{class_name}': {len(train_images)} images in train, {len(test_images)} images in test.")

# Example usage: 80/20 split of the Male subset.
if __name__ == "__main__":
    # Adjust the three paths below to your own Google Drive layout:
    # dataset folder, train output folder, test output folder.
    source_dir, train_dir, test_dir = (
        '/content/drive/MyDrive/UTKFace/classified_images_55_75/Male',
        '/content/drive/MyDrive/UTKFace/55_75_train_male',
        '/content/drive/MyDrive/UTKFace/55_75_test_male',
    )

    split_dataset(
        source_dir,
        train_dir,
        test_dir,
        train_ratio=0.8,
        seed=42,
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Class 'Asian': 127 images in train, 32 images in test.
Class 'Indian': 244 images in train, 61 images in test.
Class 'White': 882 images in train, 221 images in test.
Class 'Black': 223 images in train, 56 images in test.
Class 'Others': 20 images in train, 5 images in test.


In [16]:
# 1) Mount your Google Drive (if your data is on Drive).
# When Drive is already mounted, this call only reports that fact (see the
# "Drive already mounted" line in this cell's output).
from google.colab import drive
drive.mount('/content/drive')

import os
import shutil

def merge_images_into_single_folder(source_dir: str, output_dir: str):
    """
    Collects (copies) all images from the immediate subfolders of 'source_dir'
    into the single folder 'output_dir'. The originals are left in place.

    If two subfolders contain an image with the same file name, the first one
    (in sorted subfolder order) keeps its name and each later one is stored as
    '<subfolder>_<file name>' so that no image is silently overwritten.

    Parameters:
    - source_dir: The path to the parent directory containing subfolders
                  (e.g., 'Asian', 'Black', 'Indian', 'Others', 'White').
    - output_dir: The single directory where all images will be copied.
                  It is created if missing; if it sits directly inside
                  'source_dir' it is not treated as an input subfolder.

    Raises:
    - OSError: propagated from os.listdir / shutil.copy2 (e.g. 'source_dir' missing).
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    used_names = set()   # destination names already written during this run
    renamed_count = 0

    # Sorted so that the subfolder which keeps the plain name on a collision
    # does not depend on the arbitrary order os.listdir returns.
    for item in sorted(os.listdir(source_dir)):
        item_path = os.path.join(source_dir, item)

        # Proceed only if item is a subfolder (e.g. 'Asian', 'Black', etc.)
        if not os.path.isdir(item_path):
            continue
        # Never read from the folder being written to.
        if os.path.samefile(item_path, output_dir):
            continue

        for file_name in sorted(os.listdir(item_path)):
            src_file_path = os.path.join(item_path, file_name)

            # Check if this is an image (adjust extensions as needed)
            if not (os.path.isfile(src_file_path) and file_name.lower().endswith(image_extensions)):
                continue

            # Choose a destination name that has not been used in this run.
            dst_name = file_name
            attempt = 0
            while dst_name in used_names:
                dst_name = f"{item}_{file_name}" if attempt == 0 else f"{item}_{attempt}_{file_name}"
                attempt += 1
            used_names.add(dst_name)
            if dst_name != file_name:
                renamed_count += 1

            # Copy the file (use shutil.move if you want to remove the original)
            shutil.copy2(src_file_path, os.path.join(output_dir, dst_name))

    if renamed_count:
        print(f"Renamed {renamed_count} image(s) to avoid file name collisions.")
    print("Merging complete! Check your output folder.")

# Example usage: flatten the per-class male training folders into one folder.
if __name__ == "__main__":
    # First path: directory with the subfolders; second path: merged folder.
    source_dir, output_dir = (
        "/content/drive/MyDrive/UTKFace/55_75_train_male",
        "/content/drive/MyDrive/UTKFace/55_75_train_male_merged",
    )

    merge_images_into_single_folder(
        source_dir,
        output_dir,
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Merging complete! Check your output folder.
