In [1]:
import os
import shutil

import pandas as pd
# Root folder of the CelebA dataset. The functions in this notebook read
# 'list_attr_celeba.csv' and the 'img_align_celeba' image folder from it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
dataset_dir = '/blue/prabhat/shared/datasets/celeba'

In [3]:
def prepare_custom_filtered_dataset(dataset_dir, target_dir, attributes_list1, attributes_list2):
    """Copy the images that match an attribute filter into a class folder.

    Reads ``list_attr_celeba.csv`` from ``dataset_dir``, keeps the rows whose
    columns named in ``attributes_list1`` equal 1 and whose columns named in
    ``attributes_list2`` equal -1, and copies the corresponding files from
    ``<dataset_dir>/img_align_celeba`` into ``<target_dir>/<class folder>``.
    The class folder name is the underscore-joined ``attributes_list1`` names
    followed by the ``attributes_list2`` names prefixed with ``Not_``.

    Args:
        dataset_dir: Folder containing ``list_attr_celeba.csv`` and the
            ``img_align_celeba`` image folder.
        target_dir: Folder under which the class folder is created.
        attributes_list1: Column names whose value must be 1.
        attributes_list2: Column names whose value must be -1.

    Returns:
        None. A summary is printed: the filter, the number of files actually
        copied and, if any, the number of matching rows whose image file was
        not found in the source folder.

    Raises:
        KeyError: If an attribute name is not a column of the CSV file.

    Note:
        When both attribute lists are empty the class folder name is the empty
        string, so every image is copied directly into ``target_dir``.
    """
    attr_file = os.path.join(dataset_dir, 'list_attr_celeba.csv')
    data_info = pd.read_csv(attr_file)

    # Start from an all-True mask that shares the frame's index so the
    # element-wise ANDs below stay aligned row by row.
    condition = pd.Series(True, index=data_info.index)

    # Attributes that must be present (value 1)
    for attr in attributes_list1:
        condition &= (data_info[attr] == 1)

    # Attributes that must be absent (value -1)
    for attr in attributes_list2:
        condition &= (data_info[attr] == -1)

    matching_images = data_info.loc[condition, 'image_id']

    # Folder name combining the attributes; list() also accepts tuples.
    class_folder_name = "_".join(list(attributes_list1) + ['Not_' + attr for attr in attributes_list2])
    target_class_folder = os.path.join(target_dir, class_folder_name)
    os.makedirs(target_class_folder, exist_ok=True)

    source_dir = os.path.join(dataset_dir, 'img_align_celeba')
    copied = 0
    missing = 0
    for image_name in matching_images:
        source_image_path = os.path.join(source_dir, image_name)
        if not os.path.isfile(source_image_path):
            # Listed in the CSV but absent on disk: skip it, but keep track so
            # the reported numbers reflect what really happened.
            missing += 1
            continue
        # copyfile streams the bytes instead of loading each image into memory.
        shutil.copyfile(source_image_path, os.path.join(target_class_folder, image_name))
        copied += 1

    print(f"Prepared dataset with images that match attributes in {attributes_list1} and do not have attributes in {attributes_list2}.")
    # Report the files actually written, not the number of matching CSV rows.
    print(f"Number of images copied: {copied}")
    if missing:
        print(f"Number of matching images missing from source: {missing}")

# Filtered images are written under ./celebA_class_dataset, resolved against the
# current working directory at the time this cell runs.
target_dir = os.path.join(os.getcwd(), 'celebA_class_dataset')

# Attribute filter: every column named in attributes_list1 must equal 1 and
# every column named in attributes_list2 must equal -1.
attributes_list1 = ['Smiling','Young']   # Attributes that should be 1 (present)
attributes_list2 = []      # Attributes that should be -1 (absent)

# Copy the matching images into <target_dir>/Smiling_Young
prepare_custom_filtered_dataset(dataset_dir, target_dir, attributes_list1, attributes_list2)

Prepared dataset with images that match attributes in ['Smiling', 'Young'] and do not have attributes in [].
Number of images copied: 74152


In [5]:
def count_images_with_all_attributes(dataset_dir, attributes_list1, attributes_list2):
    """Print how many rows of the attribute table satisfy the given filter.

    The table is read from ``list_attr_celeba.csv`` inside ``dataset_dir``.
    A row is counted when every column named in ``attributes_list1`` equals 1
    and every column named in ``attributes_list2`` equals -1. Nothing is
    returned; the count is only printed.
    """
    table = pd.read_csv(os.path.join(dataset_dir, 'list_attr_celeba.csv'))

    # One (column, required value) pair per constraint, list1 first.
    required = [(name, 1) for name in attributes_list1]
    required += [(name, -1) for name in attributes_list2]

    row_count = len(table)
    keep = pd.Series([True] * row_count)
    for name, wanted in required:
        keep = keep & (table[name] == wanted)

    matching_images_count = keep.sum()
    print(f"Number of images that match the attributes {attributes_list1} (1) and {attributes_list2} (-1): {matching_images_count}")

# Attribute filter: every column named in attributes_list1 must equal 1 and
# every column named in attributes_list2 must equal -1.
# Note: these assignments rebind the module-level names used by other cells.
attributes_list1 = ['Smiling','Young','Male','Bald']   # Attributes that should be 1 (present)
attributes_list2 = []      # Attributes that should be -1 (absent)

# Print the number of rows matching the filter (no files are copied here)
count_images_with_all_attributes(dataset_dir, attributes_list1, attributes_list2)

Number of images that match the attributes ['Smiling', 'Young', 'Male', 'Bald'] (1) and [] (-1): 520


In [4]:
def count_items_in_folder(folder):
    """Return the number of entries directly inside ``folder``.

    Returns 0 when ``folder`` is not an existing directory.
    """
    if not os.path.isdir(folder):
        return 0
    entries = os.listdir(folder)
    return len(entries)

# Count the entries of one class folder under ./celebA_class_dataset.
# The path is built with os.path.join, like target_dir, instead of string concatenation.
# NOTE(review): no visible cell creates this folder — presumably left by an earlier run; verify.
subfolder = os.path.join(os.getcwd(), "celebA_class_dataset", "Smiling_Young_Male_Bald_No_Beard_Mustache")
print("Number of items:", count_items_in_folder(subfolder))

Number of items: 2


In [9]:
# Attribute filter for this run: no required attributes, 'Male' must be -1.
attributes_list1 = []   # Attributes that should be 1 (present)
attributes_list2 = ['Male']      # Attributes that should be -1 (absent)

# Rebuild the class folder path with the same naming expression that
# prepare_custom_filtered_dataset uses, so the folder can be inspected below.
# Relies on target_dir having been defined by an earlier cell.
class_folder_name = "_".join(attributes_list1 + ['Not_' + attr for attr in attributes_list2])  # Folder name combining the attributes
target_class_folder = os.path.join(target_dir, class_folder_name)

# Copy the images, then cross-check the folder's entry count against the
# number of matching rows in the attribute table.
prepare_custom_filtered_dataset(dataset_dir, target_dir, attributes_list1, attributes_list2)
print("Number of items:", count_items_in_folder(target_class_folder))
count_images_with_all_attributes(dataset_dir, attributes_list1, attributes_list2)

Prepared dataset with images that match attributes in [] and do not have attributes in ['Male'].
Number of images copied: 118165
Number of items: 118165
Number of images that match the attributes [] (1) and ['Male'] (-1): 118165


In [5]:
# Count the entries of the 'Male_60_Not_Male_40' folder under ./celebA_class_dataset.
# The path is built with os.path.join instead of string concatenation.
# NOTE(review): no visible cell creates this folder — presumably built elsewhere; verify.
print("Number of items:", count_items_in_folder(os.path.join(os.getcwd(), "celebA_class_dataset", "Male_60_Not_Male_40")))

Number of items: 80000
