In [1]:
import os
import shutil

In [5]:
# Tunable selection parameters.
MIN_IMAGES_PER_IDENTITY = 4   # keep only identities with at least this many images
MAX_IDENTITIES = 1000         # number of identities to select

# Step 1: Read the file and create a dictionary mapping each identity to its images.
# Each non-empty line is unpacked as two whitespace-separated fields: image, identity.
with open('identity_CelebA.txt', 'r') as file:
    lines = file.readlines()

identity_dict = {}
for line in lines:
    fields = line.split()
    if not fields:
        # Blank line (e.g. a trailing newline at the end of the file): nothing to
        # parse. Without this guard the two-name unpacking below raises ValueError.
        continue
    # A line with a field count other than two still raises ValueError on purpose,
    # so a malformed file is noticed instead of being silently mis-parsed.
    image, identity = fields
    identity_dict.setdefault(identity, []).append(image)

# Step 2: Keep only identities that have at least MIN_IMAGES_PER_IDENTITY images
filtered_identities = {
    ident: imgs
    for ident, imgs in identity_dict.items()
    if len(imgs) >= MIN_IMAGES_PER_IDENTITY
}

# Step 3: Select the first MAX_IDENTITIES identities (dicts preserve insertion order,
# so this follows the order in which identities first appear in the file)
selected_identities = list(filtered_identities.keys())[:MAX_IDENTITIES]

# Source directories: src_dir is joined with image names to locate the HR copy,
# lr_dir to locate the LR copy of the same image name.
src_dir = os.path.join("..", "datasets", "celeba_HR_resized_128")
lr_dir = os.path.join("..", "datasets", "celeba_LR_factor_0.25")

# Output root; one sub-directory per selected identity is created beneath it.
parent_dir = os.path.join(os.getcwd(), "positive_pairs")
os.makedirs(parent_dir, exist_ok=True)

In [6]:
# For each selected identity, copy IMAGES_PER_IDENTITY images into a sub-directory of
# parent_dir named after the identity. The same image name is taken from both src_dir
# (HR) and lr_dir (LR), so each sub-directory ends up with 2 * IMAGES_PER_IDENTITY
# files: the HR copies are prefixed with "HR_", the LR copies keep their original name.
IMAGES_PER_IDENTITY = 2

for identity in selected_identities:
    dest_dir = os.path.join(parent_dir, identity)
    os.makedirs(dest_dir, exist_ok=True)

    # Empty the destination so re-running the cell does not leave stale files behind.
    # For a freshly created directory the listing is empty and this loop is a no-op.
    for filename in os.listdir(dest_dir):
        file_path = os.path.join(dest_dir, filename)
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

    count = 0
    for image in filtered_identities[identity]:
        # Stop as soon as the quota is met. Checking here (rather than only after an
        # image was found) avoids scanning the remaining images and printing
        # misleading "not found" messages for images that are no longer needed.
        if count == IMAGES_PER_IDENTITY:
            break

        src_path = os.path.join(src_dir, image)
        lr_path = os.path.join(lr_dir, image)
        if not (os.path.exists(src_path) and os.path.exists(lr_path)):
            # Only use an image when both its HR and LR versions are available.
            print(f"Image {image} not found")
            continue

        # HR copy is renamed with an "HR_" prefix so it does not collide with the
        # LR copy, which keeps the original file name.
        dest_path = os.path.join(dest_dir, f"HR_{image}")
        shutil.copy(src_path, dest_path)
        shutil.copy(lr_path, dest_dir)
        count += 1

    if count < IMAGES_PER_IDENTITY:
        # Surface incomplete identities instead of silently leaving a short folder.
        print(f"Identity {identity}: only {count} of {IMAGES_PER_IDENTITY} image pairs found")