### Generating CSV Files with Kinship Image Pairs and Non-Kinship Image Pairs

We'll first generate the kinship image pairs to determine how many non-kinship image pairs we need. 

We will not use any images from families ```F0900 - F1000```, as they are held out for combining the results of the different models. 

**Step 1**: For each kinship pair listed in ```train_relationships.csv```, form every combination of one image from each member (the Cartesian product) and write all resulting image pairs to a new CSV file. 

In [None]:
import pandas as pd
import os
import itertools
from tqdm import tqdm

### Paths to Change Start###
# Dataset root; every other path below is derived from it.
ROOT = "/home/UG/c200140/recognizing-faces-in-the-wild"
RELATIONSHIPS_PATH = f"{ROOT}/train_relationships.csv"
# train_relationships.csv probably uses the train folder since F0009 is in train but not in train-faces
IMAGE_PATH = f"{ROOT}/train"
# Folders that receive the generated train / test pair CSV files.
TRAIN_FOLDER = f"{ROOT}/Excel/Train (New)"
TEST_FOLDER = f"{ROOT}/Excel/Test (New)"
### Paths to Change End###

In [None]:
# Per-bucket tallies. The bucket is the third character of the family folder
# name, i.e. the hundreds digit for names shaped like "F0123".
# NOTE(review): a name such as "F1000" has "0" in that position and is therefore
# counted in bucket 0 -- confirm this is intended.
num_members = [0] * 10
num_images = [0] * 10

for family in os.listdir(IMAGE_PATH):
    family_dir = f"{IMAGE_PATH}/{family}"
    members = os.listdir(family_dir)
    bucket = int(family[2]) % 10
    num_members[bucket] += len(members)
    num_images[bucket] += sum(
        len(os.listdir(f"{family_dir}/{member}")) for member in members
    )

print(f"Total Number of Families in Dataset: {len(os.listdir(IMAGE_PATH))}")
print(f"Total Number of Unique Members in Dataset: {num_members}")
print(f"Total Number of Images in Dataset: {num_images}")

In [None]:
def generate_image_pairs(path1, path2, image_root=None):
    """Return every (image of path1, image of path2) combination.

    Args:
        path1: Member folder relative to the image root, e.g. "F0001/MID1".
        path2: Second member folder, same format as ``path1``.
        image_root: Directory that contains the family folders. When omitted,
            the module-level ``IMAGE_PATH`` is used.

    Returns:
        A list of ``(image1, image2)`` tuples holding the Cartesian product of
        the two members' images. Each image is written as the member path
        joined with the file name with ".jpg" removed, e.g.
        "F0001/MID1/P00001_face2".

    Raises:
        FileNotFoundError: If either member folder does not exist.
    """
    root = IMAGE_PATH if image_root is None else image_root

    def list_images(member_path):
        # os.listdir returns names in arbitrary order; sorting makes repeated
        # runs emit the same pairs in the same order.
        names = sorted(os.listdir(f"{root}/{member_path}"))
        return [member_path + "/" + name.replace(".jpg", "") for name in names]

    # Cartesian product - https://www.geeksforgeeks.org/python-itertools-product/
    return list(itertools.product(list_images(path1), list_images(path2)))

In [None]:
# Sanity check on one known pair: F0002/MID1 has 10 images and F0002/MID2 has 9,
# so 10 * 9 = 90 image pairs are expected.
sample_pairs = generate_image_pairs("F0002/MID1", "F0002/MID2")
print(len(sample_pairs))

In [None]:
# The relationships file is identical for every validation fold, so read it once.
train_relationships_df = pd.read_csv(RELATIONSHIPS_PATH)

for i in range(9):
    # Families LOWER_BOUND .. UPPER_BOUND - 1 form the held-out (test) fold.
    LOWER_BOUND = i * 100
    UPPER_BOUND = (i + 1) * 100 # To include F1000, set UPPER_BOUND to 1001
    val_set = f"V0{i}"
    print(f"Val Set: {val_set}, Lower Bound: {LOWER_BOUND}, Upper Bound: {UPPER_BOUND}")

    kinship_image_pairs_dataset_train = []
    kinship_image_pairs_dataset_test = []

    # For each row (member1, member2) in train_relationships.csv, pair up all of
    # their images. itertuples(name=None) yields plain tuples, so row[0] / row[1]
    # are true positional lookups.
    for row in tqdm(
        train_relationships_df.itertuples(index=False, name=None),
        total=len(train_relationships_df),
        desc="Progress",
    ):
        # Family numbers parsed from entries shaped like "F0001/MID1".
        family1 = int(row[0][1:5])
        family2 = int(row[1][1:5])

        ### Exclude V09 for Testing
        if family1 >= 900 or family2 >= 900:
            continue

        try:
            image_pairs = generate_image_pairs(row[0], row[1])
        except FileNotFoundError as e:
            # A listed member folder is missing on disk: report it and move on.
            print(f"{e}")
            continue

        # TEST SET only when BOTH members fall inside the current fold. Each
        # side must be a proper chained comparison; writing
        # "(LOWER_BOUND <= x) < UPPER_BOUND" compares a bool with the bound
        # and is always true.
        in_test_fold = (
            LOWER_BOUND <= family1 < UPPER_BOUND
            and LOWER_BOUND <= family2 < UPPER_BOUND
        )
        if in_test_fold:
            kinship_image_pairs_dataset_test.extend(image_pairs)
        else:
            # TRAIN SET
            kinship_image_pairs_dataset_train.extend(image_pairs)

    kinship_image_pairs_dataset_train = pd.DataFrame(kinship_image_pairs_dataset_train, columns=["Image 1", "Image 2"])
    kinship_image_pairs_dataset_test = pd.DataFrame(kinship_image_pairs_dataset_test, columns=["Image 1", "Image 2"])

    KINSHIP_SIZE_TRAIN = len(kinship_image_pairs_dataset_train)
    KINSHIP_SIZE_TEST = len(kinship_image_pairs_dataset_test)

    kinship_image_pairs_dataset_train.to_csv(TRAIN_FOLDER + f"/train-kin-pairs-{val_set}.csv", index = False) # Save to CSV without index column
    kinship_image_pairs_dataset_test.to_csv(TEST_FOLDER + f"/test-kin-pairs-{val_set}.csv", index = False) # Save to CSV without index column

    print(f"Total Number of Kinship Image Pairs (Train): {KINSHIP_SIZE_TRAIN}")
    print(f"Total Number of Kinship Image Pairs (Test): {KINSHIP_SIZE_TEST}")

    #Free Memory
    kinship_image_pairs_dataset_train = None
    kinship_image_pairs_dataset_test = None

#Free Memory
train_relationships_df = None

**Step 2**: Generate enough non-kinship image pairs to match the number of kinship image pairs, and save them to a new CSV file

In [None]:
import random
import math

In [None]:
# Split the family folders into train / test using the LOWER_BOUND / UPPER_BOUND
# values currently defined at module level.
family_train = []
family_test = []
for family in os.listdir(IMAGE_PATH):
    family_number = int(family[1:])
    # Families numbered 900 and above are held out entirely. The cutoff is
    # inclusive (>= 900) so that F0900 itself is excluded, consistent with the
    # other fold-splitting code in this file.
    if family_number >= 900:
        continue
    if LOWER_BOUND <= family_number < UPPER_BOUND:
        family_test.append(family)
    else:
        family_train.append(family)

# Every unordered pair of distinct families within each split.
family_pairs_train_df = pd.DataFrame(list(itertools.combinations(family_train, 2)))
family_pairs_test_df = pd.DataFrame(list(itertools.combinations(family_test, 2)))

print(f"Total Number of Possible Family Pairs Combination (Train): {len(family_pairs_train_df)}")
print(f"Total Number of Possible Family Pairs Combination (Test): {len(family_pairs_test_df)}")

In [None]:
def _list_member_images(family_dir):
    """Return ``[(member, [image names without ".jpg"])]`` for non-empty members."""
    members = []
    for member in os.listdir(family_dir):
        stems = [name.replace(".jpg", "") for name in os.listdir(f"{family_dir}/{member}")]
        # Apparently there are some empty folders such as F0101/MID11; they can
        # never contribute an image, so leave them out of the sampling pool.
        if stems:
            members.append((member, stems))
    return members


def generate_random_image_pair(family1, family2, samples_for_each_pair, image_root=None):
    """Sample unique random image pairs across two different families.

    Each draw picks a random member (that has at least one image) from each
    family and then a random image of that member.

    Args:
        family1: First family folder name, e.g. "F0001".
        family2: Second family folder name, same format.
        samples_for_each_pair: Number of unique pairs requested. It is capped
            at (images in family1) * (images in family2), the number of
            distinct pairs that exist.
        image_root: Directory that contains the family folders. When omitted,
            the module-level ``IMAGE_PATH`` is used.

    Returns:
        A list of unique ``(image1, image2)`` tuples, each image written as
        "<family>/<member>/<file name without .jpg>". The list is empty when
        either family has no images or no samples are requested.
    """
    root = IMAGE_PATH if image_root is None else image_root

    # List both families once up front instead of on every sampling attempt.
    members1 = _list_member_images(f"{root}/{family1}")
    members2 = _list_member_images(f"{root}/{family2}")

    count1 = sum(len(stems) for _, stems in members1)
    count2 = sum(len(stems) for _, stems in members2)
    target = min(count1 * count2, samples_for_each_pair)

    image_pairs = []
    seen = set()  # constant-time duplicate check

    while len(image_pairs) < target:
        # Choose a random member from each family, then a random image of it.
        member1, stems1 = random.choice(members1)
        member2, stems2 = random.choice(members2)
        image1 = family1 + "/" + member1 + "/" + random.choice(stems1)
        image2 = family2 + "/" + member2 + "/" + random.choice(stems2)

        # Ensure no duplicate pairs are generated
        pair = (image1, image2)
        if pair not in seen:
            seen.add(pair)
            image_pairs.append(pair)

    return image_pairs

In [None]:
# Smoke test: request 10 random non-kinship image pairs between F0002 and F0005.
# Left as a bare expression so the notebook displays the returned list.
generate_random_image_pair("F0002", "F0005", 10) #just for testing

In [None]:
for i in range(9):
    # Families LOWER_BOUND .. UPPER_BOUND - 1 form the held-out (test) fold.
    LOWER_BOUND = i * 100
    UPPER_BOUND = (i + 1) * 100
    val_set = f"V0{i}"

    # The kinship CSVs for this fold determine how many non-kinship pairs are needed.
    kin_train_df = pd.read_csv(TRAIN_FOLDER + f"/train-kin-pairs-{val_set}.csv")
    kin_test_df = pd.read_csv(TEST_FOLDER + f"/test-kin-pairs-{val_set}.csv")

    KINSHIP_SIZE_TRAIN = len(kin_train_df)
    KINSHIP_SIZE_TEST = len(kin_test_df)

    print(KINSHIP_SIZE_TRAIN, KINSHIP_SIZE_TEST)

    family_train = []
    family_test = []
    for family in os.listdir(IMAGE_PATH):
        family_number = int(family[1:])
        # Families numbered 900 and above are held out entirely.
        if family_number >= 900:
            continue
        if LOWER_BOUND <= family_number < UPPER_BOUND:
            family_test.append(family)
        else:
            family_train.append(family)

    # Every unordered pair of distinct families within each split.
    family_pairs_train_df = pd.DataFrame(list(itertools.combinations(family_train, 2)))
    family_pairs_test_df = pd.DataFrame(list(itertools.combinations(family_test, 2)))

    print(f"Total Number of Possible Family Pairs Combination (Train): {len(family_pairs_train_df)}")
    print(f"Total Number of Possible Family Pairs Combination (Test): {len(family_pairs_test_df)}")

    # Samples requested per family pair, rounded up so the combined total
    # reaches the kinship count when every pair can supply its share.
    # max(..., 1) only guards the division when a split has no family pairs.
    samples_per_train_pair = math.ceil(KINSHIP_SIZE_TRAIN / max(len(family_pairs_train_df), 1))
    samples_per_test_pair = math.ceil(KINSHIP_SIZE_TEST / max(len(family_pairs_test_df), 1))

    non_kinship_image_pairs_dataset_train = []
    non_kinship_image_pairs_dataset_test = []

    # itertuples(name=None) yields plain (family1, family2) tuples.
    for family1, family2 in tqdm(
        family_pairs_train_df.itertuples(index=False, name=None),
        total=len(family_pairs_train_df),
        desc="Progress",
    ):
        image_pairs = generate_random_image_pair(family1, family2, samples_per_train_pair)
        non_kinship_image_pairs_dataset_train.extend(image_pairs)

    for family1, family2 in tqdm(
        family_pairs_test_df.itertuples(index=False, name=None),
        total=len(family_pairs_test_df),
        desc="Progress",
    ):
        image_pairs = generate_random_image_pair(family1, family2, samples_per_test_pair)
        non_kinship_image_pairs_dataset_test.extend(image_pairs)

    non_kinship_image_pairs_dataset_train = pd.DataFrame(non_kinship_image_pairs_dataset_train, columns=["Image 1", "Image 2"])
    non_kinship_image_pairs_dataset_test = pd.DataFrame(non_kinship_image_pairs_dataset_test, columns=["Image 1", "Image 2"])

    print("Before Sampling")
    print(f"Total Number of Non-Kinship Image Pairs (Train): {len(non_kinship_image_pairs_dataset_train)}")
    print(f"Total Number of Non-Kinship Image Pairs (Test): {len(non_kinship_image_pairs_dataset_test)}")

    # Randomly sample KINSHIP_SIZE non-kinship image pairs so both classes have
    # the same number of rows. sample() raises ValueError if fewer pairs were
    # generated than requested.
    non_kinship_image_pairs_dataset_train = non_kinship_image_pairs_dataset_train.sample(n = KINSHIP_SIZE_TRAIN)
    non_kinship_image_pairs_dataset_test = non_kinship_image_pairs_dataset_test.sample(n = KINSHIP_SIZE_TEST)

    print("After Sampling")
    print(f"Total Number of Non-Kinship Image Pairs (Train): {len(non_kinship_image_pairs_dataset_train)}")
    print(f"Total Number of Non-Kinship Image Pairs (Test): {len(non_kinship_image_pairs_dataset_test)}")

    non_kinship_image_pairs_dataset_train.to_csv(TRAIN_FOLDER + f"/train-non-kin-pairs-{val_set}.csv", index = False) # Save to CSV without index column
    non_kinship_image_pairs_dataset_test.to_csv(TEST_FOLDER + f"/test-non-kin-pairs-{val_set}.csv", index = False) # Save to CSV without index column

    # Free memory
    family_pairs_train_df = None
    family_pairs_test_df = None
    non_kinship_image_pairs_dataset_train = None
    non_kinship_image_pairs_dataset_test = None