In [1]:
import pandas as pd
import dlib
from tqdm import tqdm
from distutils.dir_util import copy_tree
import os
import itertools
import shutil

In [2]:
# Dataset splits processed by the helpers below.
target_sets = [f'set{idx}' for idx in (1, 5, 8, 10)]
# Fractions of a training folder to perturb.
target_percs = [0.1, 0.25, 0.5]
# Experiment folder names: one gender perturbed at 10/25/50 %, the other left
# as 'ori' (male-perturbed names first, then female-perturbed names).
target_folders = (
    [f'male_{pct}_female_ori' for pct in (10, 25, 50)]
    + [f'male_ori_female_{pct}' for pct in (10, 25, 50)]
)

In [3]:
# Root of the preprocessing workspace. The default is the location this
# notebook was originally run from; set the FYP_PREPROCESSING_DIR environment
# variable to run the notebook on another machine without editing this cell.
preprocessing = os.environ.get(
    "FYP_PREPROCESSING_DIR",
    "/home/monash/Desktop/fyp-work/fyp-ma-13/fyp-models/preprocessing",
)
# Folder holding the source sets that get copied into each experiment folder.
root = f"{preprocessing}/cv_datasets"

In [4]:
def copy_folder(fromDir, toDir):
    """
    Recursively copy the directory ``fromDir`` into ``toDir``.

    ``toDir`` (and any missing parent directories) is created when absent.
    When it already exists, the copy is merged into it and files with the
    same relative path are overwritten.

    Implemented with ``shutil.copytree`` instead of
    ``distutils.dir_util.copy_tree``: distutils is deprecated and removed in
    Python 3.12, and ``copy_tree`` remembers directories it has created, so
    copying again into a destination that was deleted during the same kernel
    session fails. ``dirs_exist_ok`` requires Python 3.8+.

    Parameters
    ----------
    fromDir : str or path-like
        Existing directory to copy from.
    toDir : str or path-like
        Destination directory.
    """
    shutil.copytree(fromDir, toDir, dirs_exist_ok=True)

In [11]:
def debug_num_overlap(male_perc, female_perc):
    """
    Report file names that appear in both the train and test folder.

    For the experiment folder ``male_<x>_female_<y>`` selected by the two
    arguments, every set in ``target_sets`` is inspected and one line is
    printed per gender whose train and test folders share file names.
    Nothing is printed for a gender without shared names.

    Parameters
    ----------
    male_perc, female_perc : float or 'ori'
        A float fraction (labelled as ``int(perc * 100)``) or the string
        'ori'. Anything else fails the assertions.
    """
    def _label(perc):
        # Floats become an integer percentage, 'ori' is used verbatim.
        return int(perc * 100) if type(perc) is float else perc

    assert type(male_perc) is float or male_perc == 'ori'
    assert type(female_perc) is float or female_perc == 'ori'
    experiment_name = f"male_{_label(male_perc)}_female_{_label(female_perc)}"
    experiment_dir = f"{preprocessing}/experiments_20112021/{experiment_name}"

    for set_name in target_sets:
        set_dir = f'{experiment_dir}/{set_name}'
        # Same listing order as before: male train/test, then female train/test.
        male_train = set(os.listdir(f'{set_dir}/train/male'))
        male_test = set(os.listdir(f'{set_dir}/test/male'))
        female_train = set(os.listdir(f'{set_dir}/train/female'))
        female_test = set(os.listdir(f'{set_dir}/test/female'))

        male_shared = male_train & male_test
        female_shared = female_train & female_test
        if male_shared:
            print(f"Male train/test {len(male_shared)} images overlap for set {set_name}")
        if female_shared:
            print(f"Female train/test {len(female_shared)} images overlap for {set_name}")

Male train/test 32 images overlap for set set1
Female train/test 247 images overlap for set1


In [6]:
# Check that every set's male and female train folders contain the expected number of images
def debug_train_length_aux(male_perc, female_perc, expected_len=11610):
    """
    Assert that each set's male and female train folders have the expected size.

    The experiment folder ``male_<x>_female_<y>`` is selected by the two
    percentage arguments; every set in ``target_sets`` is checked.

    Parameters
    ----------
    male_perc, female_perc : float or 'ori'
        A float fraction (labelled as ``int(perc * 100)``) or the string
        'ori'. Anything else fails the assertions.
    expected_len : int, optional
        Number of directory entries each train folder must contain.
        Defaults to 11610, the value that used to be hard-coded here.

    Returns
    -------
    bool
        True when every folder has ``expected_len`` entries.

    Raises
    ------
    AssertionError
        If an argument is invalid or a folder has a different number of
        entries (the message reports the actual count).
    """
    assert type(male_perc) == float or male_perc == 'ori'
    assert type(female_perc) == float or female_perc == 'ori'
    male_label = int(male_perc*100) if type(male_perc) == float else male_perc
    female_label = int(female_perc*100) if type(female_perc) == float else female_perc
    target_folder = f"male_{male_label}_female_{female_label}" # male_x_female_x
    target_fp = f"{preprocessing}/experiments_20112021/{target_folder}"

    for s in target_sets:
        m_lst = os.listdir(f'{target_fp}/{s}/train/male')
        f_lst = os.listdir(f'{target_fp}/{s}/train/female')
        assert len(m_lst) == expected_len, f'Wrong male length - {len(m_lst)}'
        assert len(f_lst) == expected_len, f'Wrong female length - {len(f_lst)}'
    return True

def debug_train_length(percs=None):
    """
    Run ``debug_train_length_aux`` for every generated experiment combination.

    Parameters
    ----------
    percs : iterable of float, optional
        Perturbation fractions to check. Defaults to ``target_percs`` from
        the configuration cell, so the function no longer depends on a
        ``percs`` variable that is only created by a later cell.
    """
    if percs is None:
        percs = target_percs

    # for female perturbation only
    for perc in percs:
        debug_train_length_aux('ori', perc)

    # for male perturbation only
    for perc in percs:
        debug_train_length_aux(perc, 'ori')

    # for both
    for perc in percs:
        debug_train_length_aux(perc, perc)

In [7]:
def gen_debiased_datasets(male_perc, female_perc,
                          glasses_fp='glasses_full/', makeup_fp='makeup_female/'):
    """
    Generate a debiased dataset with a portion of the training images perturbed.

    1. Copies every set in ``target_sets`` from ``root`` to
       ``{preprocessing}/experiments_20112021/male_<x>_female_<y>``.
    2. For each gender given as a float, copies ``int(n_train * perc)``
       perturbed images into that gender's train folder of every set.

    Parameters
    ----------
    male_perc, female_perc : float or 'ori'
        Fraction of the train folder size to perturb, or 'ori' to leave that
        gender untouched. Anything else fails the assertions.
    glasses_fp : str, optional
        Folder with perturbed male images. Defaults to the original
        working-directory-relative 'glasses_full/'.
    makeup_fp : str, optional
        Folder with perturbed female images. Defaults to the original
        working-directory-relative 'makeup_female/'.

    Raises
    ------
    AssertionError
        If an argument is neither a float nor 'ori'.
    IndexError
        If a perturbed folder holds fewer images than must be copied. This is
        now detected before any image of that folder is copied.
    """
    assert type(male_perc) == float or male_perc == 'ori'
    assert type(female_perc) == float or female_perc == 'ori'
    # NOTE(review): int() truncates, so a fraction such as 0.29 is labelled 28
    # (0.29 * 100 == 28.999...). Kept unchanged so the folder names keep
    # matching the ones the debug helpers look up.
    male_label = int(male_perc*100) if type(male_perc) == float else male_perc
    female_label = int(female_perc*100) if type(female_perc) == float else female_perc
    target_folder = f"male_{male_label}_female_{female_label}" # male_x_female_x
    target_fp = f"{preprocessing}/experiments_20112021/{target_folder}"

    # 1. Copy the sets from cv_datasets to target_fp
    print("Copying set1, set5, set8 and set10 from cv_datasets to target_fp...")
    for s in target_sets:
        copy_folder(f"{root}/{s}", f'{target_fp}/{s}')

    # 2. Replace training images with perturbed
    print('Replacing training images with perturbed...')
    for s in target_sets:
        if type(male_perc) == float:
            _copy_perturbed_into_train(
                f'{target_fp}/{s}/train/male', glasses_fp, male_perc,
                f'Replacing training male images for set {s}...')
        if type(female_perc) == float:
            _copy_perturbed_into_train(
                f'{target_fp}/{s}/train/female', makeup_fp, female_perc,
                f'Replacing training female images for set {s}...')


def _copy_perturbed_into_train(train_dir, perturbed_dir, perc, desc):
    """Copy the first ``int(len(train_dir) * perc)`` perturbed images (by name) into ``train_dir``."""
    num_replace = int(len(os.listdir(train_dir)) * perc)  # Number of images to copy
    # os.listdir returns entries in arbitrary order; sort so the same images
    # are selected on every run and on every machine.
    perturbed = sorted(os.listdir(perturbed_dir))
    if num_replace > len(perturbed):
        # Fail before copying anything instead of part-way through the loop.
        raise IndexError(
            f"{perturbed_dir!r} holds {len(perturbed)} images but "
            f"{num_replace} are needed for {train_dir!r}")
    # NOTE(review): shutil.copy only overwrites a training image when the
    # perturbed file has the same file name; any other name is added to the
    # folder instead. Confirm the perturbed file names match this set's
    # training file names (a name that exists only in the test folder would
    # end up in both train and test).
    for name in tqdm(perturbed[:num_replace], desc):
        shutil.copy(os.path.join(perturbed_dir, name), train_dir)

In [8]:
# Baseline run: with both arguments set to 'ori' neither perturbation branch
# executes, so this only copies the sets into male_ori_female_ori.
gen_debiased_datasets('ori', 'ori')

Copying set1, set5, set8 and set10 from cv_datasets to target_fp...
Replacing training images with perturbed...


In [6]:
# Perturbation fractions to generate datasets for.
percs = [0.1, 0.25, 0.5]

# (male_perc, female_perc) pairs in generation order:
# female perturbation only, then male perturbation only, then both.
runs = (
    [('ori', frac) for frac in percs]
    + [(frac, 'ori') for frac in percs]
    + [(frac, frac) for frac in percs]
)
for male_frac, female_frac in runs:
    gen_debiased_datasets(male_frac, female_frac)

Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training female images for set set1...: 100%|██████████| 1161/1161 [00:00<00:00, 18400.87it/s]
Replacing training female images for set set5...: 100%|██████████| 1161/1161 [00:00<00:00, 14783.79it/s]
Replacing training female images for set set8...:   0%|          | 0/1161 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training female images for set set8...: 100%|██████████| 1161/1161 [00:00<00:00, 14550.94it/s]
Replacing training female images for set set10...: 100%|██████████| 1161/1161 [00:00<00:00, 14765.90it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training female images for set set1...: 100%|██████████| 2902/2902 [00:00<00:00, 19142.98it/s]
Replacing training female images for set set5...:   0%|          | 0/2902 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training female images for set set5...: 100%|██████████| 2902/2902 [00:00<00:00, 14554.33it/s]
Replacing training female images for set set8...: 100%|██████████| 2902/2902 [00:00<00:00, 15251.14it/s]
Replacing training female images for set set10...: 100%|██████████| 2902/2902 [00:00<00:00, 15041.63it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training female images for set set1...:  33%|███▎      | 1913/5805 [00:00<00:00, 19119.94it/s]

Replacing training images with perturbed...


Replacing training female images for set set1...: 100%|██████████| 5805/5805 [00:00<00:00, 18331.07it/s]
Replacing training female images for set set5...: 100%|██████████| 5805/5805 [00:00<00:00, 14146.05it/s]
Replacing training female images for set set8...: 100%|██████████| 5805/5805 [00:00<00:00, 14246.18it/s]
Replacing training female images for set set10...: 100%|██████████| 5805/5805 [00:00<00:00, 16036.45it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...: 100%|██████████| 1161/1161 [00:00<00:00, 17861.85it/s]
Replacing training male images for set set5...: 100%|██████████| 1161/1161 [00:00<00:00, 13020.81it/s]
Replacing training male images for set set8...:   0%|          | 0/1161 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training male images for set set8...: 100%|██████████| 1161/1161 [00:00<00:00, 14021.39it/s]
Replacing training male images for set set10...: 100%|██████████| 1161/1161 [00:00<00:00, 14557.81it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...: 100%|██████████| 2902/2902 [00:00<00:00, 17538.74it/s]
Replacing training male images for set set5...:   0%|          | 0/2902 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training male images for set set5...: 100%|██████████| 2902/2902 [00:00<00:00, 13887.05it/s]
Replacing training male images for set set8...: 100%|██████████| 2902/2902 [00:00<00:00, 13981.38it/s]
Replacing training male images for set set10...: 100%|██████████| 2902/2902 [00:00<00:00, 14453.84it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...:  31%|███       | 1780/5805 [00:00<00:00, 17792.59it/s]

Replacing training images with perturbed...


Replacing training male images for set set1...: 100%|██████████| 5805/5805 [00:00<00:00, 17514.48it/s]
Replacing training male images for set set5...: 100%|██████████| 5805/5805 [00:00<00:00, 15546.72it/s]
Replacing training male images for set set8...: 100%|██████████| 5805/5805 [00:00<00:00, 14563.39it/s]
Replacing training male images for set set10...: 100%|██████████| 5805/5805 [00:00<00:00, 14246.76it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...: 100%|██████████| 1161/1161 [00:00<00:00, 18056.44it/s]
Replacing training female images for set set1...: 100%|██████████| 1161/1161 [00:00<00:00, 19182.33it/s]
Replacing training male images for set set5...:   0%|          | 0/1161 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training male images for set set5...: 100%|██████████| 1161/1161 [00:00<00:00, 13569.60it/s]
Replacing training female images for set set5...: 100%|██████████| 1161/1161 [00:00<00:00, 15680.42it/s]
Replacing training male images for set set8...: 100%|██████████| 1161/1161 [00:00<00:00, 14260.53it/s]
Replacing training female images for set set8...: 100%|██████████| 1161/1161 [00:00<00:00, 15596.95it/s]
Replacing training male images for set set10...: 100%|██████████| 1161/1161 [00:00<00:00, 14477.61it/s]
Replacing training female images for set set10...: 100%|██████████| 1161/1161 [00:00<00:00, 15560.72it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...: 100%|██████████| 2902/2902 [00:00<00:00, 17277.49it/s]
Replacing training female images for set set1...:   0%|          | 0/2902 [00:00<?, ?it/s]

Replacing training images with perturbed...


Replacing training female images for set set1...: 100%|██████████| 2902/2902 [00:00<00:00, 18691.36it/s]
Replacing training male images for set set5...: 100%|██████████| 2902/2902 [00:00<00:00, 12184.16it/s]
Replacing training female images for set set5...: 100%|██████████| 2902/2902 [00:00<00:00, 14395.45it/s]
Replacing training male images for set set8...: 100%|██████████| 2902/2902 [00:00<00:00, 13243.02it/s]
Replacing training female images for set set8...: 100%|██████████| 2902/2902 [00:00<00:00, 13612.86it/s]
Replacing training male images for set set10...: 100%|██████████| 2902/2902 [00:00<00:00, 14067.05it/s]
Replacing training female images for set set10...: 100%|██████████| 2902/2902 [00:00<00:00, 15041.91it/s]


Copying set1, set5, set8 and set10 from cv_datasets to target_fp...


Replacing training male images for set set1...:  31%|███       | 1805/5805 [00:00<00:00, 18043.05it/s]

Replacing training images with perturbed...


Replacing training male images for set set1...: 100%|██████████| 5805/5805 [00:00<00:00, 17818.00it/s]
Replacing training female images for set set1...: 100%|██████████| 5805/5805 [00:00<00:00, 17522.16it/s]
Replacing training male images for set set5...: 100%|██████████| 5805/5805 [00:00<00:00, 14775.70it/s]
Replacing training female images for set set5...: 100%|██████████| 5805/5805 [00:00<00:00, 15309.56it/s]
Replacing training male images for set set8...: 100%|██████████| 5805/5805 [00:00<00:00, 14045.45it/s]
Replacing training female images for set set8...: 100%|██████████| 5805/5805 [00:00<00:00, 14574.64it/s]
Replacing training male images for set set10...: 100%|██████████| 5805/5805 [00:00<00:00, 13560.68it/s]
Replacing training female images for set set10...: 100%|██████████| 5805/5805 [00:00<00:00, 14550.94it/s]
