In [None]:
import pandas as pd
import dlib
from tqdm import tqdm
from distutils.dir_util import copy_tree
import os
import itertools
import shutil

In [None]:
# Cross-validation sets and perturbation ratios covered by the experiments.
target_sets = [
    'set1',
    'set5',
    'set8',
    'set10',
]
target_percs = [0.1, 0.25, 0.5]

# Folder names for the single-gender experiments: one gender is perturbed at
# each ratio in target_percs while the other is kept original ("ori").
target_folders = (
    [f'male_{int(ratio * 100)}_female_ori' for ratio in target_percs]
    + [f'male_ori_female_{int(ratio * 100)}' for ratio in target_percs]
)

In [None]:
# Base folder that holds cv_datasets and the generated experiment folders.
# The default is the original machine-specific location; set the
# FYP_PREPROCESSING_DIR environment variable to run the notebook elsewhere
# without editing this cell.
preprocessing = os.environ.get(
    "FYP_PREPROCESSING_DIR",
    "/home/monash/Desktop/fyp-work/fyp-ma-13/fyp-models/preprocessing",
)
# Source cross-validation datasets that the experiment folders are copied from.
root = f"{preprocessing}/cv_datasets"

In [None]:
def copy_folder(fromDir, toDir):
    """
    Recursively copies the folder ``fromDir`` into ``toDir``.

    ``toDir`` (including missing parent folders) is created when it does not
    exist. When it already exists the two trees are merged and files with the
    same name are overwritten.

    Uses ``shutil.copytree`` rather than ``distutils.dir_util.copy_tree``:
    distutils is deprecated (removed from the standard library in Python 3.12)
    and ``copy_tree`` remembers which directories it created for the lifetime
    of the process, so copying again into a folder that was deleted in between
    (easy to hit when re-running notebook cells) fails.

    Args:
        fromDir: path of the existing folder to copy.
        toDir: path of the destination folder.

    Raises:
        OSError: e.g. ``FileNotFoundError`` when ``fromDir`` does not exist.
        shutil.Error: when individual files could not be copied.
    """
    # dirs_exist_ok (Python 3.8+) keeps the merge-into-existing-folder behaviour.
    shutil.copytree(fromDir, toDir, dirs_exist_ok=True)

In [None]:
def debug_num_overlap(experiments=False, experiment_folder = "experiments_20112021", *args, **kwargs):
    """
    Prints, for every set in ``target_sets``, whether any file name occurs in
    both the train and the test folder of the same gender.

    Args:
        experiments: when True, inspect
            ``{preprocessing}/{experiment_folder}/male_<m>[_<male_pert>]_female_<f>[_<female_pert>]``;
            when False, inspect ``{preprocessing}/cv_datasets_debiased_<perc*100>``.
        experiment_folder: experiment folder name, only used when ``experiments`` is True.
        **kwargs:
            With ``experiments=True``: ``male_perc`` and ``female_perc`` (required;
            a float or the string 'ori') and ``male_pert`` / ``female_pert``
            (optional). The perturbation names become part of the folder name
            only when both are given.
            With ``experiments=False``: ``perc`` (required, a number).

    Raises:
        KeyError: a required keyword argument is missing.
        AssertionError: ``male_perc`` / ``female_perc`` is neither a float nor 'ori'.
        OSError: one of the expected train/test folders does not exist.
    """
    if experiments:
        male_perc, female_perc = kwargs['male_perc'], kwargs['female_perc']
        # The perturbation names are optional; .get() lets the un-suffixed
        # "male_x_female_x" branch below be reached when they are omitted
        # instead of failing with a KeyError.
        male_pert, female_pert = kwargs.get('male_pert'), kwargs.get('female_pert')
        assert isinstance(male_perc, float) or male_perc == 'ori'
        assert isinstance(female_perc, float) or female_perc == 'ori'
        # Fractions become integer percent labels (truncated); 'ori' is kept as is.
        male_label = int(male_perc*100) if isinstance(male_perc, float) else male_perc
        female_label = int(female_perc*100) if isinstance(female_perc, float) else female_perc
        if male_pert is not None and female_pert is not None:
            target_folder = f"male_{male_label}_{male_pert}_female_{female_label}_{female_pert}"
        else:
            target_folder = f"male_{male_label}_female_{female_label}" # male_x_female_x
        target_fp = f"{preprocessing}/{experiment_folder}/{target_folder}"
    else:
        target_fp = f"{preprocessing}/cv_datasets_debiased_{int(kwargs['perc']*100)}"

    for s in target_sets:
        # File names shared by the train and test folder, per gender.
        overlap = {}
        for gender in ('male', 'female'):
            train_names = set(os.listdir(f'{target_fp}/{s}/train/{gender}'))
            test_names = set(os.listdir(f'{target_fp}/{s}/test/{gender}'))
            overlap[gender] = train_names & test_names
        m_overlap, f_overlap = overlap['male'], overlap['female']
        if len(m_overlap) > 0:
            print(f"Male train/test {len(m_overlap)} images overlap for set {s}")
        else:
            print(f"Male no overlap for set {s}")
        if len(f_overlap) > 0:
            print(f"Female train/test {len(f_overlap)} images overlap for {s}")
        else:
            print(f"Female no overlap for set {s}")

In [None]:
# Test old dataset: check each cv_datasets_debiased_<perc> folder
# (the experiments=False path of debug_num_overlap) for train/test overlap.
for perc in target_percs:
    debug_num_overlap(perc=perc)
    print("----------------------")

In [None]:
# Test new perturbed datasets in experiments_12122021 for train/test overlap.
# Only the glasses/glasses combination is checked here; pass 'makeup' as
# male_pert/female_pert to check the makeup datasets instead.
for perc in target_percs:
    debug_num_overlap(male_perc=perc, female_perc=perc, 
                      male_pert='glasses', female_pert='glasses',
                      experiments=True, experiment_folder = "experiments_12122021")
    print("----------------------")

In [None]:
# Check that every male and female train folder holds the expected number of images
def debug_train_length_aux(male_perc, female_perc, expected_len=11610,
                           experiment_folder="experiments_20112021"):
    """
    Asserts that ``train/male`` and ``train/female`` of every set in
    ``target_sets`` contain exactly ``expected_len`` entries.

    The folder inspected is
    ``{preprocessing}/{experiment_folder}/male_<m>_female_<f>``, where a float
    percentage is turned into an integer percent label (truncated) and 'ori'
    is kept as is.

    Args:
        male_perc: float fraction of perturbed male images, or 'ori'.
        female_perc: float fraction of perturbed female images, or 'ori'.
        expected_len: number of entries each train folder must contain
            (defaults to the previously hard-coded 11610).
        experiment_folder: experiment folder name under ``preprocessing``
            (defaults to the previously hard-coded "experiments_20112021").

    Returns:
        True when every folder has the expected length.

    Raises:
        AssertionError: invalid percentage argument, or a folder with a
            different number of entries.
        OSError: one of the expected train folders does not exist.
    """
    assert isinstance(male_perc, float) or male_perc == 'ori'
    assert isinstance(female_perc, float) or female_perc == 'ori'
    male_label = int(male_perc*100) if isinstance(male_perc, float) else male_perc
    female_label = int(female_perc*100) if isinstance(female_perc, float) else female_perc
    target_folder = f"male_{male_label}_female_{female_label}" # male_x_female_x
    target_fp = f"{preprocessing}/{experiment_folder}/{target_folder}"

    for s in target_sets:
        m_lst = os.listdir(f'{target_fp}/{s}/train/male')
        f_lst = os.listdir(f'{target_fp}/{s}/train/female')
        assert len(m_lst) == expected_len, f'Wrong male length - {len(m_lst)}'
        assert len(f_lst) == expected_len, f'Wrong female length - {len(f_lst)}'
    return True

def debug_train_length():
    """
    Runs ``debug_train_length_aux`` for every percentage in ``target_percs``
    in three configurations: female-only, male-only and both genders perturbed.

    Iterates over ``target_percs`` (defined with the other settings near the
    top of the notebook) rather than ``percs``, which is only assigned in the
    final dataset-generation cell and is therefore undefined when this check
    is run first on a fresh kernel.

    Raises:
        AssertionError: propagated from ``debug_train_length_aux`` when a
            train folder has an unexpected length.
    """
    # for female perturbation only
    for perc in target_percs:
        debug_train_length_aux('ori', perc)

    # for male perturbation only
    for perc in target_percs:
        debug_train_length_aux(perc, 'ori')

    # for both
    for perc in target_percs:
        debug_train_length_aux(perc, perc)

In [None]:
def gen_debiased_datasets(male_perc, female_perc, male_pert, female_pert, experiments_folder):
    """
    Generates a debiased dataset in which a portion of the training images is perturbed.

    1. Copies every set in ``target_sets`` (set1, set5, set8 and set10) from
       cv_datasets (``root``) to
       ``{preprocessing}/{experiments_folder}/male_<m>_<male_pert>_female_<f>_<female_pert>``.
    2. Copies ``int(len(train folder) * perc)`` perturbed images from
       ``<male_pert>_male/`` / ``<female_pert>_female/`` into the copied
       ``train/male`` / ``train/female`` folders. A perturbed image replaces a
       training image only when both have the same file name; otherwise it is
       added next to the existing images.

    The perturbed-image folders are relative paths, so they are resolved
    against the current working directory. Perturbed images are taken in
    sorted file-name order so that repeated runs pick the same images
    (``os.listdir`` itself returns entries in arbitrary order).

    Args:
        male_perc: float fraction of male training images to perturb, or 'ori'
            to leave the male training images untouched.
        female_perc: same as ``male_perc`` for the female training images.
        male_pert: perturbation applied to males, 'glasses' or 'makeup'.
        female_pert: perturbation applied to females, 'glasses' or 'makeup'.
        experiments_folder: name of the output folder under ``preprocessing``.

    Raises:
        AssertionError: invalid percentage or perturbation name. All arguments
            are checked before anything is copied.
        IndexError: a perturbed-image folder holds fewer images than have to
            be copied into a train folder.
        OSError: a source, perturbed-image or train folder is missing.
    """
    # Validate every argument up front so that a bad call cannot leave a
    # half-built dataset folder behind.
    assert isinstance(male_perc, float) or male_perc == 'ori'
    assert isinstance(female_perc, float) or female_perc == 'ori'
    # Change here to change perturbations
    assert male_pert in ['glasses', 'makeup']
    assert female_pert in ['glasses', 'makeup']

    # Fractions become integer percent labels (truncated); 'ori' is kept as is.
    male_label = int(male_perc*100) if isinstance(male_perc, float) else male_perc
    female_label = int(female_perc*100) if isinstance(female_perc, float) else female_perc
    target_folder = f"male_{male_label}_{male_pert}_female_{female_label}_{female_pert}"
    target_fp = f"{preprocessing}/{experiments_folder}/{target_folder}"

    # List the perturbed images once per gender (the listing does not depend
    # on the set). Genders marked 'ori' are skipped and their folder is never read.
    pert_sources = []
    for gender, perc, pert in (('male', male_perc, male_pert),
                               ('female', female_perc, female_pert)):
        if isinstance(perc, float):
            pert_fp = f'{pert}_{gender}/'
            pert_sources.append((gender, perc, pert_fp, sorted(os.listdir(pert_fp))))

    # 1. Copy set1, set5, set8 and set10 from cv_datasets to target_fp
    print("Copying set1, set5, set8 and set10 from cv_datasets to target_fp...")
    for s in target_sets:
        copy_folder(f"{root}/{s}", f'{target_fp}/{s}')

    # 2. Replace training images with perturbed
    print('Replacing training images with perturbed...')
    for s in target_sets:
        for gender, perc, pert_fp, pert_images in pert_sources:
            train_dir = f'{target_fp}/{s}/train/{gender}'
            num_replace = int(len(os.listdir(train_dir))*perc) # Number of images to replace
            if num_replace > len(pert_images):
                # Fail before touching this train folder instead of part-way through it.
                raise IndexError(
                    f"Need {num_replace} perturbed {gender} images for set {s} "
                    f"but '{pert_fp}' only contains {len(pert_images)}"
                )
            for name in tqdm(pert_images[:num_replace], f'Replacing training {gender} images for set {s}...'):
                shutil.copy(f'{pert_fp}{name}', train_dir)

In [None]:
# Fractions of training images to perturb, and the output folder
# (created under `preprocessing`) for this run.
percs = [0.1, 0.25, 0.5]
experiments_folder = 'experiments_12122021'

# compile makeup dataset (currently disabled)
# for perc in percs:
#     gen_debiased_datasets(perc, perc, 'makeup', 'makeup', experiments_folder)

# compile glasses dataset: both genders perturbed with glasses at each fraction
for perc in percs:
    gen_debiased_datasets(perc, perc, 'glasses', 'glasses', experiments_folder)

# NOTE(review): the disabled calls below use an older two-argument form of
# gen_debiased_datasets; the current signature also requires male_pert,
# female_pert and experiments_folder, so they must be updated before re-enabling.

# # for female perturbation only
# for perc in percs:
#     gen_debiased_datasets('ori', perc)

# # for male perturbation only
# for perc in percs: 
#     gen_debiased_datasets(perc, 'ori')    

# # for both
# for perc in percs:
#     gen_debiased_datasets(perc, perc)