# Making the Training/Validation Sets

This notebook constructs the training/validation (or training/testing) sets while preserving, within each label (doctors, nurses), the original ratio of the sensitive group (male vs. female).

In [15]:
import os
import random
import shutil
import random

In [16]:
# Output folders of the split. Each profession gets one combined folder plus one
# folder per gender. Note that the "val_*" names point into the "test" subtree.
train_path_dr = "train_test_split/train/doctors"
val_path_dr = "train_test_split/test/doctors"
train_path_dr_male = "train_test_split/train/doctors_male"
val_path_dr_male = "train_test_split/test/doctors_male"
train_path_dr_female = "train_test_split/train/doctors_female"
val_path_dr_female = "train_test_split/test/doctors_female"

# Create the doctor folders; exist_ok=True means an existing folder is not an error.
for folder in (
    train_path_dr, val_path_dr,
    train_path_dr_male, train_path_dr_female,
    val_path_dr_male, val_path_dr_female,
):
    os.makedirs(folder, exist_ok=True)

train_path_nur = "train_test_split/train/nurses"
val_path_nur = "train_test_split/test/nurses"
train_path_nur_male = "train_test_split/train/nurses_male"
val_path_nur_male = "train_test_split/test/nurses_male"
train_path_nur_female = "train_test_split/train/nurses_female"
val_path_nur_female = "train_test_split/test/nurses_female"

# Same folder layout for the nurses.
for folder in (
    train_path_nur, val_path_nur,
    train_path_nur_male, train_path_nur_female,
    val_path_nur_male, val_path_nur_female,
):
    os.makedirs(folder, exist_ok=True)

# Source folders with the doctor images, one per (gender, dark/light) combination.
# The trailing slash matters: later cells build file paths by plain concatenation.
# NOTE(review): the number in each folder name is presumably its image count - confirm.
path_dr_f_d = 'dr/fem_dr_dark_56/'
path_dr_f_l = 'dr/fem_dr_light_256/'
path_dr_m_d = 'dr/mal_dr_dark_62/'
path_dr_m_l = 'dr/mal_dr_light_308/'

# File names found in each doctor folder
dr_f_d, dr_f_l, dr_m_d, dr_m_l = map(
    os.listdir, (path_dr_f_d, path_dr_f_l, path_dr_m_d, path_dr_m_l)
)

# Source folders with the nurse images, same layout as for the doctors.
path_nur_f_d = 'nurse/fem_nurse_dark_63/'
path_nur_f_l = 'nurse/fem_nurse_light_252/'
path_nur_m_d = 'nurse/mal_nurse_dark_76/'
path_nur_m_l = 'nurse/mal_nurse_light_203/'

# File names found in each nurse folder
nur_f_d, nur_f_l, nur_m_d, nur_m_l = map(
    os.listdir, (path_nur_f_d, path_nur_f_l, path_nur_m_d, path_nur_m_l)
)

In [17]:
# Fraction of each (profession, gender) group that is held out for validation
split = 0.25

# Doctors: head-count per gender (dark and light folders pooled) and overall
dr_m = sum(len(files) for files in (dr_m_d, dr_m_l))
dr_f = sum(len(files) for files in (dr_f_d, dr_f_l))
dr = dr_m + dr_f

# Doctors: share of men, and the complementary share of women
r_dr_m = dr_m / dr
r_dr_w = 1 - r_dr_m

# Nurses: head-count per gender (dark and light folders pooled) and overall
nur_m = sum(len(files) for files in (nur_m_d, nur_m_l))
nur_f = sum(len(files) for files in (nur_f_d, nur_f_l))
nur = nur_m + nur_f

# Nurses: share of men, and the complementary share of women
r_nur_m = nur_m / nur
r_nur_w = 1 - r_nur_m

In [23]:
# Seed for the split. Nothing in this cell removes files copied by an earlier run,
# so drawing a different validation subset on a re-run would leave the same image
# in both the train and the test folders. A fixed seed keeps the draw stable as
# long as the file lists keep their order (os.listdir does not guarantee one).
SPLIT_SEED = 0


def _split_and_copy(dark_dir, dark_files, light_dir, light_files,
                    train_dir, val_dir, train_dir_gender, val_dir_gender,
                    val_fraction, rng):
    """Split one (profession, gender) group and copy its images.

    Every image is copied twice: into the combined profession folder and into
    the gender-specific folder. Both copies always land on the same side of the
    split, because both destinations are chosen from the same index set.

    Args:
        dark_dir, light_dir: source folders, each ending with a path separator
            (file paths are built by plain concatenation).
        dark_files, light_files: file names inside those folders.
        train_dir, val_dir: combined train / validation folders of the profession.
        train_dir_gender, val_dir_gender: gender-specific train / validation folders.
        val_fraction: fraction of the group that goes to validation; the count
            is truncated with int().
        rng: random.Random instance used to draw the validation subset.

    Returns:
        Set of validation indices; an index refers to the position in
        dark_files followed by light_files.
    """
    # Build the full source list up front instead of deriving the folder from
    # index arithmetic; this also works when one of the two folders is empty.
    sources = [dark_dir + name for name in dark_files]
    sources += [light_dir + name for name in light_files]

    val_indices = set(rng.sample(range(len(sources)), int(len(sources) * val_fraction)))

    for i, src in enumerate(sources):
        in_val = i in val_indices
        shutil.copy(src, val_dir if in_val else train_dir)
        shutil.copy(src, val_dir_gender if in_val else train_dir_gender)

    return val_indices


# Private generator: the split is repeatable without touching the global random state
_split_rng = random.Random(SPLIT_SEED)

# Male doctors
val_dr_m = _split_and_copy(path_dr_m_d, dr_m_d, path_dr_m_l, dr_m_l,
                           train_path_dr, val_path_dr,
                           train_path_dr_male, val_path_dr_male,
                           split, _split_rng)

# Female doctors
val_dr_f = _split_and_copy(path_dr_f_d, dr_f_d, path_dr_f_l, dr_f_l,
                           train_path_dr, val_path_dr,
                           train_path_dr_female, val_path_dr_female,
                           split, _split_rng)

# Male nurses
val_nur_m = _split_and_copy(path_nur_m_d, nur_m_d, path_nur_m_l, nur_m_l,
                            train_path_nur, val_path_nur,
                            train_path_nur_male, val_path_nur_male,
                            split, _split_rng)

# Female nurses
val_nur_f = _split_and_copy(path_nur_f_d, nur_f_d, path_nur_f_l, nur_f_l,
                            train_path_nur, val_path_nur,
                            train_path_nur_female, val_path_nur_female,
                            split, _split_rng)

## Make the biased training set

In [24]:
# Bias ratio: approximate target share of the majority gender (male doctors,
# female nurses) in the biased training set. With BIAS = 1 no minority image is copied.
BIAS = 1
if not 0 < BIAS <= 1:
    # BIAS = 0 would divide by zero below; BIAS > 1 would give a negative quota.
    raise ValueError(f"BIAS must be in (0, 1], got {BIAS}")

# Define paths for biased training set for doctors and nurses
train_b_path_dr = f"train_test_split/train_{BIAS}/doctors"
os.makedirs(train_b_path_dr, exist_ok=True)

train_b_path_nur = f"train_test_split/train_{BIAS}/nurses"
os.makedirs(train_b_path_nur, exist_ok=True)

# Minority quota: majority_train * (1 - BIAS) / BIAS, where the majority training
# count is approximated by majority_total * (1 - split).
dr_f_t = int((1-BIAS)*dr_m/BIAS*(1-split))
nur_m_t = int((1-BIAS)*nur_f/BIAS*(1-split))


def _copy_biased(images, src_dir, dst_dir, majority_names, minority_names, minority_quota):
    """Copy a biased selection of one profession's training images.

    All images whose name is in majority_names are copied. Images whose name is
    in minority_names are copied, in the order given by images, until
    minority_quota of them have been copied. Any other image is skipped.

    Args:
        images: file names present in src_dir.
        src_dir, dst_dir: source and destination folders.
        majority_names, minority_names: sets of file names of each gender.
        minority_quota: maximum number of minority images to copy.

    Returns:
        Number of minority images that were copied.
    """
    minority_copied = 0
    for im in images:
        # The majority check comes first, so a name present in both sets is
        # treated as majority.
        if im in majority_names:
            pass
        elif minority_copied < minority_quota and im in minority_names:
            minority_copied += 1
        else:
            continue
        shutil.copy(os.path.join(src_dir, im), os.path.join(dst_dir, im))
    return minority_copied


# Copy the images of doctors and nurses to the biased training set.
# Name sets are built once so that each membership test is a constant-time lookup.
train_dr, train_nur = os.listdir(train_path_dr), os.listdir(train_path_nur)

c_dr = _copy_biased(train_dr, train_path_dr, train_b_path_dr,
                    set(dr_m_d) | set(dr_m_l), set(dr_f_d) | set(dr_f_l), dr_f_t)

c_nur = _copy_biased(train_nur, train_path_nur, train_b_path_nur,
                     set(nur_f_d) | set(nur_f_l), set(nur_m_d) | set(nur_m_l), nur_m_t)