In [1]:
import random
import pandas as pd
import os  
from tqdm import tqdm
import numpy as np
import shutil

## Utils

In [2]:
def save_csv(identity_selection, save_path, file_path):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    identity_selection.to_csv(save_path + file_path, index=False)

In [44]:
def get_race_info(balanced_data):
    race_info = []

    for race in balanced_data['race'].unique():
        race_data = balanced_data[balanced_data['race'] == race]
        unique_ids = race_data['id'].nunique()
        num_images = len(race_data)

        race_info.append({'Race': race, 'Unique IDs': unique_ids, 'Number of Images': num_images})

    race_info_df = pd.DataFrame(race_info)
    print(race_info_df)

def check_data_leakage(balanced_data, set_of_datasets):
    """
    check_data_leakage(balanced_data, {'african_unbalanced': african_unbalanced, 'asian_unbalanced': asian_unbalanced, 'caucasian_unbalanced': caucasian_unbalanced, 'indian_unbalanced': indian_unbalanced})
    or
    check_data_leakage(balanced_data, {'linear_prob_dataset': linear_prob_dataset})

    """
    for dataset_name, dataset in set_of_datasets.items():
        merged_data = pd.merge(dataset, balanced_data, how='outer', indicator=True)

        # Rows that appear in both DataFrames
        overlap = merged_data[merged_data['_merge'] == 'both']

        # If overlap is empty, there is no overlap between the two DataFrames
        if overlap.empty:
            print(f"There is no overlap between {dataset_name} and the new balanced data.")
        else:
            print(f"There is overlap between {dataset_name} and the new balanced data.")

        # Check identity leakage
        dataset_id = dataset['id'].tolist()
        balanced_data_id = balanced_data['id'].tolist()

        print(f"Number of identity overlap between {dataset_name} and balanced data: ", len(set(dataset_id).intersection(balanced_data_id)))


## Get all data

In [96]:
import pandas as pd
import os
def get_all_data_csv(data_path):
    '''get csv of all data, race, id, image_name'''
    coloumns = ['image_name', 'id', 'race']
    print(data_path)

    df = pd.DataFrame(columns=coloumns)
    all_image = []
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

    for race in race_to_label.keys():
        race_folder = os.path.join(data_path+'/', race)
        print(race_folder)
        race_label = race_to_label[race]
        for id_folder in os.listdir(race_folder):
            if not 'm.' in id_folder:
                continue
            all_id_image = []

            id_path = os.path.join(race_folder, id_folder)
            id = id_folder.split("/")[0]

            for image in os.listdir(id_path):
                all_id_image.append(image)
            for image in all_id_image:
                all_image.append([image, id, race_label])
        
    df = pd.DataFrame(all_image, columns=coloumns)

    return df

In [None]:
all_data_csv = get_all_data_csv("./race_per_7000")

Get balanced data

In [97]:
def get_balanced_per_race(race_df, image_per_id_limit=60, total_img=30000, id_per_race=500):
    '''
    get a new df with 60 image per id
    draw id_per_race uniformly 
    draw image_per_id uniformly
    '''
    # Group by ID and count the number of images per ID
    data_grouped = race_df.groupby("id").count()
    
    # Filter IDs with at least 'image_per_id_limit' images
    data_grouped = data_grouped[data_grouped["image_name"] >= image_per_id_limit]
    
    # Get a list of IDs that meet the image count criteria
    eligible_ids = data_grouped.index.tolist()
    
    # Shuffle the list of eligible IDs
    random.shuffle(eligible_ids)
    
    # Take the first 'id_per_race' IDs to ensure uniform distribution
    selected_ids = eligible_ids[:id_per_race]
    
    # Filter the DataFrame to include only rows with selected IDs
    filtered_id_df = race_df[race_df["id"].isin(selected_ids)]
    
    # Randomly sample images from each selected ID to meet the 'total_img' requirement
    num_selected_images = 0
    selected_rows = []
    
    for id in selected_ids:
        id_df = filtered_id_df[filtered_id_df["id"] == id]
        num_images_for_id = min(image_per_id_limit, total_img - num_selected_images)
        
        # Randomly sample 'num_images_for_id' images for the current ID
        sampled_rows = id_df.sample(n=num_images_for_id, random_state=42)
        selected_rows.extend(sampled_rows.values)
        num_selected_images += num_images_for_id
        
        if num_selected_images >= total_img:
            break
    
    # Create a new DataFrame from the selected rows
    new_df = pd.DataFrame(selected_rows, columns=['image_name', 'id', 'race'])
    
    return new_df

In [98]:
def get_balanced(data, image_per_id_limit=60, total_image=120000, id_per_race=500): 
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    balanced_data = pd.DataFrame()

    for race in race_to_label.keys():
        race_label = race_to_label[race]
        cur_race_data = data[data['race']==race_label]
        race_balanced_df = get_balanced_per_race(cur_race_data, image_per_id_limit, total_image, id_per_race)
        balanced_data = pd.concat([balanced_data, race_balanced_df])
    
    return balanced_data

In [99]:
data = pd.read_csv("all_data.csv")

# Balanced Dataset

In [100]:
balanced_data = get_balanced(data, 60, 120000, 500)

In [13]:
balanced_data = pd.read_csv("./balanced_data.csv")
print(len(balanced_data))
print(balanced_data.head())
print(balanced_data.groupby('id').count()['image_name'])
print(balanced_data.groupby('race').count()['image_name'])

120000
               image_name        id  race
0    0-FaceId-0_align.jpg  m.01q9hh     0
1  102-FaceId-0_align.jpg  m.01q9hh     0
2   33-FaceId-0_align.jpg  m.01q9hh     0
3   14-FaceId-0_align.jpg  m.01q9hh     0
4   51-FaceId-0_align.jpg  m.01q9hh     0
id
m.0100bl7z    60
m.0101hx8n    60
m.0104r_91    60
m.0105jt_f    60
m.010887hy    60
              ..
m.0zmx0h5     60
m.0zrryfx     60
m.0zs796g     60
m.0ztd97b     60
m.0ztdp_f     60
Name: image_name, Length: 2000, dtype: int64
race
0    30000
1    30000
2    30000
3    30000
Name: image_name, dtype: int64


In [102]:
save_csv(balanced_data, "./", "balanced_data.csv")

## Linear Prob Dataset
- Linear prob: 3-10 id/race, 40 id in total, 60 img/id, == 2.4k image. 50 img/id train… 

### 1. Sample 10 ids from each race, each of them already have 60 img/race

In [24]:
import pandas as pd
def sample_linear_prob(original_balanced_data):
    # Load your dataset
    original_balanced_data = pd.read_csv("./balanced_data.csv")

    # Initialize an empty DataFrame to store the sampled data
    sampled_data = pd.DataFrame()

    # Get unique races
    races = original_balanced_data['race'].unique()

    # For each race, randomly sample 10 ids and add them to sampled_data
    for race in races:
        sampled_ids = original_balanced_data[original_balanced_data['race'] == race]['id'].drop_duplicates().sample(10)
        sampled_data = pd.concat([sampled_data, original_balanced_data[original_balanced_data['id'].isin(sampled_ids)]])

    # Remove the sampled ids from original_balanced_data
    new_balanced_data = original_balanced_data[~original_balanced_data['id'].isin(sampled_data['id'])]
    return sampled_data, new_balanced_data


In [25]:
original_balanced_data = pd.read_csv("./balanced_data.csv")

linear_prob_dataset,  new_balanced_data= sample_linear_prob(original_balanced_data)

### 2. Filter Linear Prob ids from Old Uniform to create New Uniform
    - Should contain 118k images
    - Check for data leakage

In [46]:
check_data_leakage(new_balanced_data, {'linear_prob_dataset': linear_prob_dataset})

There is no overlap between linear_prob_dataset and the new balanced data.
Number of identity overlap between linear_prob_dataset and balanced data:  0


### 3. SAVE linear prob and new balanced


In [31]:
save_csv(linear_prob_dataset, "./", "linear_prob_dataset.csv")
save_csv(balanced_data, "./", "new_balanced_data.csv")

Filter out balanced data and get disjoint_balanced_data

In [109]:
balanced_data = pd.read_csv("./new_balanced_data.csv")
data = pd.read_csv("./all_data.csv")
#filter out everything in balanced_data
filtered_data = data[~data['id'].isin(balanced_data['id'])]

#filter out id with less than 20 images
filtered_data = filtered_data.groupby('id').filter(lambda x: len(x) >= 20)

#check number of id per race
id_per_race = filtered_data.groupby('race')['id'].nunique()

In [110]:
len(balanced_data)

117600

In [111]:
save_csv(filtered_data, "./", "disjoint_balanced_data.csv")

Sample biased datasets for each race

In [115]:
import random
from tqdm import tqdm

def sample_by_race(race_data, id_per_race, image_per_id):
    total_img = id_per_race * image_per_id

    # Group by ID and count the number of images per ID
    data_grouped = race_data.groupby("id").count()
    
    # Get a list of IDs that meet the image count criteria
    eligible_ids = data_grouped.index.tolist()
    
    # Shuffle the list of eligible IDs
    random.shuffle(eligible_ids)
    
    # Take the first 'id_per_race' IDs to ensure uniform distribution
    selected_ids = eligible_ids[:id_per_race]
    
    # Filter the DataFrame to include only rows with selected IDs
    filtered_id_df = race_data[race_data["id"].isin(selected_ids)]
    
    # Randomly sample images from each selected ID to meet the 'total_img' requirement
    num_selected_images = 0
    selected_rows = []
    
    for id in tqdm(selected_ids, desc="Sampling Images"):
        id_df = filtered_id_df[filtered_id_df["id"] == id]
        num_images_for_id = min(image_per_id, total_img - num_selected_images)
        
        # Randomly sample 'num_images_for_id' images for the current ID
        sampled_rows = id_df.sample(n=num_images_for_id, random_state=42)
        selected_rows.extend(sampled_rows.values)
        num_selected_images += num_images_for_id
        
        if num_selected_images >= total_img:
            break
    
    # Create a new DataFrame from the selected rows
    new_df = pd.DataFrame(selected_rows, columns=['image_name', 'id', 'race'])

    return new_df


def generate_unbalanced_datasets(filtered_csv, output_dir, majority_num_ids=3600, minority_num_ids=780, image_per_id=20):
    race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
    label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}
    
    filtered_data = pd.read_csv(filtered_csv)
    coloumns = ['image_name', 'id', 'race']

    for race, label in race_to_label.items():
        print(f"Race: {race}")
        print(f"Label: {label}")

        cur_label = label
        cur_biased_data = pd.DataFrame(columns=coloumns)

        for iter_label, _ in label_to_race.items():
            print(f"iter_label: {iter_label}")
            if iter_label == cur_label:
                majority_race = filtered_data[filtered_data['race'] == cur_label]
                majority_sample = sample_by_race(majority_race, majority_num_ids, image_per_id)
                cur_biased_data = pd.concat([cur_biased_data, majority_sample])
            else:
                minority_sample = filtered_data[filtered_data['race'] == iter_label]
                minority_sample = sample_by_race(minority_sample, minority_num_ids, image_per_id)
                cur_biased_data = pd.concat([cur_biased_data, minority_sample])

        # Save the unbalanced data as CSV.
        cur_biased_data.to_csv(f'{output_dir}/unbalanced_{race}.csv', index=False)

In [116]:
generate_unbalanced_datasets("./disjoint_balanced_data.csv", "./", majority_num_ids=3600, minority_num_ids=780, image_per_id=20)

Race: Caucasian
Label: 0
iter_label: 0


Sampling Images: 100%|█████████▉| 3599/3600 [01:41<00:00, 35.38it/s]


iter_label: 1


Sampling Images: 100%|█████████▉| 779/780 [00:06<00:00, 123.08it/s]


iter_label: 2


Sampling Images: 100%|█████████▉| 779/780 [00:05<00:00, 130.28it/s]


iter_label: 3


Sampling Images: 100%|█████████▉| 779/780 [00:08<00:00, 89.62it/s] 


Race: Indian
Label: 1
iter_label: 0


Sampling Images: 100%|█████████▉| 779/780 [00:11<00:00, 66.20it/s] 


iter_label: 1


Sampling Images: 100%|█████████▉| 3599/3600 [01:58<00:00, 30.40it/s]


iter_label: 2


Sampling Images: 100%|█████████▉| 779/780 [00:06<00:00, 122.70it/s]


iter_label: 3


Sampling Images: 100%|█████████▉| 779/780 [00:08<00:00, 92.85it/s] 


Race: Asian
Label: 2
iter_label: 0


Sampling Images: 100%|█████████▉| 779/780 [00:07<00:00, 101.65it/s]


iter_label: 1


Sampling Images: 100%|█████████▉| 779/780 [00:09<00:00, 84.91it/s] 


iter_label: 2


Sampling Images: 100%|█████████▉| 3599/3600 [02:00<00:00, 29.80it/s]


iter_label: 3


Sampling Images: 100%|█████████▉| 779/780 [00:07<00:00, 108.32it/s]


Race: African
Label: 3
iter_label: 0


Sampling Images: 100%|█████████▉| 779/780 [00:08<00:00, 95.84it/s] 


iter_label: 1


Sampling Images: 100%|█████████▉| 779/780 [00:06<00:00, 115.14it/s]


iter_label: 2


Sampling Images: 100%|█████████▉| 779/780 [00:06<00:00, 112.94it/s]


iter_label: 3


Sampling Images: 100%|█████████▉| 3599/3600 [02:03<00:00, 29.21it/s]


Check data leakage

In [35]:

african_unbalnced = pd.read_csv("./unbalanced_African.csv")
asian_unbalnced = pd.read_csv("./unbalanced_Asian.csv")
caucasian_unbalnced = pd.read_csv("./unbalanced_Caucasian.csv")
indian_unbalnced = pd.read_csv("./unbalanced_Indian.csv")
balanced_data = pd.read_csv("./new_balanced_data.csv")

In [118]:
print(african_unbalnced.groupby('id').count()['image_name'].value_counts())
print(asian_unbalnced.groupby('id').count()['image_name'].value_counts())
print("Caucasian unbalanced", caucasian_unbalnced.groupby('id').count()['image_name'].value_counts())
print("Indian unbalanced", indian_unbalnced.groupby('id').count()['image_name'].value_counts())

image_name
20    5940
Name: count, dtype: int64
image_name
20    5940
Name: count, dtype: int64
Caucasian unbalanced image_name
20    5940
Name: count, dtype: int64
Indian unbalanced image_name
20    5940
Name: count, dtype: int64


In [119]:
african_unbalnced.groupby('race')['id'].nunique()

race
0     780
1     780
2     780
3    3600
Name: id, dtype: int64

In [120]:
#check identity leakage
african_unbalnced_id = african_unbalnced['id'].tolist()
asian_unbalnced_id = asian_unbalnced['id'].tolist()
caucasian_unbalnced_id = caucasian_unbalnced['id'].tolist()
indian_unbalnced_id = indian_unbalnced['id'].tolist()

balanced_data_id = balanced_data['id'].tolist()

print("number of identity overlap between african_unbalnced and balanced_data: ", len(set(african_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between asian_unbalnced and balanced_data: ", len(set(asian_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between caucasian_unbalnced and balanced_data: ", len(set(caucasian_unbalnced_id).intersection(balanced_data_id)))
print("number of identity overlap between indian_unbalnced and balanced_data: ", len(set(indian_unbalnced_id).intersection(balanced_data_id)))

number of identity overlap between african_unbalnced and balanced_data:  0
number of identity overlap between asian_unbalnced and balanced_data:  0
number of identity overlap between caucasian_unbalnced and balanced_data:  0
number of identity overlap between indian_unbalnced and balanced_data:  0


Get Train and Test in Balanced


In [186]:
def mark_train_test(data, percentage_test_per_id=0.2):
    image_per_id = data.groupby('id').count()['image_name'][0]
    print(f"image_per_id: {image_per_id}")
    num_test_per_id = (image_per_id * percentage_test_per_id).astype(int)
    print(f"num_test_per_id: {num_test_per_id}")

    result_df = data.copy()
    result_df['split'] = 'train'
    print(f"result_df: {result_df}")

    unique_ids = data['id'].unique()
    print(f"len(unique_ids): {len(unique_ids)}")

    # return
    for id in unique_ids:
        test_indices = data[data['id'] == id].head(num_test_per_id).index
        result_df.loc[test_indices, 'split'] = 'test'
    return result_df

In [130]:
data = """000020_00@en.jpg,m.0j5bv6r,1,test
000000_00@fa.jpg,m.0j5bv6r,1,test
000001_00@fa.jpg,m.0j5bv6r,1,test
000048_00@fa.jpg,m.0j5bv6r,1,test
000006_00@en.jpg,m.0j5bv6r,1,test
000018_00@fa.jpg,m.0j5bv6r,1,test
000044_00@en.jpg,m.0j5bv6r,1,test
000066_00@en.jpg,m.0j5bv6r,1,test
000021_00@fa.jpg,m.0j5bv6r,1,test
000006_00@fa.jpg,m.0j5bv6r,1,test
000059_00@en.jpg,m.0j5bv6r,1,test
000005_01@en.jpg,m.0j5bv6r,1,test
000058_00@en.jpg,m.0j5bv6r,1,train
000026_00@en.jpg,m.0j5bv6r,1,train
000003_00@en.jpg,m.0j5bv6r,1,train
000061_00@en.jpg,m.0j5bv6r,1,train
000024_00@fa.jpg,m.0j5bv6r,1,train
000025_00@en.jpg,m.0j5bv6r,1,train
000028_00@en.jpg,m.0j5bv6r,1,train
000069_00@en.jpg,m.0j5bv6r,1,train
000049_00@fa.jpg,m.0j5bv6r,1,train
000060_00@en.jpg,m.0j5bv6r,1,train
000014_00@en.jpg,m.0j5bv6r,1,train
000037_00@en.jpg,m.0j5bv6r,1,train
000007_00@fa.jpg,m.0j5bv6r,1,train
000055_00@en.jpg,m.0j5bv6r,1,train
000054_00@en.jpg,m.0j5bv6r,1,train
000025_00@fa.jpg,m.0j5bv6r,1,train
000027_00@en.jpg,m.0j5bv6r,1,train
000013_00@en.jpg,m.0j5bv6r,1,train
000004_00@en.jpg,m.0j5bv6r,1,train
000045_01@en.jpg,m.0j5bv6r,1,train
000021_00@en.jpg,m.0j5bv6r,1,train
000002_00@en.jpg,m.0j5bv6r,1,train
000067_00@en.jpg,m.0j5bv6r,1,train
000068_00@en.jpg,m.0j5bv6r,1,train
000022_00@en.jpg,m.0j5bv6r,1,train
000036_01@en.jpg,m.0j5bv6r,1,train
000000_01@en.jpg,m.0j5bv6r,1,train
000012_00@fa.jpg,m.0j5bv6r,1,train
000047_00@en.jpg,m.0j5bv6r,1,train
000038_00@en.jpg,m.0j5bv6r,1,train
000009_01@en.jpg,m.0j5bv6r,1,train
000045_00@fa.jpg,m.0j5bv6r,1,train
000033_00@fa.jpg,m.0j5bv6r,1,train
000017_00@fa.jpg,m.0j5bv6r,1,train
000035_00@en.jpg,m.0j5bv6r,1,train
000032_00@en.jpg,m.0j5bv6r,1,train
000008_00@en.jpg,m.0j5bv6r,1,train
000071_01@en.jpg,m.0j5bv6r,1,train
000016_00@fa.jpg,m.0j5bv6r,1,train
000024_00@en.jpg,m.0j5bv6r,1,train
000049_00@en.jpg,m.0j5bv6r,1,train
000015_00@en.jpg,m.0j5bv6r,1,train
000040_00@en.jpg,m.0j5bv6r,1,train
000039_00@en.jpg,m.0j5bv6r,1,train
000023_00@en.jpg,m.0j5bv6r,1,train
000057_00@en.jpg,m.0j5bv6r,1,train
000002_00@fa.jpg,m.0j5bv6r,1,train
000005_00@en.jpg,m.0j5bv6r,1,train
"""

# Split the data into lines
lines = data.strip().split('\n')

# Create counters for 'train' and 'test' splits
train_count = 0
test_count = 0

# Iterate through the lines and count 'train' and 'test' images
for line in lines:
    _, _, _, split = line.split(',')
    if split == 'train':
        train_count += 1
    elif split == 'test':
        test_count += 1

print(f"Number of 'train' images: {train_count}")
print(f"Number of 'test' images: {test_count}")


Number of 'train' images: 48
Number of 'test' images: 12


In [None]:
#ignor the warning, it works fine 
race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

balanced_data = pd.read_csv("./unbalanced_African.csv")
african_balanced = balanced_data[balanced_data['race'] == 3]
african_balanced_split = mark_train_test(african_balanced, 0.2)

save_csv(african_balanced_split, "./", "african_balanced_split.csv")

  image_per_id = balanced_data.groupby('id').count()['image_name'][0]


In [181]:
# Define the race-to-label and label-to-race mappings
race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

# Read your balanced data from a CSV file
balanced_data = pd.read_csv("./new_balanced_data.csv")

# Create an empty dictionary to store splits by race
race_splits = {}

# Iterate through races and split the data
for race, label in race_to_label.items():
    race_data = balanced_data[balanced_data['race'] == label]
    
    # Assuming you have a function mark_train_test to split the data
    race_split = mark_train_test(balanced_data, percentage_test_per_id=0.2)
    break
    # Save the split into a CSV file
    save_csv(race_split, "./", f"{race}_balanced_split.csv")

    # Store the split in the dictionary
    race_splits[race] = race_split


image_per_id: 60
num_test_per_id: 12
result_df:                     image_name        id  race  split
0         0-FaceId-0_align.jpg  m.01q9hh     0  train
1       102-FaceId-0_align.jpg  m.01q9hh     0  train
2        33-FaceId-0_align.jpg  m.01q9hh     0  train
3        14-FaceId-0_align.jpg  m.01q9hh     0  train
4        51-FaceId-0_align.jpg  m.01q9hh     0  train
...                        ...       ...   ...    ...
117595    8-FaceId-0_align.jpg  m.0fn4cg     3  train
117596   40-FaceId-0_align.jpg  m.0fn4cg     3  train
117597   37-FaceId-0_align.jpg  m.0fn4cg     3  train
117598    0-FaceId-0_align.jpg  m.0fn4cg     3  train
117599   41-FaceId-0_align.jpg  m.0fn4cg     3  train

[117600 rows x 4 columns]
len(unique_ids): 1960


In [184]:
1960 * 4

7840

In [None]:
from sklearn.model_selection import train_test_split
# Define a function to split each group
def split_data(group):
    train, test = train_test_split(group, test_size=0.2, random_state=42)
    return pd.concat([train, test], keys=['train', 'test'], names=['split'])

# Apply the function to each group
data = data.groupby('id').apply(split_data).reset_index(level='split')


In [187]:
import pandas as pd

# Define the race-to-label and label-to-race mappings
race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}

# Load unbalanced datasets
african_unbalanced = pd.read_csv("./unbalanced_African.csv")
asian_unbalanced = pd.read_csv("./unbalanced_Asian.csv")
caucasian_unbalanced = pd.read_csv("./unbalanced_Caucasian.csv")
indian_unbalanced = pd.read_csv("./unbalanced_Indian.csv")

# Create an empty dictionary to store splits by race for unbalanced datasets
unbalanced_race_splits = {}
len(f"african_unbalanced: {african_unbalanced}")
len(f"asian_unbalanced: {asian_unbalanced}")
len(f"caucasian_unbalanced: {caucasian_unbalanced}")
len(f"indian_unbalanced: {indian_unbalanced}")


# Iterate through races and split the unbalanced data
for race, _ in race_to_label.items():
    if race == 'African':
        race_data = african_unbalanced
    elif race == 'Asian':
        race_data = asian_unbalanced
    elif race == 'Caucasian':
        race_data = caucasian_unbalanced
    elif race == 'Indian':
        race_data = indian_unbalanced
    
    # Assuming you have a function mark_train_test to split the data
    print(f"race: {race}")
    race_split = mark_train_test(race_data, percentage_test_per_id=0.2)
    # break
    # Save the split into a CSV file
    save_csv(race_split, "./", f"unbalanced_{race}_split.csv")

    # Store the split in the dictionary
    unbalanced_race_splits[race] = race_split

# Now you have splits for each race within the unbalanced datasets stored in unbalanced_race_splits


race: Caucasian
image_per_id: 20
num_test_per_id: 4
result_df:                     image_name         id  race  split
0        28-FaceId-1_align.jpg  m.0g550l7     0  train
1         5-FaceId-0_align.jpg  m.0g550l7     0  train
2         3-FaceId-0_align.jpg  m.0g550l7     0  train
3        15-FaceId-0_align.jpg  m.0g550l7     0  train
4        47-FaceId-0_align.jpg  m.0g550l7     0  train
...                        ...        ...   ...    ...
118795  105-FaceId-0_align.jpg   m.02pyk8     3  train
118796  102-FaceId-0_align.jpg   m.02pyk8     3  train
118797   89-FaceId-0_align.jpg   m.02pyk8     3  train
118798   35-FaceId-0_align.jpg   m.02pyk8     3  train
118799   21-FaceId-0_align.jpg   m.02pyk8     3  train

[118800 rows x 4 columns]
len(unique_ids): 5940
race: Indian
image_per_id: 20
num_test_per_id: 4
result_df:                     image_name         id  race  split
0        14-FaceId-1_align.jpg  m.01v2gc6     0  train
1        13-FaceId-0_align.jpg  m.01v2gc6     0  train
2  

In [171]:
print(f"african_unbalanced len: {len(african_unbalanced)}")
print(f"asian_unbalanced len: {len(asian_unbalanced)}")
print(f"caucasian_unbalanced len: {len(caucasian_unbalanced)}")
print(f"indian_unbalanced len: {len(indian_unbalanced)}")


african_unbalanced len: 118800
asian_unbalanced len: 118800
caucasian_unbalanced len: 118800
indian_unbalanced len: 118800


In [164]:
def count_unique_ids(train_data, test_data):
    train_unique_ids = train_data['id'].nunique()
    test_unique_ids = test_data['id'].nunique()
    return train_unique_ids, test_unique_ids
# print(race_splits)
# For balanced dataset
for race, split_data in race_splits.items():
    train_data = split_data[split_data['split'] == 'train']
    test_data = split_data[split_data['split'] == 'test']
    train_unique_ids, test_unique_ids = count_unique_ids(train_data, test_data)
    # print(f'Race: {race}, Train Unique IDs: {train_unique_ids}, Test Unique IDs: {test_unique_ids}')
    print(f'Race: {race}, Total len: {len(split_data)}')
    print(f'Race: {race}, Train len: {len(train_data)}, Test len: {len(test_data)}')



# For unbalanced datasets
for race, split_data in unbalanced_race_splits.items():
    train_data = split_data[split_data['split'] == 'train']
    test_data = split_data[split_data['split'] == 'test']
    train_unique_ids, test_unique_ids = count_unique_ids(train_data, test_data)
    # print(f'Race: {race}, Train Unique IDs: {train_unique_ids}, Test Unique IDs: {test_unique_ids}')
    print(f'Race: {race}, Total len: {len(split_data)}')
    print(f'Race: {race}, Train len: {len(train_data)}, Test len: {len(test_data)}')




Race: Caucasian, Total len: 29400
Race: Caucasian, Train len: 23520, Test len: 5880
Race: Indian, Total len: 29400
Race: Indian, Train len: 23520, Test len: 5880
Race: Asian, Total len: 29400
Race: Asian, Train len: 23520, Test len: 5880
Race: African, Total len: 29400
Race: African, Train len: 23520, Test len: 5880


In [173]:
71280>47520

True

In [176]:
5880 / 29400 

0.2

In [165]:
29400 > 118800

False

In [172]:
29400 * 4 - 118800 

-1200

In [133]:
import pandas as pd

# Initialize an empty list to collect the data
id_ratios = []

# For balanced dataset
for race, split_data in race_splits.items():
    # Calculate ratio for each ID
    total_ids = len(train_data['id'].unique()) + len(test_data['id'].unique())
    
    for id in train_data['id'].unique():
        train_ratio = len(train_data[train_data['id'] == id]) / total_ids
        test_ratio = len(test_data[test_data['id'] == id]) / total_ids
        id_ratios.append([id, race, train_ratio, test_ratio])

# For unbalanced datasets
for race, split_data in unbalanced_race_splits.items():
    # Calculate ratio for each ID
    total_ids = len(train_data['id'].unique()) + len(test_data['id'].unique())
    
    for id in train_data['id'].unique():
        train_ratio = len(train_data[train_data['id'] == id]) / total_ids
        test_ratio = len(test_data[test_data['id'] == id]) / total_ids
        id_ratios.append([id, race, train_ratio, test_ratio])

# Create a DataFrame from the list of ratios
id_ratio_df = pd.DataFrame(id_ratios, columns=['ID', 'Race', 'Train Ratio', 'Test Ratio'])

# Print the resulting DataFrame
print(id_ratio_df)


KeyboardInterrupt: 

# Linear Prob Split

In [134]:
linear_prob_dataset = pd.read_csv("./linear_prob_dataset.csv")
linear_prob_dataset_split = mark_train_test(linear_prob_dataset, 0.2)

save_csv(linear_prob_dataset_split, "./", "linear_prob_dataset_split.csv")


In [136]:
# Load the 'linear_prob_dataset_split' dataset
linear_prob_dataset_split = pd.read_csv("linear_prob_dataset_split.csv")


# Create a dictionary to store the counts for each race
race_id_counts = {}

# Separate the 'train' and 'test' data
train_data = linear_prob_dataset_split[linear_prob_dataset_split['split'] == 'train']
test_data = linear_prob_dataset_split[linear_prob_dataset_split['split'] == 'test']

# Calculate the number of unique IDs and their ratios
train_unique_ids, test_unique_ids = count_unique_ids(train_data, test_data)
total_unique_ids = len(linear_prob_dataset_split['id'].unique())
train_ratio = train_unique_ids / total_unique_ids
test_ratio = test_unique_ids / total_unique_ids

# Count unique IDs per race
for race in linear_prob_dataset_split['race'].unique():
    race_data = linear_prob_dataset_split[linear_prob_dataset_split['race'] == race]
    race_train_data = train_data[train_data['race'] == race]
    race_test_data = test_data[test_data['race'] == race]
    
    train_unique_ids, test_unique_ids = count_unique_ids(race_train_data, race_test_data)
    total_race_ids = len(race_data['id'].unique())
    
    race_id_counts[race] = {
        'Total Unique IDs': total_race_ids,
        'Train Unique IDs': train_unique_ids,
        'Test Unique IDs': test_unique_ids,
    }

    # Print results for each race
    print(f"Race: {race}")
    print(f"Total Unique IDs: {total_race_ids}")
    print(f"Train Unique IDs: {train_unique_ids}")
    print(f"Test Unique IDs: {test_unique_ids}")

# Print overall results
print("\nOverall Statistics:")
print(f"Total Unique IDs: {total_unique_ids}")
print(f"Train Unique IDs: {train_unique_ids} ({train_ratio * 100:.2f}%)")
print(f"Test Unique IDs: {test_unique_ids} ({test_ratio * 100:.2f}%)")


Race: 0
Total Unique IDs: 10
Train Unique IDs: 10
Test Unique IDs: 10
Race: 1
Total Unique IDs: 10
Train Unique IDs: 10
Test Unique IDs: 10
Race: 2
Total Unique IDs: 10
Train Unique IDs: 10
Test Unique IDs: 10
Race: 3
Total Unique IDs: 10
Train Unique IDs: 10
Test Unique IDs: 10

Overall Statistics:
Total Unique IDs: 40
Train Unique IDs: 10 (100.00%)
Test Unique IDs: 10 (100.00%)


In [140]:
import pandas as pd

def number_img_per_id(linear_prob_dataset_split):
    # Create a dictionary to store the counts for each race
    race_id_counts = {}

    # Separate the 'train' and 'test' data
    train_data = linear_prob_dataset_split[linear_prob_dataset_split['split'] == 'train']
    test_data = linear_prob_dataset_split[linear_prob_dataset_split['split'] == 'test']

    # Calculate the number of unique IDs and their ratios
    total_unique_ids = len(linear_prob_dataset_split['id'].unique())

    # Count unique IDs per race
    for race in linear_prob_dataset_split['race'].unique():
        race_data = linear_prob_dataset_split[linear_prob_dataset_split['race'] == race]
        race_train_data = train_data[train_data['race'] == race]
        race_test_data = test_data[test_data['race'] == race]

        race_id_counts[race] = {'Total Unique IDs': 0, 'ID Image Counts': {}}

        for id in race_data['id'].unique():
            total_race_images = len(race_data[race_data['id'] == id])
            train_race_images = len(race_train_data[race_train_data['id'] == id])
            test_race_images = len(race_test_data[race_test_data['id'] == id])

            race_id_counts[race]['ID Image Counts'][id] = {
                'Total Images': total_race_images,
                'Train Images': train_race_images,
                'Test Images': test_race_images,
            }

            # Update the total unique IDs count for the race
            race_id_counts[race]['Total Unique IDs'] += 1

    # Print results for each race
    for race, race_info in race_id_counts.items():
        print(f"Race: {race}")
        print(f"Total Unique IDs: {race_info['Total Unique IDs']}")

        for id, id_info in race_info['ID Image Counts'].items():
            print(f"ID: {id}")
            print(f"Total Images: {id_info['Total Images']}")
            print(f"Train Images: {id_info['Train Images']}")
            print(f"Test Images: {id_info['Test Images']}")
            print()

    # Print overall results
    print("\nOverall Statistics:")
    print(f"Total Unique IDs: {total_unique_ids}")



In [141]:
number_img_per_id(linear_prob_dataset_split)

Race: 0
Total Unique IDs: 10
ID: m.07h5rn
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0181j_
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0240pk
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.01lb8z
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.09fdg1
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0cc99yf
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.03tjn_
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.09d6n9
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.020skv
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0bb8pbs
Total Images: 60
Train Images: 48
Test Images: 12

Race: 1
Total Unique IDs: 10
ID: m.04q43q
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0ffgks
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0pcvrt5
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.0cfxd5
Total Images: 60
Train Images: 48
Test Images: 12

ID: m.09z1b2
Total Images: 60
Train Images:

Save train and test image

In [30]:
def save_train_image(identity_selection, data_path, save_path, label_to_race):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    
    # Make a copy of the identity_selection
    identity_selection = identity_selection.copy()
    if 'split' in identity_selection.columns:
        identity_selection = identity_selection[identity_selection['split'] == 'train']

    for _, row in tqdm(identity_selection.iterrows(), total=len(identity_selection), desc="Processing Images"):
        image_name = row['image_name']
        id = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, id, image_name)
        identity_save_path = os.path.join(save_path, id)

        # Create a subdirectory for the identity if it doesn't exist
        if not os.path.exists(identity_save_path):
            os.makedirs(identity_save_path)

        destination_path = os.path.join(identity_save_path, image_name)
        
        # Check if the image already exists
        if os.path.exists(destination_path):
            continue

        #check if the source image exists
        if not os.path.exists(source_path):
            print("{} does not exist".format(source_path))
            continue

        # Copy the image without converting
        shutil.copy(source_path, destination_path)

    print('Finished copying training images to folders in JPG format')

In [25]:
def save_test_image(identity_selection, data_path, save_path, label_to_race):
    if not os.path.exists(save_path):
        os.makedirs(save_path)    

    identity_selection = identity_selection.copy()
    
    if 'split' in identity_selection.columns:
        identity_selection = identity_selection[identity_selection['split'] == 'test']

    for _, row in tqdm(identity_selection.iterrows(), total=len(identity_selection), desc="Processing Images"):
        image_name = row['image_name']
        id = row['id']
        race = label_to_race[row['race']]

        source_path = os.path.join(data_path, race, id, image_name)

        # Save to each race folder
        race_path = os.path.join(save_path, race)
        if not os.path.exists(race_path):
            os.makedirs(race_path)

        identity_save_path = os.path.join(race_path, id)

        # Create a subdirectory for the identity if it doesn't exist
        if not os.path.exists(identity_save_path):
            os.makedirs(identity_save_path)

        destination_path = os.path.join(identity_save_path, image_name)
        
        # Check if the image already exists
        if os.path.exists(destination_path):
            continue

        # Copy the image without converting
        shutil.copy(source_path, destination_path)

    print('Finished copying images to folders in JPG format')

In [26]:
#just saving one race for now
data = pd.read_csv("./balanced_data_split.csv")
african_balanced_split = data[data['race']==3]
caucasian_balanced_split = data[data['race']==0]

african_and_cau = pd.concat([african_balanced_split, caucasian_balanced_split])


data_folder_path = "./race_per_7000"
save_path = "./"


race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

save_train_image(african_and_cau, data_folder_path, save_path+'/train', label_to_race)
save_test_image(african_and_cau, data_folder_path, save_path+'/test', label_to_race)

Processing Images: 100%|██████████| 48000/48000 [00:03<00:00, 14421.37it/s]


Finished copying training images to folders in JPG format


Processing Images: 100%|██████████| 12000/12000 [00:00<00:00, 14139.97it/s]

Finished copying images to folders in JPG format





#just getting mvp mini data

In [33]:
african_unbalanced = pd.read_csv("./unbalanced_African.csv")

#take first 50 id from african and first 10 caucasian 
african_only = african_unbalanced[african_unbalanced['race']==3]
cau_only = african_unbalanced[african_unbalanced['race']==0]

data = pd.concat([african_only, cau_only])


In [34]:
#get dataset
#just saving one race for now

data_folder_path = "./race_per_7000"
save_path = "./"


race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

save_train_image(data, data_folder_path, save_path+'/con_train_mini', label_to_race)
# save_test_image(data, data_folder_path, save_path+'/test', label_to_race)

Processing Images:   0%|          | 0/87600 [00:00<?, ?it/s]

Processing Images:  93%|█████████▎| 81520/87600 [01:46<00:07, 768.19it/s] 


FileNotFoundError: [Errno 2] No such file or directory: './race_per_7000/Caucasian/m.03j2zfm/6-FaceId-0_align.jpg'

In [None]:
data_folder_path = "./race_per_7000"
save_path = "./"


race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

In [None]:
linear_prob_dataset_split

# from the linear_prob_dataset_split.csv, i want to actually get those images from the race_per_700/ folder and put them into a folder in my current directpry called linear_prob_data, and this folder should have the sub folders for test and train data from the linear_prob_dataset_split.csv
# race_per_700 has 4 sub folders for each race. Each race has subfolder for their ids. And each id has their images
# race_per_700/
    # -> [RACE]
    #     -> id_1
    #     -> id_2

# we should copy the data from race_per_700 that belongs to linear_prob_dataset_split and save it

In [None]:
race_per_7000/Caucasian 
race_per_700/Caucasian/

In [153]:
import os
import shutil
import pandas as pd

# Load the 'linear_prob_dataset_split' dataset
linear_prob_dataset_split = pd.read_csv("linear_prob_dataset_split.csv")

# Define source and destination directories
source_dir = "race_per_7000"  # Replace with your actual source directory
destination_dir = "linear_prob_data"
race_to_label = {'Caucasian': 0, 'Indian': 1, 'Asian': 2, 'African': 3}
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}
# Create destination directories for test and train if they don't exist
for split in ["test", "train"]:
    os.makedirs(os.path.join(destination_dir, split), exist_ok=True)

# Iterate through each row in the dataset
for _, row in linear_prob_dataset_split.iterrows():
    image_name = row['image_name']
    race = label_to_race[row['race']]
    id = row['id']
    split = row['split']

    # Define source and destination paths for the image
    source_path = os.path.join(source_dir, race, id, image_name)
    if split == "train":
        
        destination_path = os.path.join(destination_dir, split, race, id, image_name)
        # Create destination directories for each race and ID if they don't exist
        os.makedirs(os.path.join(destination_dir, split, race, id), exist_ok=True)

    elif split == "test":
        destination_path = os.path.join(destination_dir, split, id, image_name)
        os.makedirs(os.path.join(destination_dir, split, id), exist_ok=True)


    # Copy the image to the destination directory
    shutil.copy(source_path, destination_path)

print("Images copied and organized into 'linear_prob_data' directory.")


Images copied and organized into 'linear_prob_data' directory.


In [2]:
import os
import shutil
import pandas as pd

# Load the 'linear_prob_dataset_split' dataset
linear_prob_dataset_split = pd.read_csv("linear_prob_dataset_split.csv")

# Define source and destination directories
source_dir = "race_per_7000"  # Replace with your actual source directory
destination_dir = "linear_prob_data"
label_to_race = {0: 'Caucasian', 1: 'Indian', 2: 'Asian', 3: 'African'}

# Create destination directories for test and train if they don't exist
for split in ["test", "train"]:
    os.makedirs(os.path.join(destination_dir, split), exist_ok=True)

# Define races to exclude from 'test_2'
excluded_races = ['Asian', 'Indian']

# Iterate through each row in the dataset
for _, row in linear_prob_dataset_split.iterrows():
    image_name = row['image_name']
    race = label_to_race[row['race']]
    id = row['id']
    split = row['split']

    # Check if the race should be excluded from 'test_2'
    if split == "test" and race in excluded_races:
        print(f"Excluding race: {race}")
        continue

    # Define source and destination paths for the image
    source_path = os.path.join(source_dir, race, id, image_name)

    if split == "train":
        destination_path = os.path.join(destination_dir, split, race, id, image_name)
        # Create destination directories for each race and ID if they don't exist
        os.makedirs(os.path.join(destination_dir, split, race, id), exist_ok=True)

    elif split == "test":
        destination_path = os.path.join(destination_dir, split, id, image_name)
        os.makedirs(os.path.join(destination_dir, split, id), exist_ok=True)

    # Copy the image to the destination directory
    shutil.copy(source_path, destination_path)

print("Images copied and organized into 'linear_prob_data' directory.")


Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding race: Indian
Excluding r