In [2]:
import pandas as pd
import numpy as np
import torch
# from scipy.special import softmax

In [3]:
def sigmoid(x):
    """Logistic function: map a real-valued score to the open interval (0, 1).

    Args:
        x: A scalar or NumPy array of scores; it must support unary minus.

    Returns:
        ``1 / (1 + exp(-x))`` evaluated element-wise, with the same shape as ``x``.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def softmax(x, axis=0):
    """Numerically stabilised softmax.

    Args:
        x: Scores. A 1-D input (NumPy array, pandas Series, ...) is normalised
            over all of its elements and keeps its container type. A NumPy
            array with two or more dimensions is normalised along ``axis``.
        axis: Axis along which a multi-dimensional NumPy array is normalised.
            Defaults to 0. It has no effect on 1-D input.

    Returns:
        Non-negative values of the same shape as ``x`` that sum to 1 along the
        normalised axis.
    """
    if isinstance(x, np.ndarray) and x.ndim > 1:
        # Shift by the maximum along the SAME axis that is normalised. Shifting
        # by the global maximum instead lets a whole slice underflow to zeros
        # when it lies far below that maximum, which yields 0/0 = nan.
        shifted = x - x.max(axis=axis, keepdims=True)
        e_x = np.exp(shifted)
        return e_x / e_x.sum(axis=axis, keepdims=True)

    # 1-D path: the global maximum is the per-axis maximum, so this is already
    # stable; left as-is so Series inputs still come back as Series.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def thresholded_argmax(x, threshold):
    """Return the argmax of ``x`` only when the winner is confident enough.

    The largest value has to exceed the uniform share ``1 / len(x)`` by more
    than ``threshold``; otherwise the decision is treated as unsure.

    Args:
        x: Sequence-like object providing ``max()``, ``argmax()`` and ``len()``.
        threshold: Margin the maximum must have over ``1 / len(x)``.

    Returns:
        ``x.argmax()`` if the maximum clears the cutoff, else ``-1``.
    """
    peak = x.max()
    cutoff = 1 / len(x) + threshold
    return x.argmax() if peak > cutoff else -1

In [4]:
# Read the attribute file into a list of raw text lines.
with open('attributes_lfw.txt', 'r') as file:
    lines = file.readlines()

column_names = ['Person', 'Male', 'Asian', 'White', 'Black']

# Parse every line after the first TWO (lines[2:]); presumably both are
# header/comment lines -- TODO confirm against the file.
# Each record is tab-separated: field 0 is the person, fields 2-5 are the
# Male / Asian / White / Black scores. Field 1 is not used here.
data = []
for line in lines[2:]:
    # A blank line (e.g. a trailing newline at the end of the file) has no
    # fields and would otherwise fail with IndexError below.
    if not line.strip():
        continue
    line_split = line.split("\t")
    data.append([
        line_split[0],
        # The Male score is squashed to (0, 1) per record, i.e. BEFORE the
        # per-person averaging below; the race scores are kept raw.
        sigmoid(float(line_split[2])),
        float(line_split[3]),
        float(line_split[4]),
        float(line_split[5]),
    ])

# One row per record, then collapsed to one row per person (index: Person)
# by averaging each column over that person's records.
df = pd.DataFrame(data, columns=column_names)
df = df.groupby('Person').mean()

# Margin for unsure decisions (also used below for the race decision).
threshold = 0.1

# Derive the gender label from the averaged Male probability (the column was
# passed through sigmoid at load time, so 0.5 is the decision boundary):
#   within [0.5 - threshold, 0.5 + threshold] (inclusive) -> "Unknown"
#   above 0.5                                            -> "Man"
#   otherwise                                            -> "Woman"
# The labels are built in a single step so that 'Gender' is created as a string
# column; writing strings into an existing integer column via .loc is a
# deprecated incompatible-dtype assignment in recent pandas.
male_prob = df['Male']
df['Gender'] = np.select(
    [
        male_prob.between(0.5 - threshold, 0.5 + threshold),  # checked first
        male_prob > 0.5,
    ],
    ["Unknown", "Man"],
    default="Woman",
)

# Convert each person's race scores into probabilities, one row at a time.
softmax_cols = ['Asian', 'White', 'Black']
df[softmax_cols] = df[softmax_cols].apply(lambda x: softmax(x), axis=1)

# Per row: position of the winning column within softmax_cols, or -1 when the
# winner does not beat the uniform share by more than `threshold`.
race_codes = df[softmax_cols].apply(lambda x: thresholded_argmax(x, threshold), axis=1)

# Translate codes to labels in one step so 'Race' is created as a string
# column; writing strings into an existing integer column via .loc is a
# deprecated incompatible-dtype assignment in recent pandas.
race_labels = {-1: "Unknown", **dict(enumerate(softmax_cols))}
df['Race'] = race_codes.map(race_labels)

# Put each label column directly in front of the probabilities it was derived from.
df = df.loc[:, ['Gender', 'Male', 'Race', 'Asian', 'White', 'Black']]


# Show the complete table and write it to disk.
print(df)
df.to_csv("total.csv")

# Keep only the people with a decided race and write that subset too.
# Note: only 'Race' is filtered; rows whose 'Gender' is "Unknown" remain.
df = df.loc[df['Race'].ne('Unknown')]
df.to_csv("total_without_unknowns.csv")


# # split the dataframe by race and save a sample of each race
# sample_size = 300
# race_groups = df.groupby('Race')
# for race, group in race_groups:
#     if race != "Unknown":
#         group = group.sample(n=sample_size)
#         group.to_csv(f'{race}_{sample_size}.csv', index=True)


                        Gender      Male   Race     Asian     White     Black
Person                                                                       
AJ Cook                  Woman  0.108678  White  0.023884  0.910933  0.065182
AJ Lamas               Unknown  0.515324  White  0.171984  0.693517  0.134499
Aaron Eckhart              Man  0.827548  White  0.024283  0.912341  0.063376
Aaron Guiel            Unknown  0.542361  White  0.171910  0.700707  0.127383
Aaron Patterson            Man  0.730616  White  0.161835  0.540988  0.297177
...                        ...       ...    ...       ...       ...       ...
Winona Ryder             Woman  0.261678  White  0.091128  0.876057  0.032815
Winston Churchill          Man  0.643083  White  0.270574  0.683685  0.045741
Wolfgang Becker            Man  0.613176  White  0.121514  0.826783  0.051703
Wolfgang Clement           Man  0.752395  White  0.082400  0.867524  0.050076
Wolfgang Schneiderhan  Unknown  0.542174  White  0.196759  0.767