In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# Apply matplotlib's 'fivethirtyeight' style sheet, then seaborn's 'whitegrid'
# theme. seaborn is configured second, so where both touch the same settings
# the seaborn values are the ones set last.
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)
# NOTE(review): the inline magic is issued a second time in this cell, after the
# style setup -- confirm whether this repeat is intentional or a leftover.
%matplotlib inline

In [None]:
# Directory (relative to this notebook) holding the processed evaluation CSVs
# that the cells below read.
respath = '../../data/evaluation/processed/'




#Prompt sentiment and TDA sentiment are the same

# Map trait sentiment values to skin color and gender

In [None]:
# Load the TDA results table.
tda_res = pd.read_csv(respath + 'TDA_Results.csv')

print(f"Total rows: {len(tda_res)}")

# Drop rows whose detected gender is 'no face' or 'unknown'.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
tda_res = tda_res.loc[
    ~tda_res['gender_detected_val'].isin({'no face', 'unknown'})
].copy()

print(f"Number of rows after removing faceless and unknown gender bboxes: {len(tda_res)}")

# Divide both gender score columns by 100 (vectorized instead of a per-element
# lambda).
# NOTE(review): presumably these are percentages being rescaled to [0, 1] --
# confirm against the producer of TDA_Results.csv.
tda_res['gender_woman'] = tda_res['gender_woman'] / 100.
tda_res['gender_man'] = tda_res['gender_man'] / 100.

tda_res.head()

## Gender imbalance

In [None]:
from scipy.stats import binomtest

# Exact binomial test of H0: P(detected gender == 'woman') = 0.5
# (binomtest is two-sided by default).
# NOTE(review): n is the number of all remaining rows, so this treats every
# row that is not 'woman' as the other outcome -- confirm that only 'woman'
# and 'man' remain after the earlier filtering.
n_women = int((tda_res['gender_detected_val'] == 'woman').sum())
gender_sig = binomtest(n_women, n=len(tda_res), p=0.5)

# Print the p-value itself; formatting `gender_sig` directly would print the
# repr of the whole result object rather than the number the message promises.
print(f"p-value of hypothesis that both men and women are represented equally: {gender_sig.pvalue}")

In [None]:
# Histogram of the `tda_compound` values, one panel per detected gender.
tda_res.hist(
    column='tda_compound',
    by='gender_detected_val',
)

In [None]:
# Visual test of RGB skin color intensity sorting

from saac.eval_utils import rgb_sorter


def _plot_rgb_strip(ax, colors):
    """Draw one vertical line per color on ``ax``, in iteration order.

    Args:
        ax: Matplotlib axes to draw on.
        colors: Iterable of RGB triples; each is divided by 255 before being
            handed to matplotlib as the line color.
    """
    for x, c in enumerate(colors):
        ax.plot(x * np.ones(2), [0, 1], color=np.array(c) / 255)


# NOTE(review): `eval` executes whatever text is stored in the `skincolor`
# column of the CSV. If that file can ever come from an untrusted source,
# switch to a safe parser (e.g. ast.literal_eval) after confirming the stored
# format. The column is parsed once here and reused for every subset below.
skincolor_rgb = tda_res['skincolor'].apply(eval)
tda_compound = tda_res['tda_compound']

# All rows, sorted by rgb_sorter.
# plt.subplots returns (figure, axes); unpack so `ax1` really is the axes.
fig1, ax1 = plt.subplots(1, 1)
sorted_rgb = rgb_sorter(skincolor_rgb)
_plot_rgb_strip(ax1, sorted_rgb)

# Same strip, split by the sign of `tda_compound`: negative, zero, positive.
fig2, ax2 = plt.subplots(3, 1)

sorted_rgb_neg = rgb_sorter(skincolor_rgb[tda_compound < 0])
sorted_rgb_neu = rgb_sorter(skincolor_rgb[tda_compound == 0])
sorted_rgb_pos = rgb_sorter(skincolor_rgb[tda_compound > 0])

panels = (
    (sorted_rgb_neg, 'tda_compound < 0'),
    (sorted_rgb_neu, 'tda_compound == 0'),
    (sorted_rgb_pos, 'tda_compound > 0'),
)
for ax, (colors, title) in zip(ax2, panels):
    _plot_rgb_strip(ax, colors)
    ax.set_title(title)

# Keep the three stacked panels' titles from overlapping the tick labels.
fig2.tight_layout()

# Map salary to skin color and gender

In [None]:
# Load the occupation results table.
occ_res = pd.read_csv(respath + 'Occupation_Results.csv')

print(f"Total rows: {len(occ_res)}")

# Drop rows whose detected gender is 'no face' or 'unknown'.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
occ_res = occ_res.loc[
    ~occ_res['gender_detected_val'].isin({'no face', 'unknown'})
].copy()

print(f"Number of rows after removing faceless and unknown gender bboxes: {len(occ_res)}")

# Divide both gender score columns by 100 (vectorized instead of a per-element
# lambda).
# NOTE(review): presumably these are percentages being rescaled to [0, 1] --
# confirm against the producer of Occupation_Results.csv.
occ_res['gender_woman'] = occ_res['gender_woman'] / 100.
occ_res['gender_man'] = occ_res['gender_man'] / 100.

occ_res.head()

In [None]:
# Histograms

# Bin count for the `a_median` histogram; the per-bin cells further down reuse
# it when they call np.histogram on the same column.
n_bins = 20

occ_hist = occ_res.hist(
    column='a_median',
    bins=n_bins,
)

In [None]:
# Mostly just a visual test of intensity sorting per salary bin

from saac.eval_utils import rgb_sorter, rgb_intensity

# plt.subplots returns a (figure, axes) pair; the pair is kept under this name
# and the drawing below goes through pyplot's current axes.
ax1 = plt.subplots(1, 1)

# Bin edges over `a_median`; len(occ_division) == n_bins + 1.
occ_count, occ_division = np.histogram(occ_res['a_median'], bins=n_bins)

last_idx = len(occ_division) - 1

for idx in range(1, len(occ_division)):
    lower, upper = occ_division[idx - 1], occ_division[idx]

    # Bins are half-open [lower, upper), except the last one, which also
    # includes its right edge.
    at_or_above_lower = occ_res['a_median'] >= lower
    if idx == last_idx:
        within_upper = occ_res['a_median'] <= upper
    else:
        within_upper = occ_res['a_median'] < upper
    mask = at_or_above_lower & within_upper

    # Parse the stored color strings for this bin and sort them.
    sorted_rgb = rgb_sorter(occ_res.loc[mask, 'skincolor'].apply(eval))

    # One horizontal segment per row, spanning the bin, stacked in sort order.
    for y, c in enumerate(sorted_rgb):
        plt.plot([lower, upper], y * np.ones(2), color=np.array(c)/255)

In [None]:
# Violin plots of skin intensity per yearly salary bin

from saac.eval_utils import rgb_sorter, rgb_intensity

fig, ax = plt.subplots(1, 1)

# Bin edges over `a_median`; len(occ_division) == n_bins + 1.
occ_count, occ_division = np.histogram(occ_res['a_median'], bins=n_bins)

# One list of intensity values per non-empty bin; the ANOVA cell at the end of
# the notebook reads this list.
all_rgb_intensities = []

for idx in range(1, len(occ_division)):
    # Bins are half-open [lower, upper), except the last one, which also
    # includes its right edge.
    if idx + 1 == len(occ_division):
        mask = (occ_res['a_median'] >= occ_division[idx - 1]) & (occ_res['a_median'] <= occ_division[idx])
    else:
        mask = (occ_res['a_median'] >= occ_division[idx - 1]) & (occ_res['a_median'] < occ_division[idx])

    # Nothing to draw for an empty bin.
    if not mask.any():
        continue

    # NOTE(review): `eval` executes whatever text is stored in the `skincolor`
    # column of the CSV; only safe while that file is trusted.
    rgb_intensities = occ_res[mask]['skincolor'].apply(eval).apply(rgb_intensity)
    all_rgb_intensities.append(list(rgb_intensities.values))

    # Pass a plain float array so matplotlib never positionally indexes a
    # Series whose index labels are the surviving row numbers.
    parts = ax.violinplot(rgb_intensities.to_numpy(dtype=float),
                          positions=[np.mean(occ_division[idx - 1:idx + 1])],
                          showmeans=True,
                          showextrema=False,
                          widths=7500.0,  # fixed violin width, in `a_median` units
                          points=100)

    # Gray fill at the bin's median intensity. Each channel must be exactly
    # two hex digits: an unpadded value below 16 would produce a 3-character
    # shorthand like "#aaa", which matplotlib expands to "#aaaaaa" (a much
    # lighter gray than intended).
    # NOTE(review): the clamp assumes rgb_intensity is on a 0-255 scale --
    # confirm against saac.eval_utils.
    gray_level = int(np.clip(np.median(rgb_intensities), 0, 255))
    hex_color = f"#{gray_level:02x}{gray_level:02x}{gray_level:02x}"

    for pc in parts['bodies']:
        pc.set_facecolor(hex_color)
        pc.set_edgecolor(hex_color)
        pc.set_alpha(1)
    parts['cmeans'].set_facecolor(hex_color)
    # Black mean marker so it stays visible against the gray body.
    parts['cmeans'].set_edgecolor('black')

ax.set_xlabel('a_median')
ax.set_ylabel('rgb_intensity(skincolor)')

In [None]:
# Mean skin-color intensity per `a_median` bin, drawn as one horizontal gray
# segment per bin.
# Relies on `occ_division` and `rgb_intensity` from the preceding cells.
rgb_intensities = list()

for idx in range(1, len(occ_division)):
    # Bins are half-open [lower, upper), except the last one, which also
    # includes its right edge.
    if idx + 1 == len(occ_division):
        mask = (occ_res['a_median'] >= occ_division[idx - 1]) & (occ_res['a_median'] <= occ_division[idx])
    else:
        mask = (occ_res['a_median'] >= occ_division[idx - 1]) & (occ_res['a_median'] < occ_division[idx])

    # NOTE(review): `eval` executes whatever text is stored in the `skincolor`
    # column of the CSV; only safe while that file is trusted.
    bin_intensities = occ_res[mask]['skincolor'].apply(eval).apply(rgb_intensity)

    # Record NaN explicitly for an empty bin so list positions still line up
    # with the bin edges in `occ_division`.
    rgb_intensities.append(np.mean(bin_intensities) if len(bin_intensities) else np.nan)

# plt.subplots returns (figure, axes); unpack so `ax1` really is the axes.
fig1, ax1 = plt.subplots(1, 1)

for idx, c in enumerate(rgb_intensities):
    # Skip empty bins. NaN has to be detected by value: an identity test
    # against np.nan misses NaNs that are not that exact object.
    if np.isnan(c):
        continue
    ax1.plot(occ_division[idx: idx + 2], c * np.ones(2), color=c * np.ones(3) / 255)

In [None]:
# ANOVA test

from scipy.stats import f_oneway

# One-way ANOVA with each entry of `all_rgb_intensities` (filled in by the
# violin-plot cell, one list per non-empty bin) passed as a separate group.
F, p = f_oneway(*all_rgb_intensities)

# Test statistic on the first line, p-value on the second.
print(F, p, sep='\n')