In [1]:
import pandas as pd
from saac.statistics import ks2sample_test
import seaborn as sns; sns.set(style='darkgrid', palette ='colorblind', color_codes=True)

# Directory prefix for the processed evaluation result CSVs loaded in the cells below.
# It is a relative path, so it resolves against the kernel's current working directory.
respath='../../data/evaluation/processed/'

### Occupational Data Results

In [None]:
# Load the processed occupation results, ordered by the `a_median` column.
occ_res_all = pd.read_csv(respath + 'Occupation_Results.csv').sort_values('a_median')
print(f'Total rows: {len(occ_res_all)}')
print('Counts of sampled wage categories for median annual wage for all possible gender detected values ')

# Preset row/column order so the table reads from lowest to highest wage category.
wage_order = ['very low', 'low', 'medium', 'high', 'very high']
gender_order = ['man', 'woman', 'unknown', 'no face']

# Reindex both axes in a single call with fill_value=0:
#  - a wage category that never occurs becomes a column of zeros instead of
#    raising KeyError (which the previous `[...]` column selection did), and
#  - a gender value that never occurs becomes a row of zeros instead of NaN,
#    so the counts stay integers.
pd.crosstab(occ_res_all['gender_detected_val'], occ_res_all['wage_val']).reindex(
    index=gender_order, columns=wage_order, fill_value=0
)

In [None]:
# Drop the rows whose detected gender is 'unknown' or 'no face'; every other
# row of occ_res_all is kept unchanged.
occ_res = occ_res_all.loc[
    lambda frame: ~frame['gender_detected_val'].isin(['unknown', 'no face'])
]
print(f"Total rows after removing faceless and unknown gender detected results: {len(occ_res)}")
occ_res.head(3)

In [None]:
# Run the project's ks2sample_test helper on `a_median`, keyed by
# `gender_detected_val` (presumably a two-sample Kolmogorov-Smirnov test,
# judging by the helper's name), and materialise whatever it yields into a list.
o = list(ks2sample_test(occ_res, value_key='a_median', id_key='gender_detected_val'))
print(o)
# Original analysis note: we can conclude that the a_median data for men and
# women were not drawn from the same distribution. Re-check this against the
# printed result whenever the input data changes.

In [None]:
# Density histogram of a_median, drawn as polygons, one colour per detected gender.
sns.histplot(
    data=occ_res,
    x='a_median',
    hue='gender_detected_val',
    stat='density',
    element='poly',
)
# Disabled alternative view (kept for reference):
# sns.displot(occ_res, x="a_median", hue="gender_detected_val", multiple="dodge")

# One histogram column per detected gender value.
sns.displot(
    data=occ_res,
    x='a_median',
    col='gender_detected_val',
    hue='gender_detected_val',
)

# a_median against a_mean: scatter in the joint axes, histograms on the margins.
g = sns.JointGrid(
    data=occ_res,
    x='a_median',
    y='a_mean',
    hue='gender_detected_val',
)
g.plot(sns.scatterplot, sns.histplot)

In [None]:
### Trait Descriptive Adjective (TDA) Results

In [None]:
# Load the processed Trait Descriptive Adjective (TDA) results.
tda_res_all = pd.read_csv(respath + 'TDA_Results.csv')
print(f'Total rows: {len(tda_res_all)}')

# Rows where the `tda_compound` value is exactly equal to `prompt_compound`.
sentcheck = tda_res_all[tda_res_all['tda_compound'] == tda_res_all['prompt_compound']]
print(f'Total rows where tda sentiment is equal to prompt sentiment : {len(sentcheck)}')

print('Counts of sampled sentiment categories for all possible gender detected values ')
# Preset row/column order so the table reads from most negative to most positive.
sentiment_order = ['very negative', 'negative', 'neutral', 'positive', 'very positive']
gender_order = ['man', 'woman', 'unknown', 'no face']

# Reindex both axes in a single call with fill_value=0:
#  - a sentiment category that never occurs becomes a column of zeros instead
#    of raising KeyError (which the previous `[...]` column selection did), and
#  - a gender value that never occurs becomes a row of zeros instead of NaN,
#    so the counts stay integers.
pd.crosstab(tda_res_all['gender_detected_val'], tda_res_all['tda_sentiment_val']).reindex(
    index=gender_order, columns=sentiment_order, fill_value=0
)

In [None]:
# Drop the rows whose detected gender is 'unknown' or 'no face'; every other
# row of tda_res_all is kept unchanged.
tda_res = tda_res_all.loc[
    lambda frame: ~frame['gender_detected_val'].isin(['unknown', 'no face'])
]
print(f"Total rows after removing faceless and unknown gender detected results: {len(tda_res)}")
tda_res.head(3)

In [None]:
# Run the project's ks2sample_test helper on `tda_compound`, keyed by
# `gender_detected_val` (presumably a two-sample Kolmogorov-Smirnov test,
# judging by the helper's name), and materialise whatever it yields into a list.
t = list(ks2sample_test(tda_res, value_key='tda_compound', id_key='gender_detected_val'))
t
# Original analysis note: the samples of sentiment scores for men and women are
# not drawn from the same distribution. Re-check this against the displayed
# result whenever the input data changes.

In [None]:
# Density histogram of prompt_compound, drawn as polygons, one colour per detected gender.
# NOTE(review): this panel uses `prompt_compound` while the two plots below use
# `tda_compound` -- confirm that the difference is intended.
sns.histplot(
    data=tda_res,
    x='prompt_compound',
    hue='gender_detected_val',
    stat='density',
    element='poly',
)

# Kernel density estimate of tda_compound per detected gender.
sns.displot(
    data=tda_res,
    x='tda_compound',
    hue='gender_detected_val',
    kind='kde',
)

# One tda_compound histogram column per detected gender value.
sns.displot(
    data=tda_res,
    x='tda_compound',
    col='gender_detected_val',
    hue='gender_detected_val',
)
