In [None]:
import os
import glob
import pandas as pd
import re
import warnings
from saac.eval_utils import load_tda_data, load_occupation_data, rgb_intensity

warnings.filterwarnings('once')

In [None]:
'''
Midjourney has a file name limit of 100 characters,which cuts off a number of the submitted prompts in the file name
In order to merge the results with the generated prompt data
We will parse and extract the base prompt from the image_file column
'''
def load_image_analysis_results():
    eval_data_path = os.path.join('..','..', 'data','evaluation','raw')
    files = glob.glob(os.path.join(eval_data_path, '*.csv'))
    colnames = ['file_prompt','image_file','quadrant','bbox','skincolor','gender_woman','gender_man']
    results = pd.concat([pd.read_csv(fp,header=0, names=colnames)\
                     .assign(model =os.path.basename(fp).split('_')[0]) for fp in files],sort=False)
    base_prompt= []
    for row in results['image_file']:
        try:
            base_prompt.append(row.split('_')[1:-2])
        except:
            base_prompt.append('')
    results['base_prompt'] = base_prompt
    results['base_prompt'] = results['base_prompt'].apply(lambda x: ' '.join(x)+" photorealistic")
    results['base_prompt'] = results['base_prompt'].str.strip().str.rstrip()
    #Normalizing gender categories
    results['gender_woman'] = results['gender_woman'].apply(lambda x: x / 100.)
    results['gender_man'] = results['gender_man'].apply(lambda x: x / 100.)
    #Mapping gender detection values to single column
    noface= (results['skincolor'].isnull()).values
    unknown=  ((results['gender_woman']<=.50) & (results['gender_man']<=.50)).values
    woman= ((results['gender_woman']>=.50) & (results['gender_man']<.50)).values
    man= ((results['gender_man']>=.50) & (results['gender_woman']<.50)).values

    results['gender_detected_cat'] = 0
    results['gender_detected_cat'][noface]=1
    results['gender_detected_cat'][unknown]=2
    results['gender_detected_cat'][woman]=3
    results['gender_detected_cat'][man]=4
    gender_dict=  {
                 1: 'no face',
                 2: 'unknown',
                 3: 'woman',
                 4: 'man'
                    }
    results['gender_detected_val'] = results['gender_detected_cat'].map(gender_dict)
    #Extracting RGB intensity from skincolor 
    results['rgb_intensity'] = results['skincolor'].apply(lambda x:rgb_intensity(eval(x)) if not pd.isna(x) else None)
    return results

In [None]:
def load_prompts():
    prompt_data_path = os.path.join('..','..', 'data','prompt_generation','processed')
    files = glob.glob(os.path.join(prompt_data_path, '*.csv'))
    cols = [0, 1, 5]
    colnames = ['full_prompt','tag','prompt_compound']
    prompts = pd.concat([pd.read_csv(fp,header=0,usecols= cols, names=colnames)for fp in files],sort=False)
    base_prompt =[]
    for line in prompts['full_prompt']:
        if '/imagine prompt:' in line:
            line = line.replace('/imagine prompt:','')
            line = line.replace(', photorealistic --s 625',' photorealistic')
            line = line.replace(',',' ')
            line = re.sub(' +',' ',line).strip().rstrip()
            base_prompt.append(line)
        else:
            base_prompt.append(line)
    prompts['base_prompt'] = base_prompt
    return prompts

In [None]:
def get_occupation_results():
    results = load_image_analysis_results()
    prompts = load_prompts()
    res_prompts = results.merge(prompts, right_on=['base_prompt'], left_on=['base_prompt'])
    occupations = load_occupation_data()
    occ_results = res_prompts.merge(occupations,right_on=['norm_title'],left_on=['tag'],how='inner')
    return occ_results

def get_tda_results():
    results = load_image_analysis_results()
    prompts = load_prompts()
    res_prompts = results.merge(prompts, right_on=['base_prompt'], left_on=['base_prompt'])
    tda_data = load_tda_data()
    tda_results = res_prompts.merge(tda_data,right_on=['tda'],left_on=['tag'],how='inner')
    return tda_results

In [None]:
tda = get_tda_results()
print(len(tda))

In [None]:
tda.head()

In [None]:
occ = get_occupation_results()
print(len(occ))

In [None]:
occ.head()

In [None]:
occ = get_occupation_results()
print(len(occ))

In [None]:
occ.head()

In [None]:
occ = get_occupation_results()
print(len(occ))

In [None]:
occ.head()

In [6]:
occ = get_occupation_results()
print(len(occ))

1440


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['gender_detected_cat'][noface]=1


In [8]:
occ.head()

Unnamed: 0,file_prompt,image_file,quadrant,bbox,skincolor,gender_woman,gender_man,model,base_prompt,gender_detected_cat,...,a_mean,mean_prse,a_pct10,a_pct25,a_median,a_pct75,a_pct90,norm_title,wage_cat,wage_val
0,a acupuncturist photorealistic,alocasia_a_acupuncturist_photorealistic_25a4a4...,0,"[69, 46, 184, 202]","(214.89999999999998, 200.55, 169.6)",1.0,0.0,midjourney,a acupuncturist photorealistic,3,...,71770.0,6.0,29580.0,38000.0,60570.0,98440.0,126440.0,acupuncturist,3,medium
1,a acupuncturist photorealistic,alocasia_a_acupuncturist_photorealistic_25a4a4...,1,"[83, 298, 188, 432]","(203.225, 180.25, 151.5)",0.969697,0.030303,midjourney,a acupuncturist photorealistic,3,...,71770.0,6.0,29580.0,38000.0,60570.0,98440.0,126440.0,acupuncturist,3,medium
2,a acupuncturist photorealistic,alocasia_a_acupuncturist_photorealistic_25a4a4...,2,"[301, 55, 454, 248]","(223.375, 209.5, 169.675)",1.0,0.0,midjourney,a acupuncturist photorealistic,3,...,71770.0,6.0,29580.0,38000.0,60570.0,98440.0,126440.0,acupuncturist,3,medium
3,a acupuncturist photorealistic,alocasia_a_acupuncturist_photorealistic_25a4a4...,3,"[300, 258, 476, 481]","(228.9, 215.42499999999998, 183.675)",0.969697,0.030303,midjourney,a acupuncturist photorealistic,3,...,71770.0,6.0,29580.0,38000.0,60570.0,98440.0,126440.0,acupuncturist,3,medium
4,a acupuncturist photorealistic,HerculePoirot_a_acupuncturist_photorealistic_e...,0,"[98, 33, 195, 153]","(221.05, 213.3, 192.0)",0.5,0.5,midjourney,a acupuncturist photorealistic,2,...,71770.0,6.0,29580.0,38000.0,60570.0,98440.0,126440.0,acupuncturist,3,medium


In [9]:
savepath='../../data/evaluation/processed/'

tda.to_csv(savepath+ 'TDA_Results.csv',index=False)
occ.to_csv(savepath+ 'Occupation_Results.csv',index=False)