In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('once')

def load_image_analysis_results(readpath='../../data/evaluation/raw/'):
    cols = [1,2,4,5,6]
    colnames = ['image','quadrant','skincolor','gender_woman','gender_man']
    results = pd.read_csv(readpath+'midjourney_deepface_calibrated_equalized.csv', header=0, usecols=cols, names=colnames)
    #Midjourney appears to have a file name limitation which cuts off a number of the submitted prompts 
    #at or after the word photorealistic
    #Parsing the image column(raw file name) allows us to extract the base prompt to merge on 
    base_prompt= []
    for row in results['image']:
        try:
            base_prompt.append(row.split('_')[1:-2])
        except:
            base_prompt.append('none')
    results['base_prompt'] = base_prompt
    results['base_prompt'] = results['base_prompt'].apply(lambda x: ' '.join(x)+" photorealistic")
    results['base_prompt'] = results['base_prompt'].str.strip().str.rstrip()
    #Two prompt generations (8 images total) were corrected to singular.
    #While these are grammatically improved, they can no longer be used in the evaluation
    strings = ['a architecture teacher postsecondary',
               'a environmental science teacher postsecondary']
    pat = '|'.join(strings)
    results = results[~results['base_prompt'].str.contains(pat)]
    #Mapping gender detection values to single column
    noface= (results['skincolor'].isnull()).values 
    unknown=  ((results['gender_woman']<=50.000000) & (results['gender_man']<=50.000000)).values
    woman= ((results['gender_woman']>50.000000) & (results['gender_man']<=50.000000)).values
    man= ((results['gender_man']>50.000000) & (results['gender_woman']<=50.000000)).values
    results['gender_detected'] = 0
    results['gender_detected'][noface]='no_face'
    results['gender_detected'][unknown]='unknown'
    results['gender_detected'][woman]='woman'
    results['gender_detected'][man]='man'
    return(results)

def load_prompts(readpath='../../data/text_generation/processed/'):
    cols = [0,1,5]
    colnames = ['full_prompt','tag','prompt_compound']
    prompts = pd.read_csv(readpath+'generated_mj_prompts.csv',header=0, usecols=cols, names=colnames)
    base_prompt =[]
    for line in prompts['full_prompt']:
        line = line.replace('/imagine prompt:','')
        line = line.replace(', photorealistic --s 625',' photorealistic')
        line = line.replace(',',' ')
        line = re.sub(' +',' ',line).strip().rstrip()
        base_prompt.append(line)
    prompts['base_prompt'] = base_prompt  
    return prompts

def load_tda_data(readpath='../../data/text_generation/interim/'):
    cols = [0,4,5]
    colnames = ['tda','tda_compound','tda_sentiment_cat']
    tda_data = pd.read_csv(readpath+'TDA_Bank.csv',header=0, usecols=cols, names=colnames)
    return tda_data

def load_occupation_data(readpath='../../data/text_generation/interim/'):
    cols=[ 'occ_code','norm_title','a_median','wage_cat']
    occupation_data = pd.read_csv(readpath+'AnnualOccupations_TitleBank.csv',usecols=cols)
    return occupation_data

In [2]:
def get_occupation_results():
    results = load_image_analysis_results()
    prompts = load_prompts()
    res_prompts = results.merge(prompts, right_on=['base_prompt'], left_on=['base_prompt'])
    occupations = load_occupation_data()
    occ_results = res_prompts.merge(occupations,right_on=['norm_title'],left_on=['tag'],how='inner')
    return occ_results

def get_tda_results():
    results = load_image_analysis_results()
    prompts = load_prompts()
    res_prompts = results.merge(prompts, right_on=['base_prompt'], left_on=['base_prompt'])
    tda = load_tda_data()
    tda_results = res_prompts.merge(tda,right_on=['tda'],left_on=['tag'],how='inner')
    return tda_results

In [3]:
savepath='../../data/evaluation/processed/'

tda = get_tda_results()
tda.to_csv(savepath+ 'TDA_Results.csv',index=False)

occ = get_occupation_results()
occ.to_csv(savepath+ 'Occupation_Results.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
