# 5. Questionnaire responses parsing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# The submission timestamp is not needed for the analysis, so it is dropped.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument of `drop`.
responses = pd.read_csv('responses.csv').drop(columns='Timestamp')
responses.head()

Replace wanted answers with 1.0, unwanted answers with 0.0 and "can't tell" answers with 0.5. This is why the beard + male section has the male/female values swapped compared to the smile + female + young section.

In [None]:
def _score_answers(answers, scores):
    """Replace every answer text in `answers` by its score, one text at a time."""
    for text, score in scores.items():
        answers = answers.replace(text, score)
    return answers


# eyeglasses (columns 0-19)
responses_1cat = _score_answers(responses.iloc[:, :20], {
    'Áno, človek na obrázku má okuliare': 1.0,
    'Nie, človek na obrázku nemá okuliare': 0.0,
    'Z obrázku neviem povedať, či človek má, alebo nemá okuliare': 0.5,
})

# beard + male (columns 20-59); "male" is the wanted answer here
responses_2cat = _score_answers(responses.iloc[:, 20:60], {
    'Áno, človek na obrázku má bradu': 1.0,
    'Nie, človek na obrázku nemá bradu': 0.0,
    'Z obrázku neviem povedať, či človek má, alebo nemá bradu': 0.5,
    'Človek na obrázku sa podobá skôr na ženu': 0.0,
    'Človek na obrázku sa podobá skôr na muža': 1.0,
    'Z obrázku neviem povedať, či sa človek podobá skôr na ženu, alebo na muža': 0.5,
})

# smile + female + young (columns 60-119); "female" is the wanted answer here
responses_3cat = _score_answers(responses.iloc[:, 60:120], {
    'Áno, človek na obrázku sa usmieva': 1.0,
    'Nie, človek na obrázku sa neusmieva': 0.0,
    'Z obrázku neviem povedať, či sa človek na obrázku usmieva, alebo neusmieva': 0.5,
    'Človek na obrázku sa podobá skôr na ženu': 1.0,
    'Človek na obrázku sa podobá skôr na muža': 0.0,
    'Z obrázku neviem povedať, či sa človek podobá skôr na ženu, alebo na muža': 0.5,
    'Človek na obrázku sa zdá byť skôr mladší': 1.0,
    'Človek na obrázku sa zdá byť skôr starší': 0.0,
    'Z obrázku neviem povedať, či je človek skôr mladší, alebo starší': 0.5,
})

In [None]:
# Creates a donut plot for each column (one attribute question per image) whose
# most common answer does not exceed the given share of all answers.
# Returns the count of these columns and their column names.
def plot_less_than(df, percent):
    """Plot the answer distribution of every column without a dominant answer.

    A column is plotted when the share of its most frequent answer is at most
    `percent`. Columns whose most frequent answer exceeds `percent` are
    skipped, and so are columns without any answers, because there is nothing
    to plot for them.

    Args:
        df: DataFrame with one column of answers per question.
        percent: Threshold compared against the share of the most common
            answer (a fraction of all answers, e.g. 0.8).

    Returns:
        Tuple `(count, colnames)`: the number of plotted columns and their
        names, in column order.
    """
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    colnames = []

    for i, colname in enumerate(df):
        counts = df[colname].value_counts()
        # A column without answers has no most common value; max() of an
        # empty sequence used to raise here.
        if counts.empty:
            continue
        # Skip columns where the most common answer is over the threshold.
        if counts.max() / counts.sum() > percent:
            continue

        colnames.append(colname)
        fig, ax = plt.subplots()
        counts.plot.pie(ax=ax, colors=colors, autopct='%1.1f%%', startangle=90, pctdistance=0.85)

        # A white circle in the middle turns the pie into a donut.
        ax.add_artist(plt.Circle((0, 0), 0.70, fc='white'))

        ax.axis('equal')
        plt.tight_layout()
        # The number in the title is the position of the column inside `df`,
        # not a value parsed from the column name.
        ax.set_title('img-' + str(i + 1))
        plt.show()

    return (len(colnames), colnames)

## Uncertain values plots
The title above each plot (`img-N`) shows the image number.

### Eyeglasses uncertain values

In [None]:
# Share of the most common answer above which a question is no longer
# treated as uncertain.
threshold = 0.8

eyeglasses_count, eyeglasses_colnames = plot_less_than(responses_1cat, threshold)
df_cat1_neutral_count = pd.DataFrame(
    data={'Only Eyeglasses': [eyeglasses_count]},
    index=['Eyeglasses'],
)

### Beard + Male uncertain values

In [None]:
# Select the question columns by their text (the same patterns the accuracy
# cells use) instead of by column stride, so each count provably belongs to
# the question it is named after.
male_count, male_colnames = plot_less_than(responses_2cat.filter(regex='muža'), threshold)
beard_count, beard_colnames = plot_less_than(responses_2cat.filter(regex='bradu'), threshold)

# Values are listed in the same order as the row labels; they used to be
# swapped relative to the labels.
df_cat2_neutral_count = pd.DataFrame({'Male and beard': [male_count, beard_count]}, index=['Male', 'Beard'])

### Smile + young + female uncertain values

In [None]:
# Every third column belongs to the same question; the three strides are
# treated as smile, female and young respectively (plotted in the order
# smile, young, female).
smile_count, smile_colnames = plot_less_than(
    responses_3cat.loc[:, responses_3cat.columns[::3]], threshold)
young_count, young_colnames = plot_less_than(
    responses_3cat.loc[:, responses_3cat.columns[2::3]], threshold)
female_count, female_colnames = plot_less_than(
    responses_3cat.loc[:, responses_3cat.columns[1::3]], threshold)

df_cat3_neutral_count = pd.DataFrame(
    data={'Smile, young and female': [smile_count, young_count, female_count]},
    index=['Smile', 'Young', 'Female'],
)

## Uncertain values counts

In [None]:
# Show the number of uncertain eyeglasses questions as the cell output.
df_cat1_neutral_count

In [None]:
# Show the number of uncertain male and beard questions as the cell output.
df_cat2_neutral_count

In [None]:
# Show the number of uncertain smile, young and female questions as the cell output.
df_cat3_neutral_count

In [None]:
# Total number of questions flagged as uncertain across all three sections.
print(f'Not relevant responses count: {sum((male_count, beard_count, eyeglasses_count, smile_count, young_count, female_count))}')

In [None]:
# Drop the columns that were flagged as uncertain (not relevant) above.
# `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts.
relevant_responses_1cat = responses_1cat.drop(columns=eyeglasses_colnames)
relevant_responses_2cat = responses_2cat.drop(columns=male_colnames + beard_colnames)
relevant_responses_3cat = responses_3cat.drop(columns=smile_colnames + young_colnames + female_colnames)

In [None]:
# split to linear (images 1-9) and arctanh (images 11-19) methods
def split_methods(df):
    """Split the columns of `df` by the image number their name starts with.

    Column names are expected to start with the image number followed by a
    dot (e.g. ``'3. ...'``).

    Args:
        df: DataFrame whose column names start with ``'<number>.'``.

    Returns:
        Tuple `(linear, arctanh)`: the columns numbered 1-9 and the columns
        numbered 11-19. Columns with any other number are in neither frame.
    """
    # Raw strings keep `\.` a regex escape instead of an invalid string escape.
    linear = df.filter(regex=r'^[1-9]\.')
    # The trailing dot keeps longer numbers such as 110 out of the 11-19 range.
    arctanh = df.filter(regex=r'^1[1-9]\.')

    return (linear, arctanh)

In [None]:
# Split only the relevant (not uncertain) columns of each section by method.
(
    (linear_1cat, arctanh_1cat),
    (linear_2cat, arctanh_2cat),
    (linear_3cat, arctanh_3cat),
) = (
    split_methods(section)
    for section in (relevant_responses_1cat, relevant_responses_2cat, relevant_responses_3cat)
)

In [None]:
# get accuracy for each attribute separately, from the most common answer per column
def get_accuracy(attributes):
    """Return the accuracy in percent for each attribute frame.

    For every frame the most common answer of each column is summed and
    divided by the number of columns, so a modal answer of 1.0 counts fully
    and 0.5 counts half.

    Args:
        attributes: Iterable of DataFrames, one per attribute, holding the
            numeric answer scores.

    Returns:
        List with one percentage per frame, in input order. A frame without
        columns yields NaN instead of raising ZeroDivisionError.
    """
    acc = []
    for attr in attributes:
        n_columns = len(attr.columns)
        if n_columns == 0:
            # Nothing left to score, e.g. every column was dropped as uncertain.
            acc.append(float('nan'))
            continue

        total = sum(attr[colname].mode().values[0] for colname in attr)
        acc.append(total / n_columns * 100)

    return acc

## Accuracy per image per relevant attribute answer

In [None]:
# One accuracy per method: get_accuracy returns them in the order the frames are passed.
pd.DataFrame(
    {'Eyeglasses': get_accuracy([linear_1cat, arctanh_1cat])},
    index=['Linear', 'Arctanh'],
)

In [None]:
# Accuracy per question (male, beard) for each method; one table row per method.
linear = get_accuracy([linear_2cat.filter(regex=pattern) for pattern in ('muža', 'bradu')])
arctanh = get_accuracy([arctanh_2cat.filter(regex=pattern) for pattern in ('muža', 'bradu')])
pd.DataFrame([linear, arctanh], index=['Linear', 'Arctanh'], columns=['Male', 'Beard'])

In [None]:
# Accuracy per question (smile, female, young) for each method; one table row per method.
linear = get_accuracy([linear_3cat.filter(regex=pattern) for pattern in ('Usmieva', 'muža', 'starší')])
arctanh = get_accuracy([arctanh_3cat.filter(regex=pattern) for pattern in ('Usmieva', 'muža', 'starší')])
pd.DataFrame([linear, arctanh], index=['Linear', 'Arctanh'], columns=['Smile', 'Female', 'Young'])

In [None]:
# # this is just for thesis figure
# thesis_df = responses_3cat['2. Usmieva sa človek na obrázku?'].rename('')
# my_dpi = 120
# fig1, ax1 = plt.subplots(figsize=(1024/my_dpi, 1024/my_dpi), dpi=my_dpi)
# explode = None
# colors = ['#ff9999','#66b3ff','#99ff99']

# thesis_df.value_counts().plot.pie(colors = colors, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode = explode, textprops={'fontsize': 20, 'fontweight': 'bold'})

# centre_circle = plt.Circle((0,0),0.70,fc='white')
# fig = plt.gcf()
# fig.gca().add_artist(centre_circle)

# ax1.axis('equal')
# plt.tight_layout()
# plt.title('Is the person on the image smiling?', y=0.95, fontsize=26, fontweight='bold')
# plt.show()

Now we are going to count every answer per image.

In [None]:
# Split the full answer frames (uncertain columns included) by method.
(
    (linear_1cat, arctanh_1cat),
    (linear_2cat, arctanh_2cat),
    (linear_3cat, arctanh_3cat),
) = (
    split_methods(section)
    for section in (responses_1cat, responses_2cat, responses_3cat)
)

In [None]:
def get_accuracy_one(df_one, n_images=9):
    """Return the percentage of images whose most common answer is 1.0.

    Args:
        df_one: DataFrame with one column of numeric answer scores per image.
        n_images: Number of images the percentage is taken over. Defaults to
            9, the value that used to be hard-coded.

    Returns:
        Number of columns whose most common answer is 1.0, divided by
        `n_images`, times 100.
    """
    hits = sum(
        1 for one_colname in df_one
        if df_one[one_colname].mode().values[0] == 1.0
    )

    return hits / n_images * 100

In [None]:
def get_accuracy_two(df_one, df_two, n_images=9):
    """Return the percentage of images confirmed in both frames.

    Columns are paired by the number before the first '.' in their names. An
    image is a hit when the most common answer is 1.0 in its column of
    `df_one` and in its column of `df_two`.

    Args:
        df_one: DataFrame of numeric answer scores for the first question.
        df_two: DataFrame of numeric answer scores for the second question.
        n_images: Number of images the percentage is taken over. Defaults to
            9, the value that used to be hard-coded.

    Returns:
        Number of hits divided by `n_images`, times 100.
    """
    from collections import Counter

    def confirmed(df):
        # Image number -> number of columns whose most common answer is 1.0.
        # Each column's mode is computed once instead of once per column pair.
        return Counter(
            int(colname.split('.')[0])
            for colname in df
            if df[colname].mode().values[0] == 1.0
        )

    first, second = confirmed(df_one), confirmed(df_two)
    # Multiplying the counts keeps the pairwise counting of the nested loops.
    hits = sum(n * second[image] for image, n in first.items())
    return hits / n_images * 100

In [None]:
def get_accuracy_three(df_one, df_two, df_three, n_images=9):
    """Return the percentage of images confirmed in all three frames.

    Columns are matched by the number before the first '.' in their names. An
    image is a hit when the most common answer is 1.0 in its column of each
    of the three frames.

    Args:
        df_one: DataFrame of numeric answer scores for the first question.
        df_two: DataFrame of numeric answer scores for the second question.
        df_three: DataFrame of numeric answer scores for the third question.
        n_images: Number of images the percentage is taken over. Defaults to
            9, the value that used to be hard-coded.

    Returns:
        Number of hits divided by `n_images`, times 100.
    """
    from collections import Counter

    def confirmed(df):
        # Image number -> number of columns whose most common answer is 1.0.
        # Each column's mode is computed once instead of once per column triple.
        return Counter(
            int(colname.split('.')[0])
            for colname in df
            if df[colname].mode().values[0] == 1.0
        )

    first, second, third = confirmed(df_one), confirmed(df_two), confirmed(df_three)
    # Multiplying the counts keeps the per-triple counting of the nested loops.
    hits = sum(n * second[image] * third[image] for image, n in first.items())
    return hits / n_images * 100

In [None]:
# Share of images per method whose most common eyeglasses answer is 1.0.
linear, arctanh = (get_accuracy_one(frame) for frame in (linear_1cat, arctanh_1cat))
pd.DataFrame(data={'Eyeglasses': [linear, arctanh]}, index=['Linear', 'Arctanh'])

In [None]:
# Share of images per method confirmed as both male and bearded.
linear, arctanh = (
    get_accuracy_two(frame.filter(regex='muža'), frame.filter(regex='bradu'))
    for frame in (linear_2cat, arctanh_2cat)
)
pd.DataFrame(data={'Male and Beard': [linear, arctanh]}, index=['Linear', 'Arctanh'])

In [None]:
# Share of images per method confirmed as smiling, female and young at once.
linear, arctanh = (
    get_accuracy_three(
        frame.filter(regex='Usmieva'),
        frame.filter(regex='muža'),
        frame.filter(regex='starší'),
    )
    for frame in (linear_3cat, arctanh_3cat)
)
pd.DataFrame(data={'Smile, Female and Young': [linear, arctanh]}, index=['Linear', 'Arctanh'])

## This is the end! Thank you!
This was just a preview of results that we would like to share as an appendix to our thesis.

We have also created an unofficial repo that is full of experiments and uncommented code. Proceed with caution.