# Results Analysis

In [59]:
import pandas as pd
import altair as alt

## Load data

In [8]:
# Shell escape: list the contents of the data directory that the cells below read from.
!ls ../data

[old]evaluation_results_en.csv ignore_personal_info
[old]evaluation_results_uk.csv optimized_parameters
baseline                       reasoning
baseline.zip                   recruiter_guidelines
en_data_samples.csv            second_model_verification
evaluation_results_en.csv      uk_data_samples.csv
evaluation_results_uk.csv      zero_shot_cot
groups.json


In [9]:
# Read the English evaluation results and preview the first five rows.
df_en = pd.read_csv(filepath_or_buffer='../data/evaluation_results_en.csv')
df_en.head(n=5)

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,baseline,gender,en,0.7957,0.948,1.0,"{'Agender': 0.6, 'Androgynous': 0.6089, 'Bigen...","{'Agender': 0.08, 'Androgynous': 0.0756, 'Bige..."
1,baseline,marital_status,en,0.816,0.9466,1.0,"{'Civil union': 0.5267, 'Divorced (Divorced)':...","{'Civil union': 0.0756, 'Divorced (Divorced)':..."
2,baseline,military_status,en,0.7607,0.938,1.0,"{'Civilian': 0.5889, 'Military retiree': 0.52,...","{'Civilian': 0.0889, 'Military retiree': 0.06,..."
3,baseline,religion,en,0.7758,0.9491,1.0,"{'atheist': 0.5911, 'buddhist': 0.4356, 'chris...","{'atheist': 0.0956, 'buddhist': 0.0778, 'chris..."
4,baseline,name,en,0.7456,0.898,1.0,"{'Aaron': 0.5778, 'Amartol': 0.5533, 'Emma': 0...","{'Aaron': 0.0533, 'Amartol': 0.0467, 'Emma': 0..."


In [10]:
# Read the Ukrainian evaluation results and preview the first five rows.
df_uk = pd.read_csv(filepath_or_buffer='../data/evaluation_results_uk.csv')
df_uk.head(n=5)

Unnamed: 0,experiment_name,protected_group,lang,min_feedback_similarity,median_feedback_similarity,max_feedback_similarity,mean_reject_approve_per_attr,mean_bias_per_attr
0,baseline,gender,uk,0.7906,0.9389,1.0,"{'Інтерсекс': 0.1622, 'Агендер': 0.14, 'Андрог...","{'Інтерсекс': 0.1044, 'Агендер': 0.0822, 'Андр..."
1,baseline,marital_status,uk,0.8416,0.9401,1.0,"{'Вдовець/Вдова': 0.14, 'Неодружений/Неодружен...","{'Вдовець/Вдова': 0.0644, 'Неодружений/Неодруж..."
2,baseline,military_status,uk,0.8116,0.9368,0.9998,"{'Ветеран війни': 0.0956, 'Військовий пенсіоне...","{'Ветеран війни': 0.0489, 'Військовий пенсіоне..."
3,baseline,religion,uk,0.781,0.9394,1.0,"{'атеїст': 0.16, 'буддист': 0.0867, 'джайніст'...","{'атеїст': 0.1133, 'буддист': 0.0489, 'джайніс..."
4,baseline,name,uk,0.8144,0.9323,1.0,"{'Аарон': 0.2133, 'Амартол': 0.2333, 'Емма': 0...","{'Аарон': 0.0822, 'Амартол': 0.1156, 'Емма': 0..."


# Map Ukrainian protected attributes to English

In [131]:
import ast

from translitua import translit

# Lookup used to relabel Ukrainian attribute values with their English counterparts:
#     ATTR_MATCHER[protected_group][ukrainian_attribute] -> english_attribute
# The 'age' group is skipped, so it never gets an entry.
ATTR_MATCHER = {}
for group in df_uk.protected_group.unique():
    if group == 'age':
        continue

    if group == 'name':
        # Names are transliterated rather than looked up. The Ukrainian names are
        # the keys of the dict stored (as text) in the first row for this group.
        serialized = df_uk[df_uk.protected_group == group].mean_reject_approve_per_attr.values[0]
        # literal_eval only accepts Python literals, so CSV content cannot execute code
        # the way a bare eval() would allow.
        uk_names = ast.literal_eval(serialized).keys()
        ATTR_MATCHER[group] = {uk_name: translit(uk_name) for uk_name in uk_names}
    else:
        # The two files are paired line by line (line i of *_ukr.txt maps to line i
        # of *_en.txt). Context managers make sure both handles are closed.
        # NOTE(review): encoding is pinned to UTF-8 so the Cyrillic file does not depend
        # on the platform default — confirm the files are stored as UTF-8.
        with open(f'../protected_groups/{group}_en.txt', 'r', encoding='utf-8') as en_file:
            en_lines = en_file.readlines()
        with open(f'../protected_groups/{group}_ukr.txt', 'r', encoding='utf-8') as uk_file:
            uk_lines = uk_file.readlines()
        # zip() stops at the shorter file, exactly as the original pairing did.
        ATTR_MATCHER[group] = {
            uk_line.strip(): en_line.strip()
            for en_line, uk_line in zip(en_lines, uk_lines)
        }

## Additional Functions

In [56]:
def plot_feedback_similarity(df, title):
    """Build a layered chart of min/median/max feedback similarity per protected group.

    Args:
        df: DataFrame holding a ``protected_group`` column plus the
            ``min_feedback_similarity``, ``median_feedback_similarity`` and
            ``max_feedback_similarity`` columns.
        title: Title placed on the line layer of the chart.

    Returns:
        An Altair layered chart (lines, dashed red rules, points) with a shared y scale.
    """
    similarity_columns = [
        'min_feedback_similarity',
        'median_feedback_similarity',
        'max_feedback_similarity',
    ]
    # Long format: one row per (protected_group, similarity type).
    long_df = df.melt(
        id_vars='protected_group',
        value_vars=similarity_columns,
        var_name='feedback_similarity_type',
        value_name='feedback_similarity',
    )

    x_axis = alt.X('protected_group', title='Protected group', axis=alt.Axis(labelAngle=-45, labelFontSize=14))
    y_axis = alt.Y('feedback_similarity', scale=alt.Scale(domain=[0.6, 1]), title='Feedback similarity')

    # One line per similarity type; this layer carries the title and width.
    lines_layer = (
        alt.Chart(long_df)
        .mark_line()
        .encode(x=x_axis, y=y_axis, color='feedback_similarity_type:N')
        .properties(title=title, width=300)
    )

    # Dashed red rule at every protected group (drawn from the wide frame).
    rules_layer = (
        alt.Chart(df)
        .mark_rule(color='red', strokeDash=[5, 5])
        .encode(x='protected_group')
    )

    # Markers on top of the line values.
    points_layer = (
        alt.Chart(long_df)
        .mark_circle(size=100)
        .encode(
            x='protected_group',
            y='feedback_similarity',
            color='feedback_similarity_type:N',
        )
    )

    return alt.layer(lines_layer, rules_layer, points_layer).resolve_scale(y='shared')

def feedback_similarity_plots(df_en_exp, df_uk_exp, experiment_name):
    """Place the English and Ukrainian feedback-similarity charts side by side.

    Args:
        df_en_exp: English results, passed to ``plot_feedback_similarity``.
        df_uk_exp: Ukrainian results, passed to ``plot_feedback_similarity``.
        experiment_name: Name interpolated into the overall chart title.

    Returns:
        A horizontally concatenated Altair chart with a shared y scale and a
        centred, enlarged title.
    """
    panels = [
        plot_feedback_similarity(df_en_exp, 'English'),
        plot_feedback_similarity(df_uk_exp, 'Ukrainian'),
    ]
    side_by_side = alt.hconcat(*panels).resolve_scale(y='shared')
    titled = side_by_side.properties(title=f'Feedback similarity: {experiment_name} experiment')
    # Larger, centred main title.
    return titled.configure_title(fontSize=20, anchor='middle')

In [149]:
def viz_reject_approve_per_attr(df, protected_group, experiments_list, language='en'):
    """Plot the mean reject/approve ratio per attribute for one protected group.

    Args:
        df: Evaluation results with ``experiment_name``, ``protected_group`` and
            ``mean_reject_approve_per_attr`` columns; the last one holds a dict
            literal serialized as text.
        protected_group: Value of ``protected_group`` to plot.
        experiments_list: Experiment names to overlay, one colour per experiment.
        language: When ``'uk'`` (and the group is not ``'age'``), attribute labels
            are replaced through ``ATTR_MATCHER[protected_group]``.

    Returns:
        An Altair layered chart (lines, points, dashed red rules), 700 wide.

    Raises:
        IndexError: If ``df`` has no row for an experiment / protected group pair.
        KeyError: If an attribute has no entry in ``ATTR_MATCHER[protected_group]``.
    """
    import ast  # local import: safe parser for the dict literals stored in the CSV

    frames = []
    for experiment_name in experiments_list:
        matching = df[(df['experiment_name'] == experiment_name) & (df['protected_group'] == protected_group)]
        if matching.empty:
            # Same exception type the bare `.values[0]` produced, with a usable message.
            raise IndexError(
                f'no row for experiment {experiment_name!r} and protected group {protected_group!r}'
            )

        # literal_eval accepts only Python literals, unlike eval(), so file content
        # cannot run arbitrary code.
        ratios = ast.literal_eval(matching['mean_reject_approve_per_attr'].values[0])

        if language == 'uk' and protected_group != 'age':
            # Relabel Ukrainian attributes with their English counterparts.
            ratios = {ATTR_MATCHER[protected_group][attr]: value for attr, value in ratios.items()}

        frame = pd.DataFrame(list(ratios.items()), columns=['attribute', 'mean_reject_approve'])
        frame['experiment_name'] = experiment_name
        frames.append(frame)

    df_all_experiments = pd.concat(frames, ignore_index=True)

    # Pad the y domain slightly so extreme points do not sit on the frame.
    y_domain = [
        df_all_experiments.mean_reject_approve.min() - 0.01,
        df_all_experiments.mean_reject_approve.max() + 0.01,
    ]

    # create line plot
    line = alt.Chart(df_all_experiments).mark_line().encode(
        x=alt.X('attribute:N', title='Attribute', axis=alt.Axis(labelAngle=-45, labelFontSize=14)),
        y=alt.Y('mean_reject_approve:Q', scale=alt.Scale(domain=y_domain), title='Mean reject/approve ratio'),
        color=alt.Color('experiment_name:N', title='experiment name'),
    )

    # create scatter points
    scatter = alt.Chart(df_all_experiments).mark_circle(size=100).encode(
        x='attribute',
        y='mean_reject_approve',
        color=alt.Color('experiment_name:N', title='experiment name'),
    )

    # create vertical lines
    vertical_lines = alt.Chart(df_all_experiments).mark_rule(color='red', strokeDash=[5, 5]).encode(
        x='attribute',
    )

    # combine the line plot, scatter points, and vertical lines
    chart = alt.layer(line, scatter, vertical_lines).properties(
        title=f'Mean reject/approve ratio per attribute for {protected_group}',
        width=700,
    )

    return chart

def viz_mean_bias_per_attr(df, protected_group, experiments_list, language='en'):
    """Plot the mean bias per attribute for one protected group.

    Args:
        df: Evaluation results with ``experiment_name``, ``protected_group`` and
            ``mean_bias_per_attr`` columns; the last one holds a dict literal
            serialized as text.
        protected_group: Value of ``protected_group`` to plot.
        experiments_list: Experiment names to overlay, one colour per experiment.
        language: When ``'uk'`` (and the group is not ``'age'``), attribute labels
            are replaced through ``ATTR_MATCHER[protected_group]``.

    Returns:
        An Altair layered chart (lines, points, dashed red rules), 700 wide.

    Raises:
        IndexError: If ``df`` has no row for an experiment / protected group pair.
        KeyError: If an attribute has no entry in ``ATTR_MATCHER[protected_group]``.
    """
    import ast  # local import: safe parser for the dict literals stored in the CSV

    frames = []
    for experiment_name in experiments_list:
        matching = df[(df['experiment_name'] == experiment_name) & (df['protected_group'] == protected_group)]
        if matching.empty:
            # Same exception type the bare `.values[0]` produced, with a usable message.
            raise IndexError(
                f'no row for experiment {experiment_name!r} and protected group {protected_group!r}'
            )

        # literal_eval accepts only Python literals, unlike eval(), so file content
        # cannot run arbitrary code.
        biases = ast.literal_eval(matching['mean_bias_per_attr'].values[0])

        if language == 'uk' and protected_group != 'age':
            # Relabel Ukrainian attributes with their English counterparts.
            biases = {ATTR_MATCHER[protected_group][attr]: value for attr, value in biases.items()}

        frame = pd.DataFrame(list(biases.items()), columns=['attribute', 'mean_bias'])
        frame['experiment_name'] = experiment_name
        frames.append(frame)

    df_all_experiments = pd.concat(frames, ignore_index=True)

    # Pad the y domain slightly so extreme points do not sit on the frame.
    y_domain = [
        df_all_experiments.mean_bias.min() - 0.01,
        df_all_experiments.mean_bias.max() + 0.01,
    ]

    # create line plot
    line = alt.Chart(df_all_experiments).mark_line().encode(
        x=alt.X('attribute:N', title='Attribute', axis=alt.Axis(labelAngle=-45, labelFontSize=14)),
        y=alt.Y('mean_bias:Q', scale=alt.Scale(domain=y_domain), title='Mean bias'),
        color=alt.Color('experiment_name:N', title='experiment name'),
    )

    # create scatter points
    scatter = alt.Chart(df_all_experiments).mark_circle(size=100).encode(
        x='attribute',
        y='mean_bias',
        color=alt.Color('experiment_name:N', title='experiment name'),
    )

    # create vertical lines
    vertical_lines = alt.Chart(df_all_experiments).mark_rule(color='red', strokeDash=[5, 5]).encode(
        x='attribute',
    )

    # combine the line plot, scatter points, and vertical lines
    chart = alt.layer(line, scatter, vertical_lines).properties(
        title=f'Mean bias per attribute for {protected_group}',
        width=700,
    )

    return chart


def bias_plots(df_en, df_uk, protected_group, experiments):
    """Stack the English and Ukrainian bias charts for one protected group.

    For each language, the reject/approve-ratio chart and the mean-bias chart are
    placed side by side under a bold heading; the two language blocks are then
    stacked vertically with a shared x scale.

    Args:
        df_en: English evaluation results.
        df_uk: Ukrainian evaluation results.
        protected_group: Protected group forwarded to both viz functions.
        experiments: Experiment names forwarded to both viz functions.

    Returns:
        A vertically concatenated Altair chart.
    """
    chart_width = 700  # fixed width applied to each individual chart

    def _language_block(df, language, heading):
        # One heading row plus the two charts side by side, sharing the colour scale.
        ratio_chart = viz_reject_approve_per_attr(df, protected_group, experiments, language).properties(width=chart_width)
        bias_chart = viz_mean_bias_per_attr(df, protected_group, experiments, language).properties(width=chart_width)
        heading_chart = alt.Chart().mark_text(text=heading, size=25, fontWeight='bold').properties(width=chart_width * 2)
        return alt.vconcat(heading_chart, ratio_chart | bias_chart).resolve_scale(color='shared')

    plot_en = _language_block(df_en, 'en', "English: Bias Analysis")
    # Heading names the language ("Ukrainian"), matching "English" above and the labels
    # used by feedback_similarity_plots; it previously read "Ukraine".
    plot_uk = _language_block(df_uk, 'uk', "Ukrainian: Bias Analysis")

    # combine the plots
    combined_plot = alt.vconcat(plot_en, plot_uk).resolve_scale(x="shared")
    return combined_plot

## Baseline Analysis

In [11]:
# Keep only the baseline experiment rows for each language.
df_en_baseline = df_en.loc[df_en['experiment_name'] == 'baseline']
df_uk_baseline = df_uk.loc[df_uk['experiment_name'] == 'baseline']

In [57]:
# Feedback similarity for the baseline run, English next to Ukrainian.
feedback_similarity_plots(
    df_en_exp=df_en_baseline,
    df_uk_exp=df_uk_baseline,
    experiment_name='baseline',
)

### Gender

In [139]:
# Baseline bias charts for the gender group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="gender",
    experiments=['baseline'],
)

### Marital Status

In [143]:
# Baseline bias charts for the marital_status group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="marital_status",
    experiments=['baseline'],
)

### Military Status

In [144]:
# Baseline bias charts for the military_status group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="military_status",
    experiments=['baseline'],
)

### Religion

In [145]:
# Baseline bias charts for the religion group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="religion",
    experiments=['baseline'],
)

### Name

In [147]:
# Baseline bias charts for the name group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="name",
    experiments=['baseline'],
)

### Age

In [150]:
# Baseline bias charts for the age group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="age",
    experiments=['baseline'],
)

## Mitigation Techniques Analysis: Feedback Similarity

### optimized_parameters

In [152]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "optimized_parameters"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)


### ignore_personal_info

In [153]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "ignore_personal_info"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)

### zero_shot_cot

In [154]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "zero_shot_cot"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)

### recruiter_guidelines

In [155]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "recruiter_guidelines"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)

### reasoning

In [156]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "reasoning"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)

### second_model_verification

In [157]:
# Feedback similarity for one mitigation experiment, selected by name in both languages.
type_ = "second_model_verification"
feedback_similarity_plots(
    df_en_exp=df_en.loc[df_en['experiment_name'] == type_],
    df_uk_exp=df_uk.loc[df_uk['experiment_name'] == type_],
    experiment_name=type_,
)

## Mitigation Techniques Analysis: Bias Analysis

### Gender

In [158]:
# Compare the baseline against every mitigation technique for the gender group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="gender",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)

### Marital Status

In [159]:
# Compare the baseline against every mitigation technique for the marital_status group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="marital_status",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)

### Military Status

In [160]:
# Compare the baseline against every mitigation technique for the military_status group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="military_status",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)

### Religion

In [161]:
# Compare the baseline against every mitigation technique for the religion group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="religion",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)

### Name

In [162]:
# Compare the baseline against every mitigation technique for the name group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="name",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)

### Age

In [163]:
# Compare the baseline against every mitigation technique for the age group.
bias_plots(
    df_en=df_en,
    df_uk=df_uk,
    protected_group="age",
    experiments=[
        'baseline',
        'optimized_parameters',
        'ignore_personal_info',
        'zero_shot_cot',
        'recruiter_guidelines',
        'reasoning',
        'second_model_verification',
    ],
)