In [150]:
import pandas as pd
import os
from sklearn.metrics import accuracy_score, matthews_corrcoef

In [151]:
def accuracy(df):
    return accuracy_score(df['label'],df['preds'])

def miss(df):
    return df[df['label']!=df['preds']]

In [152]:
os.listdir('data/out')

['locationstest_babe_base.csv',
 'factualtest_magpie.csv',
 'prejudicetest_babe_base.csv',
 'locationstest_magpie.csv',
 'factualtest_roberta_anno_lexical.csv',
 'factualtest_babe_base.csv',
 'loadedtest_roberta_anno_lexical.csv',
 'pronounstest_babe_base.csv',
 'prejudicetest_magpie.csv',
 'pronounstest_roberta_anno_lexical.csv',
 'prejudicetest_roberta_anno_lexical.csv',
 'locationstest_roberta_anno_lexical.csv',
 'pronounstest_magpie.csv',
 'loadedtest_magpie.csv',
 'loadedtest_babe_base.csv']

# MFT: factual-test

In this test we took two datasets of very simple, short, factual sentences and we want to test if our model can always predict them as unbiased. An example of a sentence from the dataset: "Food cravings tell us that we want to eat a certain food."

In [153]:
babe=pd.read_csv('data/out/factualtest_babe_base.csv')
synth=pd.read_csv('data/out/factualtest_roberta_anno_lexical.csv')
magpie=pd.read_csv('data/out/factualtest_magpie.csv')

In [154]:
print(accuracy(babe))
print(accuracy(magpie))
print(accuracy(synth))

0.9506953223767383
0.9149810366624526
0.9639696586599241


In [155]:
# sample of 10 sentences where babe classifier wrongly predicts biased but synthetic one doesnt
miss(babe)[~miss(babe)['text'].isin(miss(synth)['text'])].text.sample(10).tolist()

['What do we call a plant that grows where it is not wanted? Weed.',
 'There is no place where it never stops snowing',
 'Nothing in particular happens if you wear a hat made out of tin foil',
 'Yes, duck quacks echo',
 'People would disagree about the best beer in the world',
 'J. B. Rhine attemped to prove the existence of ESP, but his methodology was flawed',
 'Can animals dance like people? No, animals can’t dance like people.',
 'No, there are no carpets that can fly',
 "British people don't have bad teeth",
 'What do we put on to dress up fancy? Fancy clothes and shoes.']

# DIR: loaded-test

In this test, we take unbiased sentences from the BABE test set and inject biased adjectives and adverbs into them, artifically changing the label to biased. The model succeeds if it can change it's prediction to biased. The biased adjectives and adverbs are taken from the bias lexicon. We only take neutral sentences that the model was able to classify as neutral in the first place, to ensure consistency.

In [156]:
babe=pd.read_csv('data/out/loadedtest_babe_base.csv')
synth=pd.read_csv('data/out/loadedtest_roberta_anno_lexical.csv')
magpie=pd.read_csv('data/out/loadedtest_magpie.csv')

In [159]:
babe = babe[babe['text_orig']!=babe['text_loaded']]
magpie = magpie[magpie['text_orig']!=magpie['text_loaded']]
synth = synth[synth['text_orig']!=synth['text_loaded']]

In [160]:
print(accuracy(babe))
print(accuracy(magpie))
print(accuracy(synth))

0.412531328320802
0.6838461538461539
0.6644385026737968


# INV: locations-test

In [170]:
babe=pd.read_csv('data/out/locationstest_babe_base.csv')
synth=pd.read_csv('data/out/locationstest_roberta_anno_lexical.csv')
magpie=pd.read_csv('data/out/locationstest_magpie.csv')

In [171]:
print(accuracy(babe))
print(accuracy(magpie))
print(accuracy(synth))

0.9837004405286344
0.9860986547085202
0.9713968957871397


# INV: pronouns-test

In [172]:
babe=pd.read_csv('data/out/pronounstest_babe_base.csv')
synth=pd.read_csv('data/out/pronounstest_roberta_anno_lexical.csv')
magpie=pd.read_csv('data/out/pronounstest_magpie.csv')

In [173]:
print(accuracy(babe))
print(accuracy(magpie))
print(accuracy(synth))

0.9711884753901561
0.9782608695652174
0.9566787003610109


In [174]:
miss(magpie).text_ner_free.iloc[0]

'They has been teaching a course on terrorism and insurgency at it in Pittsburgh for four years, and much more of his class these days is devoted to white supremacy than in the past.'

In [175]:
miss(magpie).text_orig.iloc[0]

'Colin P. Clarke has been teaching a course on terrorism and insurgency at Carnegie Mellon University in Pittsburgh for four years, and much more of his class these days is devoted to white supremacy than in the past.'

# INV: prejudice-test

In [176]:
babe=pd.read_csv('data/out/prejudicetest_babe_base.csv')
synth=pd.read_csv('data/out/prejudicetest_roberta_anno_lexical.csv')
magpie=pd.read_csv('data/out/prejudicetest_magpie.csv')

In [177]:
babe = babe[babe.label==0]
synth = synth[synth.label==0]
magpie = magpie[magpie.label==0]

In [178]:
print(accuracy(babe))
print(accuracy(magpie))
print(accuracy(synth))

0.8947368421052632
0.891578947368421
0.8526315789473684


In [179]:
for category in babe.category.unique():
    print(category)
    print(f"babe {accuracy(babe[babe.category==category])}")
    print(f"magpie {accuracy(magpie[magpie.category==category])}")
    print(f"synth {accuracy(synth[synth.category==category])}")


Politician
babe 1.0
magpie 1.0
synth 1.0
Gender
babe 0.7133333333333334
magpie 0.6866666666666666
synth 0.5733333333333334
Political Affiliation
babe 0.6866666666666666
magpie 0.76
synth 0.8533333333333334
Occupation
babe 1.0
magpie 0.9866666666666667
synth 1.0
Disability
babe 1.0
magpie 0.98
synth 1.0
Religion
babe 0.9333333333333333
magpie 0.8866666666666667
synth 0.64
Origin
babe 1.0
magpie 1.0
synth 1.0
