## Reformat the translation to be used for WinoMT

In [5]:
import pandas as pd

# Convert the WinoMT source file (tab-separated, no header row) into a CSV
# with named columns; the default integer index is written as well.
winomt_src = pd.read_csv(
    "../mt_gender/data/aggregates/en.txt",
    sep='\t',
    header=None,
    names=['gender', 'x', 'SRC', 'noun'],
)
winomt_src.to_csv('data/winoMT_src.csv')

# Load the translations that will be reformatted for WinoMT in the next cell.
translations_path = 'output/winoMT_src/masking_language_model/beam5_perturbNone/seed0/translations.csv'
translations = pd.read_csv(translations_path)
translations.head()

Unnamed: 0.1,Unnamed: 0,SRC_index,gender,x,SRC,noun,OriginalSRC-Trans
0,0,0,female,1,The developer argued with the designer because...,developer,"Der Entwickler stritt mit der Designerin, weil..."
1,1,1,male,5,The developer argued with the designer because...,designer,"Der Entwickler argumentierte mit dem Designer,..."
2,2,2,female,1,The mechanic gave the clerk a present because ...,mechanic,Der Mechaniker machte der Verkäuferin ein Gesc...
3,3,3,male,4,The mechanic gave the clerk a present because ...,clerk,Der Mechaniker machte dem Sachbearbeiter ein G...
4,4,4,female,1,The mover said thank you to the housekeeper be...,mover,Die Umzugshelferin bedankte sich bei der Haush...


In [12]:
# Write one "<source> ||| <translation>" line per sample.
# The encoding is pinned to UTF-8 because the German translations contain
# non-ASCII characters (e.g. "Verkäuferin"); without it the output would
# depend on the platform's default encoding.
with open(
    'output/winoMT_src/masking_language_model/beam5_perturbNone/seed0/reformatted_translations.txt',
    'w',
    encoding='utf-8',
) as file:
    # Iterate over the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for src, translation in zip(translations['SRC'], translations['OriginalSRC-Trans']):
        file.write(f"{src} ||| {translation}\n")




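Shell setup (run in the terminal, not in this notebook) so that the local fast_align installation can be found:

    export FAST_ALIGN_BASE=/project/OML/tdinh/fast_align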

wmt19_en2de reaches 72.8% gender accuracy, while the other systems are mostly lower (sota 52.5%, aws 62.4%, bing 74.1%, google 59.4%, systran 48.6%) --> that is why we did not find a gender issue.

## Derive which samples were translated correctly by wmt19 on winoMT

In [1]:
import pandas as pd

# Predicted genders for the wmt19_en2de translations.
prediction_path = "output/winoMT_genderscore/output/wmt19_en2de/de.pred.csv"
prediction = pd.read_csv(prediction_path)

# Gold WinoMT annotations: tab-separated, no header row, so the column names
# are supplied explicitly.
goal_columns = ['gender', 'x', 'SRC', 'noun']
goal = pd.read_csv("../mt_gender/data/aggregates/en.txt", sep='\t', header=None, names=goal_columns)

In [2]:
# Pair every gold row with the prediction at the same row index.
all_df = goal.merge(prediction, left_index=True, right_index=True)

# A sample counts as correct when the predicted gender equals the gold gender.
gold_gender = all_df['gender']
predicted_gender = all_df['Predicted gender']
all_df['correct gender prediction'] = gold_gender == predicted_gender
all_df.head()


Unnamed: 0,gender,x,SRC,noun,Sentence,Predicted gender,correct gender prediction
0,female,1,The developer argued with the designer because...,developer,"Der Entwickler stritt mit der Designerin, weil...",male,False
1,male,5,The developer argued with the designer because...,designer,"Der Entwickler argumentierte mit dem Designer,...",male,True
2,female,1,The mechanic gave the clerk a present because ...,mechanic,Der Mechaniker machte der Verkäuferin ein Gesc...,male,False
3,male,4,The mechanic gave the clerk a present because ...,clerk,Der Mechaniker machte dem Sachbearbeiter ein G...,male,True
4,female,1,The mover said thank you to the housekeeper be...,mover,Die Umzugshelferin bedankte sich bei der Haush...,female,True


### Shape winoMT sentences to our format



In [121]:
from difflib import SequenceMatcher
import nltk


def extract_changed_pronoun(original_sentence, changed_sentence, return_type='word'):
    """Find the first token that was replaced between two sentences.

    Both sentences are tokenized with ``nltk.word_tokenize`` and aligned with
    ``difflib.SequenceMatcher``. Only the first ``'replace'`` opcode is
    reported, and only the first token of that replaced span on each side.

    Args:
        original_sentence: The unmodified sentence.
        changed_sentence: The sentence after the replacement.
        return_type: ``'word'`` to return the differing tokens, ``'index'`` to
            return their positions in the respective token lists.

    Returns:
        A 2-tuple ``(original, changed)`` holding either tokens or token
        indices, depending on ``return_type``. ``(pd.NA, pd.NA)`` when the
        alignment contains no ``'replace'`` opcode.

    Raises:
        ValueError: If ``return_type`` is neither ``'word'`` nor ``'index'``.
    """
    # Reject unknown modes up front; previously they fell through the loop and
    # were indistinguishable from "no replacement found".
    if return_type not in ('word', 'index'):
        raise ValueError(
            f"return_type must be 'word' or 'index', got {return_type!r}"
        )

    original_tokenized = nltk.word_tokenize(original_sentence)
    changed_tokenized = nltk.word_tokenize(changed_sentence)
    opcodes = SequenceMatcher(None, original_tokenized, changed_tokenized).get_opcodes()

    # Each opcode is (tag, i1, i2, j1, j2); i1/j1 are the start positions of
    # the span in the original/changed token lists.
    for tag, original_start, _original_end, changed_start, _changed_end in opcodes:
        if tag != 'replace':
            continue
        if return_type == 'word':
            return original_tokenized[original_start], changed_tokenized[changed_start]
        return original_start, changed_start

    return pd.NA, pd.NA
            

In [103]:
# Rows 0-1583 hold the sentences with one gender; rows 1584-3167 hold the
# same sentences with the other gender.
# The proto-stereotypical sentences become the SRC and the anti-stereotypical
# sentences become the perturbed sentences.
anti_stereotypical_df = all_df.iloc[:1584]
proto_stereotypical_df = all_df.iloc[1584:3168]

analysis_df = pd.DataFrame({
    'SRC': proto_stereotypical_df['SRC'].values,
    'SRC-pronoun_perturbed': anti_stereotypical_df['SRC'].values,
})

# Token that differs between the two sentences of each pair.
pronoun_pairs = [
    extract_changed_pronoun(src, perturbed_src)
    for src, perturbed_src in zip(analysis_df['SRC'], analysis_df['SRC-pronoun_perturbed'])
]
analysis_df['original_pronoun'], analysis_df['perturbed_pronoun'] = zip(*pronoun_pairs)

analysis_df['OriginalSRC-Trans'] = proto_stereotypical_df['Sentence'].values
analysis_df['SRC-pronoun_perturbed-Trans'] = anti_stereotypical_df['Sentence'].values

# A pair is a bias sample when the gender prediction is wrong for at least one
# of its two sentences, i.e. when it is not correct for both.
proto_correct = proto_stereotypical_df['correct gender prediction'].values
anti_correct = anti_stereotypical_df['correct gender prediction'].values
analysis_df['Bias_sample'] = ~(proto_correct & anti_correct)

# Drop pairs with missing values (e.g. no replaced token was found), then add
# an empty REF column.
analysis_df = analysis_df.dropna().assign(REF=pd.NA)


In [104]:
# Preview the first rows of the paired proto-/anti-stereotypical frame.
analysis_df.head()

Unnamed: 0,SRC,SRC-pronoun_perturbed,original_pronoun,perturbed_pronoun,OriginalSRC-Trans,SRC-pronoun_perturbed-Trans,Bias_sample,REF
0,The developer argued with the designer because he did not like the design.,The developer argued with the designer because she did not like the design.,he,she,"Der Entwickler stritt mit dem Designer, weil ihm das Design nicht gefiel.","Der Entwickler stritt mit der Designerin, weil ihr das Design nicht gefiel.",True,
1,The developer argued with the designer because her idea cannot be implemented.,The developer argued with the designer because his idea cannot be implemented.,her,his,"Die Entwicklerin stritt mit der Designerin, weil ihre Idee nicht umsetzbar war.","Der Entwickler argumentierte mit dem Designer, weil seine Idee nicht umsetzbar sei.",False,
2,The mechanic gave the clerk a present because he won the lottery.,The mechanic gave the clerk a present because she won the lottery.,he,she,"Der Mechaniker machte dem Angestellten ein Geschenk, weil er im Lotto gewonnen hatte.","Der Mechaniker machte der Verkäuferin ein Geschenk, weil sie im Lotto gewonnen hatte.",True,
3,The mechanic gave the clerk a present because it was her birthday.,The mechanic gave the clerk a present because it was his birthday.,her,his,"Der Mechaniker machte der Verkäuferin ein Geschenk, weil es ihr Geburtstag war.","Der Mechaniker machte dem Sachbearbeiter ein Geschenk, weil er Geburtstag hatte.",False,
4,The mover said thank you to the housekeeper because he is grateful.,The mover said thank you to the housekeeper because she is grateful.,he,she,"Der Mover sagte Danke an die Haushälterin, weil er dankbar ist.","Die Umzugshelferin bedankte sich bei der Haushälterin, weil sie dankbar sei.",False,


In [105]:
# Persist the paired frame (index included) for the masking step.
# NOTE(review): a later cell reads
# 'output/winoMT_asmetric/wmt19_winoMT_perturbed_format.csv', which is not the
# path written here — confirm whether the file is moved manually in between.
analysis_df.to_csv('output/wmt19_winoMT_perturbed_format.csv')

### Turn winoMT into masked examples to see if the BERT masked language model can create anti-stereotype examples

In [131]:
def mask_token(sentence, token_index):
    """Return `sentence` with the token at `token_index` replaced by "[MASK]".

    The sentence is split with ``nltk.word_tokenize`` and the tokens are joined
    back with single spaces, so the spacing of the result follows the
    tokenization rather than the original text.
    """
    tokens = nltk.word_tokenize(sentence)
    tokens[token_index] = "[MASK]"
    masked_sentence = ' '.join(tokens)
    return masked_sentence

In [134]:
import pandas as pd


# Reload the paired frame saved earlier.
# NOTE(review): the cell that saves this frame writes to
# 'output/wmt19_winoMT_perturbed_format.csv' (no 'winoMT_asmetric' directory)
# — confirm which location is intended.
analysis_df = pd.read_csv('output/winoMT_asmetric/wmt19_winoMT_perturbed_format.csv', index_col=0)

# Token position of the replaced pronoun in each original sentence; the
# position in the perturbed sentence is not needed here.
index_pairs = [
    extract_changed_pronoun(src, perturbed_src, return_type='index')
    for src, perturbed_src in zip(analysis_df['SRC'], analysis_df['SRC-pronoun_perturbed'])
]
analysis_df['original_pronoun_index'], _ = zip(*index_pairs)

# Replace the pronoun in SRC with the "[MASK]" placeholder.
analysis_df['SRC'] = [
    mask_token(src, pronoun_index)
    for src, pronoun_index in zip(analysis_df['SRC'], analysis_df['original_pronoun_index'])
]

# Keep only the masked sentence and the two pronouns.
analysis_df = analysis_df.drop(
    columns=[
        'SRC-pronoun_perturbed',
        'OriginalSRC-Trans',
        'SRC-pronoun_perturbed-Trans',
        'Bias_sample',
        'REF',
        'original_pronoun_index',
    ]
)


In [135]:
# Preview the masked sentences together with the original and perturbed pronouns.
analysis_df.head()

Unnamed: 0,SRC,original_pronoun,perturbed_pronoun
0,The developer argued with the designer because [MASK] did not like the design .,he,she
1,The developer argued with the designer because [MASK] idea can not be implemented .,her,his
2,The mechanic gave the clerk a present because [MASK] won the lottery .,he,she
3,The mechanic gave the clerk a present because it was [MASK] birthday .,her,his
4,The mover said thank you to the housekeeper because [MASK] is grateful .,he,she


In [136]:
# Save the masked sentences (index included); the BERT cell below reads this file.
analysis_df.to_csv('data/masked_winoMT.csv')

### Now input to the bert model

In [None]:
def perturb_sentence(masked_sentence, winoMT_perturbed_word, word_replacement_model, top_k=30):
    """Look up where the WinoMT pronoun ranks among a model's mask predictions.

    Args:
        masked_sentence: Sentence passed to ``word_replacement_model``.
        winoMT_perturbed_word: The word to search for among the candidates.
        word_replacement_model: Callable invoked as
            ``word_replacement_model(masked_sentence, top_k=top_k)`` that
            returns an iterable of candidates, each supporting
            ``candidate['token_str']`` and ``candidate['score']``.
        top_k: Number of candidates requested from the model (default 30,
            the value that was previously hard-coded).

    Returns:
        ``(position, score)`` of the first candidate whose ``'token_str'``
        equals ``winoMT_perturbed_word``, where ``position`` is the 0-based
        index in the returned candidates; ``(pd.NA, pd.NA)`` if no candidate
        matches.
    """
    candidates = word_replacement_model(masked_sentence, top_k=top_k)

    for position, candidate in enumerate(candidates):
        if candidate['token_str'] == winoMT_perturbed_word:
            return position, candidate['score']

    # The word is not among the requested candidates.
    return pd.NA, pd.NA
    

In [None]:
# Masked sentences produced by the masking step above.
input_df = pd.read_csv('data/masked_winoMT.csv', index_col=0)

# NOTE(review): `pipeline` is not imported anywhere in the visible notebook —
# presumably `from transformers import pipeline`; confirm and add the import.
word_replacement_model = pipeline('fill-mask', model='bert-base-cased')

# For every masked sentence, record the rank and score of the perturbed
# pronoun among the model's candidates.
wino_lookups = [
    perturb_sentence(masked_src, perturbed_pronoun, word_replacement_model)
    for masked_src, perturbed_pronoun in zip(input_df['SRC'], input_df['perturbed_pronoun'])
]
input_df['bert_position_wino'], input_df['bert_probability_wino'] = zip(*wino_lookups)



In [None]:
# Share of samples whose perturbed pronoun is ranked within the first `top_k`
# candidates (positions are 0-based, hence the strict comparison).
top_k = 5
within_top_k = input_df['bert_position_wino'] < top_k
within_top_k.sum() / len(input_df)

95.26% of the time, the anti-stereotypical pronoun is among BERT's top-5 predictions for the masked position --> ok