In [62]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
import gensim.downloader as api
from gensim.models.fasttext import load_facebook_model
import random
from difflib import SequenceMatcher
from scipy import stats
import sacrebleu
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os
import spacy


# nltk.download()

German word2vec model Facebook https://fasttext.cc/docs/en/crawl-vectors.html (cc.de.300.bin)

In [63]:
# Code taken from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

In [64]:
# https://docs.python.org/3/library/difflib.html
    
def changes_spread(original_tokenized, changed_tokenized, opcodes):
    start_change = -1
    end_change = -1
    for opcode in opcodes:
        if opcode[0] != 'equal':
            start_change = opcode[1]
            break
    for opcode in reversed(opcodes):
        if opcode[0] != 'equal':
            end_change = opcode[2]
            break
    return max(0, end_change-start_change)/len(changed_tokenized)

    

In [65]:
def highlight_in_capital(sentence_tokenized, highlight_positions):
    """
    Params:
        sentence_tokenized: tokenzied sentence
        highlight_positions: list of 2-sized tuples: [(p1, p2), (p3,p4), ...]
            where we want to highlight sentence[p1:p2], sentence[p3:p4]
    """
    highlighted_sentence = []
    
    last = 0  # index of the last position added to the new sentence
    for (start, stop) in highlight_positions:
        highlighted_sentence.extend(
            sentence_tokenized[last:start] + \
            [w.upper() for w in sentence_tokenized[start:stop]]
        )
        last = stop
    if last < len(sentence_tokenized):
        highlighted_sentence.extend(
            sentence_tokenized[last:]
        )
    return ' '.join(highlighted_sentence)

In [66]:
def two_chunk_changed(original_tokenized, changed_tokenized, opcodes, 
                      chunk_max_length=1, spacy_model=None, w2v_model=None):
    # Return the original and changed sentences with the chunk highlighted in capital
    # Return whether this sentence has only two chunk changes within the max length. 
    # And return the distance between the two changed chunks
    
    is_two_chunk_changed = False
    chunk_distance = pd.NA
    is_same_subtree = pd.NA
    changes_similarity = pd.NA
    
    
    
    changes_types = [o[0] for o in opcodes]
    
    # If not exactly two changes, return
    if not (all(changes_type == 'replace' or changes_type == 'equal' for changes_type in changes_types) and \
        changes_types.count('replace') == 2):
        return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity
    
    # Find the positions of the two changed chunks
    i_replace = [i for i, change in enumerate(changes_types) if change == "replace"]
    
    # If two changed chunks not have length less than chunk_max_length, return
    if not (opcodes[i_replace[0]][2] - opcodes[i_replace[0]][1] <= chunk_max_length and \
            opcodes[i_replace[1]][2] - opcodes[i_replace[1]][1] <= chunk_max_length):
        return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity
    
    # At this point, this should be a valid two_chunk within length change
    is_two_chunk_changed = True
    
    # Check if there is indeed an equal chunks in between of the two changed chunk
    # Calculate the distance between two chunks = the equal chunk in between
    i_equal_in_between = (i_replace[1] + i_replace[0]) // 2
    assert opcodes[i_equal_in_between][0] == 'equal'
    chunk_distance = opcodes[i_equal_in_between][2] - opcodes[i_equal_in_between][1]


    if spacy_model is not None: 
        # In the two_chunk_changed case when chunk_max_length=1, i.e., only two words are changed 
        # comparing to the original translation
        # Check if the two changed words are in the same sub tree of the dependency tree
        if (opcodes[i_replace[0]][4] - opcodes[i_replace[0]][3] == 1 and \
            opcodes[i_replace[1]][4] - opcodes[i_replace[1]][3] == 1):
            # Find the ancestors and children of the two changed words
            doc = spacy_model(' '.join(changed_tokenized))
            token1, token2 = None, None
            family1, family2 = None, None
            for token in doc:
                if token.text == changed_tokenized[opcodes[i_replace[0]][3]]:
                    token1 = token.text
                    family1 = list(token.ancestors) + list(token.children)
                    family1 = [t.text for t in family1]
                elif token.text == changed_tokenized[opcodes[i_replace[1]][3]]:
                    token2 = token.text
                    family2 = list(token.ancestors) + list(token.children)
                    family2 = [t.text for t in family2]

            if token1 is None or token2 is None:
                is_same_subtree = pd.NA
            else:
                if token1 in family2 or token2 in family1:
                    is_same_subtree = True
                else:
                    is_same_subtree = False


    # Calculate the senmatic similarity of the two changed words (cosine similarity in [-1, 1])
    if w2v_model is not None:
        # Can only calculate when only two single tokens are changed
        if (opcodes[i_replace[0]][4] - opcodes[i_replace[0]][3] == 1 and \
            opcodes[i_replace[1]][4] - opcodes[i_replace[1]][3] == 1 and \
            opcodes[i_replace[0]][2] - opcodes[i_replace[0]][1] == 1 and \
            opcodes[i_replace[1]][2] - opcodes[i_replace[1]][1] == 1):

            original_word_1 = original_tokenized[opcodes[i_replace[0]][1]]
            changed_word_1 = changed_tokenized[opcodes[i_replace[0]][3]]

            original_word_2 = original_tokenized[opcodes[i_replace[1]][1]]
            changed_word_2 = changed_tokenized[opcodes[i_replace[1]][3]]

            if original_word_1 in w2v_model.index_to_key and original_word_2 in w2v_model.index_to_key and \
                changed_word_1 in w2v_model.index_to_key and changed_word_2 in w2v_model.index_to_key:
                changes_similarity = [{'original_word': original_word_1, 
                                       'changed_word': changed_word_1, 
                                       'semantic_similarity': w2v_model.similarity(original_word_1, changed_word_1)},
                                      {'original_word': original_word_2,
                                       'changed_word': changed_word_2,
                                       'semantic_similarity': w2v_model.similarity(original_word_2, changed_word_2)}]


    return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity
    
    
def highlight_changes(original_tokenized, changed_tokenized, opcodes):
    """
    Params:
        original_tokenized: tokenized original sentence
        changed_tokenized: tokenized changed sentence
        opcodes: changes to get from `original_tokenized` to `changed_tokenized`
    Returns:
        original_sentence and changed_sentence with the changes highlighted in capital
    """
    
    highlighted_original_sentence_positions = []
    highlighted_changed_sentence_positions = []
    
    for opcode in opcodes:
        tag, i1, i2, j1, j2 = opcode[0], opcode[1], opcode[2], opcode[3], opcode[4]
        
        if tag != 'equal':
            highlighted_original_sentence_positions.append((i1, i2))
            highlighted_changed_sentence_positions.append((j1, j2))
            
    original_sentence_highlighted = highlight_in_capital(
        sentence_tokenized=original_tokenized, 
        highlight_positions=highlighted_original_sentence_positions
    )
    
    changed_sentence_highlighted = highlight_in_capital(
        sentence_tokenized=changed_tokenized, 
        highlight_positions=highlighted_changed_sentence_positions
    )
    
    return original_sentence_highlighted, changed_sentence_highlighted
    
    
def calculate_change(original, changed):
    # Return the original and changed sentences with the changes highlighted in capital
    
    original_tokenized = nltk.word_tokenize(original)
    changed_tokenized = nltk.word_tokenize(changed)
    
    opcodes = SequenceMatcher(None, original_tokenized, changed_tokenized).get_opcodes()
    
    
    return original_tokenized, changed_tokenized, opcodes


In [67]:
def load_alignment(path_prefix):
    alignment_file_path = f"{path_prefix}_word_alignment.txt"
    if not os.path.isfile(alignment_file_path):
        raise RuntimeError("Alignment file not exist.")
        
    else:
        with open(alignment_file_path) as f:
            lines = [line.rstrip() for line in f]
            
        translation_alignment = []
        for line in lines:
            word_pairs = line.split()
            word_pairs = [word_pair.split('<sep>') for word_pair in word_pairs]
            translation_alignment.append(dict(word_pairs))
        return translation_alignment

In [68]:
def add_reason_of_change(alignment, changes, perturbed_src_word):
    if type(changes) != list:
        return pd.NA
    elif perturbed_src_word not in alignment.keys():
        changes[0]['change_type'] = None
        changes[1]['change_type'] = None
    elif alignment[perturbed_src_word] == changes[0]['changed_word'] and alignment[perturbed_src_word] == changes[1]['changed_word']:
        # Both changes are due to perturbation --> weird --> pass
        changes[0]['change_type'] = None
        changes[1]['change_type'] = None
    elif alignment[perturbed_src_word] != changes[0]['changed_word'] and alignment[perturbed_src_word] != changes[1]['changed_word']:
        # Both changes NOT due to perturbation --> weird --> pass
        changes[0]['change_type'] = None
        changes[1]['change_type'] = None
    elif alignment[perturbed_src_word] == changes[0]['changed_word']:
        changes[0]['change_type'] = "perturbed"
        changes[1]['change_type'] = "not_perturbed"
    elif alignment[perturbed_src_word] == changes[1]['changed_word']:
        changes[0]['change_type'] = "not_perturbed"
        changes[1]['change_type'] = "perturbed"
        
    return changes
        
        

In [69]:
def pos_tag_not_perturbed_change(changes, spacy_model):
    if type(changes) != list:
        return pd.NA
    elif changes[0]['change_type'] == "not_perturbed":
        doc = spacy_model(changes[0]['changed_word'])
        return [t.pos_ for t in doc][0]
    elif changes[1]['change_type'] == "not_perturbed":
        doc = spacy_model(changes[1]['changed_word'])
        return [t.pos_ for t in doc][0]
    return pd.NA

In [79]:
def read_output_df(dataset, perturb_type, beam, replacement_strategy, analyse_feature=True, 
                   ignore_case=False, no_of_replacements=1, chunk_max_length=1, spacy_model=None, 
                   w2v_model=None, use_alignment=False, winoMT=False, ref_available=False):
    if winoMT:
        path_prefix = "output/winoMT_asmetric/wmt19_winoMT_perturbed"
        output_df = pd.read_csv('output/winoMT_asmetric/wmt19_winoMT_perturbed_format.csv', index_col=0)  
    else:
        if no_of_replacements == 1:
            path_prefix = f"output/{dataset}/{replacement_strategy}/beam{beam}_perturb{perturb_type}/seed0/translations"
        else:
            path_prefix = f"output/{dataset}/{replacement_strategy}/beam{beam}_perturb{perturb_type}/seed0/translations_5replacements"

        output_df = pd.read_csv(f"{path_prefix}.csv", index_col=0)

        # Join to get the translation of the original sentences as well
        output_df = output_df.join(pd.read_csv(
            f"output/{dataset}/{replacement_strategy}/beam{beam}_perturbNone/seed0/translations.csv", index_col=0
        )['OriginalSRC-Trans'])
        
    
    # Convert columns with sentences to str type
    cols = ['SRC', 'REF', 'SRC_perturbed', 'SRC_perturbed-Trans', 'OriginalSRC-Trans']
    if not ref_available:
        cols.remove('REF')
    output_df[cols] = output_df[cols].astype(str)
    
    if ignore_case:
        output_df[cols] = output_df[cols].applymap(lambda x: x.lower())
    
    # Reorder the columns
    if winoMT:
        cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'OriginalSRC-Trans', 'SRC_perturbed-Trans', 'Bias_sample']
    elif no_of_replacements == 1:
        cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'OriginalSRC-Trans', 'SRC_perturbed-Trans']
    else:
        cols = ['SRC_index', 'SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'OriginalSRC-Trans', 'SRC_perturbed-Trans']
    if not ref_available:
        cols.remove('REF')
    output_df = output_df[cols]
    
    if analyse_feature:
        print(f"Original df shape: {output_df.shape}")
        output_df = output_df.dropna()
        print(f"After dropping none-perturbed sentences: {output_df.dropna().shape}")
        
        
        # Calculate the changes, i.e., how to get from the original trans sentence 
        # to the changed trans sentence
        output_df['tokenized_OriginalSRC-Trans'], output_df['tokenized_SRC_perturbed-Trans'], output_df['opcodes'] \
            = zip(*output_df.apply(
                lambda x: calculate_change(x['OriginalSRC-Trans'], 
                                           x['SRC_perturbed-Trans']), axis=1
            ))
        
        
        # Highlight the changes in the trans sentences
        output_df["OriginalSRC-Trans"], output_df['SRC_perturbed-Trans'] \
            = zip(*output_df.apply(
                lambda x: highlight_changes(
                    x['tokenized_OriginalSRC-Trans'], 
                    x['tokenized_SRC_perturbed-Trans'], 
                    x['opcodes']), axis=1
            ))
        
        
        
        
        if replacement_strategy == 'word2vec_similarity':
            # SRC difference is the number of occurances of the word we perturb
            output_df["SRC-edit_distance"] = output_df.apply(lambda x: x['tokenized_OriginalSRC-Trans'].count(x['original_word']), axis=1)
        else:
            output_df["SRC-edit_distance"] = 1
        output_df['Trans-edit_distance'] =  output_df.apply(
            lambda x: levenshtein(x['tokenized_OriginalSRC-Trans'], x['tokenized_SRC_perturbed-Trans']), axis=1)
        output_df["#TransChanges-#SrcChanges"] = output_df['Trans-edit_distance'] - output_df['SRC-edit_distance']
        
        output_df["#TransChanges-#SrcChanges/SentenceLength"] = (output_df['Trans-edit_distance'] - output_df['SRC-edit_distance']) / output_df['SRC'].apply(lambda x: len(nltk.word_tokenize(x)))
        
        output_df["ChangesSpread/SentenceLength"] = output_df.apply(
            lambda x: changes_spread(x['tokenized_OriginalSRC-Trans'], 
                                     x['tokenized_SRC_perturbed-Trans'], 
                                     x['opcodes']), axis=1)
        
        
        
        # See if only two chunks within given max size are changed, 
        # and do some analysis on this special case
        output_df['TwoChunksChanged'], output_df['ChunkDistance'], \
        output_df["is_same_subtree"], output_df['changes_similarity'] \
            = zip(*output_df.apply(
                lambda x: two_chunk_changed(x['tokenized_OriginalSRC-Trans'],
                                            x['tokenized_SRC_perturbed-Trans'],
                                            x['opcodes'],
                                            chunk_max_length=chunk_max_length,
                                            spacy_model=spacy_model,
                                            w2v_model=w2v_model), axis=1
            ))

        
        
        if use_alignment:
            # In the case where two changes occurs and the two similarities is calculated, 
            # find out which change is due to the perturbation
            output_df['perturbed_trans_alignment'] = load_alignment(path_prefix)
            output_df['changes_similarity'] = output_df.apply(
                lambda x: add_reason_of_change(
                    alignment=x['perturbed_trans_alignment'],
                    changes=x['changes_similarity'],
                    perturbed_src_word=x['perturbed_word']
                ),
                axis=1
            )
            
            if spacy_model is not None:
                # Add POS tagging of the not-perturbed change
                output_df['not_perturbed_TGT_change_type'] = output_df['changes_similarity'].apply(
                    lambda x: pos_tag_not_perturbed_change(x, spacy_model))
            
        
        # Analyse on group of changes on the same sentence
        if no_of_replacements > 1:
            additional_col_1 = output_df.groupby(by="SRC_index", axis=0)[['Trans-edit_distance', '#TransChanges-#SrcChanges']].std()
            additional_col_2 = output_df.groupby(by="SRC_index", axis=0)[['TwoChunksChanged']].sum()
            
            output_df = output_df.join(additional_col_1, rsuffix='--SD')
            output_df = output_df.join(additional_col_2, rsuffix='--total')
        
    return output_df

    


In [81]:
perturb_type = 'content'
dataset = f'masked_{perturb_type}_covost2_for_en2de'  # 'MuST-SHE-en2fr' 'IWSLT15-en2vi' 'wmt19-newstest2019-en2de'
beam = 5
replacement_strategy = 'masking_language_model'
no_of_replacements = 5
ignore_case = False  # Only Europarls needs ignore case
chunk_max_length=1
spacy_model = spacy.load("de_core_news_sm")
# Loading these models in is time consuming
# de_model = load_facebook_model("data/cc.de.300.bin").wv
# vi_model = load_facebook_model("data/cc.vi.300.bin").wv
winoMT = False

# # This overwrite the above params
# winoMT = True
# perturb_type = 'pronoun'
# no_of_replacements = 1

output = read_output_df(dataset=dataset, perturb_type=perturb_type, beam=beam, 
                        replacement_strategy=replacement_strategy, ignore_case=ignore_case,
                        no_of_replacements=no_of_replacements, chunk_max_length=chunk_max_length,
                        spacy_model=spacy_model, w2v_model=de_model, use_alignment=True, 
                        winoMT=winoMT, analyse_feature=False)

# print('BLEU score: ')
# sacrebleu.corpus_bleu(output['OriginalSRC-Trans'].tolist(), [output['REF'].tolist()]).score

# Comments

- On `wmt19-newstest2019-en2de, chunk_max_length=2`
    - 902: change to 1 SRC word leads to fixed changes of an irrelevant word
    - In many cases, the form of the verb (e.g., current or past tense) are changed --> harmful in the sense that it hurt performance score?
    - Word not being translated 
    - Spoken/written style
    - Time
    
    
- On `IWSLT15-en2vi, adjective`
    - 1003: change of 1 words consistently leads to change in subject
    
    - 1003, 145, 990 noun: same
    - 236 noun: same, funny but not sure if it is wrong
    - 308 verb same 
    
--> Quantify the verb form change by stemming/lemmatization
    
Chúng, họ, gã, cô ấy, cô ta, anh ta, hắn

Changes in the word "you"


In [63]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 9999999)
# output[output['#TransChanges-#SrcChanges'] > 10].head(5)
# output[output["ChangesSpread/SentenceLength"] > 0.85].head(20)



# Two chunks changed that consistently changed over the different replacement of a word


# output[(output["TwoChunksChanged"] == True) & (output["TwoChunksChanged--total"] == 5)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(1)
# output[(output["TwoChunksChanged"] == True)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(100)

# Two words changed that are not in the same subtree
# output[(output["TwoChunksChanged"] == True) & (output["is_same_subtree"] == False) & (output["TwoChunksChanged--total"] == 5)]




# IWSLT15-en2vi, noun
# output.loc[[1003, 145, 990, 236]]







Sort the samples by the least similarity in changed words

In [64]:
# Filter out the 2-word-changed cases and similarity can be calculated
def get_not_perturbed_change_similarity(changes):
    for change in changes:
        if change['change_type'] == 'not_perturbed':
            return change['semantic_similarity']
    return pd.NA

analyse_df = output[
    (output["TwoChunksChanged"] == True) & output['changes_similarity'].notna() & output['not_perturbed_TGT_change_type'].isin(['NOUN', 'VERB', 'ADJ', 'PRON'])
]
analyse_df['similarity_not_perturbed'] = analyse_df['changes_similarity'].apply(
    lambda x: get_not_perturbed_change_similarity(x)
)
analyse_df.sort_values(by='similarity_not_perturbed')[['SRC', 
                                                f'original_word', 
                                                f'perturbed_word',
                                                'OriginalSRC-Trans',
                                                f'SRC_perturbed-Trans',
                                                'ChunkDistance',
                                                'changes_similarity',
                                                'similarity_not_perturbed',
                                                'not_perturbed_TGT_change_type',
#                                                 'Bias_sample'
                                                      ]].head(50)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analyse_df['similarity_not_perturbed'] = analyse_df['changes_similarity'].apply(


Unnamed: 0,SRC,original_word,perturbed_word,OriginalSRC-Trans,SRC_perturbed-Trans,ChunkDistance,changes_similarity,similarity_not_perturbed,not_perturbed_TGT_change_type
219367,There is neither customs nor police.,police,religion,Es gibt weder ZOLL noch POLIZEI .,Es gibt weder BRÄUCHE noch RELIGION .,1.0,"[{'original_word': 'Zoll', 'changed_word': 'Bräuche', 'semantic_similarity': 0.035544515, 'change_type': 'not_perturbed'}, {'original_word': 'Polizei', 'changed_word': 'Religion', 'semantic_similarity': 0.29982835, 'change_type': 'perturbed'}]",0.035545,NOUN
141244,"He has wanted to become an accountant since he was a toddler, for he has always loved numbers.",accountant,astronaut,"BUCHHALTER wollte ER werden , seit er ein Kleinkind war , denn Zahlen hat er schon immer geliebt .","ER wollte ASTRONAUT werden , seit er ein Kleinkind war , denn Zahlen hat er schon immer geliebt .",1.0,"[{'original_word': 'Buchhalter', 'changed_word': 'Er', 'semantic_similarity': 0.04772758, 'change_type': 'not_perturbed'}, {'original_word': 'er', 'changed_word': 'Astronaut', 'semantic_similarity': 0.10002759, 'change_type': 'perturbed'}]",0.047728,PRON
219367,There is neither customs nor police.,police,language,Es gibt weder ZOLL noch POLIZEI .,Es gibt weder SITTEN noch SPRACHE .,1.0,"[{'original_word': 'Zoll', 'changed_word': 'Sitten', 'semantic_similarity': 0.089041315, 'change_type': 'not_perturbed'}, {'original_word': 'Polizei', 'changed_word': 'Sprache', 'semantic_similarity': 0.19356343, 'change_type': 'perturbed'}]",0.089041,NOUN
219367,There is neither customs nor police.,police,customs,Es gibt weder ZOLL noch POLIZEI .,Es gibt weder SITTEN noch GEBRÄUCHE .,1.0,"[{'original_word': 'Zoll', 'changed_word': 'Sitten', 'semantic_similarity': 0.089041315, 'change_type': 'not_perturbed'}, {'original_word': 'Polizei', 'changed_word': 'Gebräuche', 'semantic_similarity': 0.17014045, 'change_type': 'perturbed'}]",0.089041,NOUN
219367,There is neither customs nor police.,police,custom,Es gibt weder ZOLL noch POLIZEI .,Es gibt weder SITTEN noch GEBRÄUCHE .,1.0,"[{'original_word': 'Zoll', 'changed_word': 'Sitten', 'semantic_similarity': 0.089041315, 'change_type': 'not_perturbed'}, {'original_word': 'Polizei', 'changed_word': 'Gebräuche', 'semantic_similarity': 0.17014045, 'change_type': 'perturbed'}]",0.089041,NOUN
39174,“Do some of these fingerprints still exist?” Cyrus Smith asked.,smith,had,"`` Gibt es noch einige dieser Fingerabdrücke ? `` , FRAGTE Cyrus SMITH .","`` Gibt es noch einige dieser Fingerabdrücke ? `` , HATTE Cyrus GEFRAGT .",1.0,"[{'original_word': 'fragte', 'changed_word': 'hatte', 'semantic_similarity': 0.44320464, 'change_type': 'perturbed'}, {'original_word': 'Smith', 'changed_word': 'gefragt', 'semantic_similarity': 0.11342771, 'change_type': 'not_perturbed'}]",0.113428,VERB
32968,"“Is this man your servant?” Added the policeman, pointing at Passepartout.",policeman,man,"`` Ist dieser Mann dein Diener ? `` , fügte der POLIZIST hinzu und zeigte auf den MANN .","`` Ist dieser Mann dein Diener ? `` , fügte der MANN hinzu und zeigte auf den PASSEPARTOUT .",5.0,"[{'original_word': 'Polizist', 'changed_word': 'Mann', 'semantic_similarity': 0.5838175, 'change_type': 'perturbed'}, {'original_word': 'Mann', 'changed_word': 'Passepartout', 'semantic_similarity': 0.16116506, 'change_type': 'not_perturbed'}]",0.161165,NOUN
156736,He is different from the mayor of Lice.,mayor,King,Er ist anders als der BÜRGERMEISTER von LICE .,Er ist anders als der KÖNIG von LÄUSEN .,1.0,"[{'original_word': 'Bürgermeister', 'changed_word': 'König', 'semantic_similarity': 0.41626397, 'change_type': 'perturbed'}, {'original_word': 'Lice', 'changed_word': 'Läusen', 'semantic_similarity': 0.18970576, 'change_type': 'not_perturbed'}]",0.189706,NOUN
156736,He is different from the mayor of Lice.,mayor,Prince,Er ist anders als der BÜRGERMEISTER von LICE .,Er ist anders als der PRINZ von LÄUSEN .,1.0,"[{'original_word': 'Bürgermeister', 'changed_word': 'Prinz', 'semantic_similarity': 0.35455632, 'change_type': 'perturbed'}, {'original_word': 'Lice', 'changed_word': 'Läusen', 'semantic_similarity': 0.18970576, 'change_type': 'not_perturbed'}]",0.189706,NOUN
48720,The wretch is a marvelous actress.,actress,species,Der BÖSEWICHT ist eine wunderbare SCHAUSPIELERIN .,Der WRACK ist eine wunderbare SPEZIES .,3.0,"[{'original_word': 'Bösewicht', 'changed_word': 'Wrack', 'semantic_similarity': 0.19097787, 'change_type': 'not_perturbed'}, {'original_word': 'Schauspielerin', 'changed_word': 'Spezies', 'semantic_similarity': 0.15966544, 'change_type': 'perturbed'}]",0.190978,NOUN


### Calculate metrics for detecting the bias samples

High precision --> higher chance that the returned samples are bias --> save human time

High recall --> more bias samples are retreat --> can detect more type of bias

We focus on precision then (save human cost)

In [54]:
from sklearn.metrics import classification_report

print(' -------------------- Most-changes filter -------------------- ')
q = 20  # Take the q% sentences with the highest changes
no_changes_thresthold = np.percentile(output['#TransChanges-#SrcChanges'], 100-q)
bias_prediction = output['#TransChanges-#SrcChanges'] > no_changes_thresthold
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)

print(' -------------------- Most-spreaded_changes filter -------------------- ')
q = 20  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(output['ChangesSpread/SentenceLength'], 100-q)
bias_prediction = output['ChangesSpread/SentenceLength'] > spread_thresthold
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)

print(' -------------------- Two-changes filter -------------------- ')
bias_prediction = output["TwoChunksChanged"]
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)


print(' -------------------- Two-faraway-changes filter -------------------- ')
q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.nanpercentile(output['ChunkDistance'], 100-q)
bias_prediction = output["TwoChunksChanged"] & (output['ChunkDistance'] > distance_thresthold)
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)

print(' -------------------- Two-changes-different-subtree filter -------------------- ')
bias_prediction = output["TwoChunksChanged"] & (output["is_same_subtree"] == False)
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)


print(' -------------------- Two-change-dissimilar filter -------------------- ')
q = 90  # Take the q% sentences with the lowest similarity of the not-perturbed change
output = output.join(analyse_df['similarity_not_perturbed'])
similiarity_threshold = np.nanpercentile(output['similarity_not_perturbed'], q)

bias_prediction = output["TwoChunksChanged"] & (output['similarity_not_perturbed'] < similiarity_threshold)
results = classification_report(
    y_true=output['Bias_sample'], y_pred=bias_prediction, 
)
print(results)

 -------------------- Most-changes filter -------------------- 
              precision    recall  f1-score   support

       False       0.56      0.87      0.68       898
        True       0.34      0.09      0.14       683

    accuracy                           0.53      1581
   macro avg       0.45      0.48      0.41      1581
weighted avg       0.46      0.53      0.45      1581

 -------------------- Most-spreaded_changes filter -------------------- 
              precision    recall  f1-score   support

       False       0.53      0.75      0.63       898
        True       0.30      0.14      0.19       683

    accuracy                           0.49      1581
   macro avg       0.42      0.45      0.41      1581
weighted avg       0.43      0.49      0.44      1581

 -------------------- Two-changes filter -------------------- 
              precision    recall  f1-score   support

       False       0.49      0.43      0.46       898
        True       0.36      0.41    

### Analyse on same original_word accross sentences

In [70]:
output[[
    'SRC_index', 'SRC', 'original_word', 'perturbed_word', 'SRC_perturbed',
    'OriginalSRC-Trans', 'SRC_perturbed-Trans', '#TransChanges-#SrcChanges',
    '#TransChanges-#SrcChanges/SentenceLength',
    'ChangesSpread/SentenceLength', 'TwoChunksChanged', 'ChunkDistance',
    'is_same_subtree', 'changes_similarity', 'perturbed_trans_alignment',
    'not_perturbed_TGT_change_type', 'Trans-edit_distance--SD',
    '#TransChanges-#SrcChanges--SD', 'TwoChunksChanged--total'
]].groupby('original_word').mean().head()




Unnamed: 0_level_0,SRC_index,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,#TransChanges-#SrcChanges/SentenceLength,ChangesSpread/SentenceLength,TwoChunksChanged,ChunkDistance,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total
original_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
accountant,144062.85,1.0,2.66,1.66,0.109156,0.283355,0.01,1.0,1.261833,1.261833,0.05
actor,125710.946746,1.0,2.052071,1.052071,0.089356,0.228125,0.050888,2.488372,0.735556,0.735556,0.254438
actress,118696.210526,1.0,2.115789,1.115789,0.093629,0.23479,0.084211,2.839286,0.884848,0.884848,0.421053
advisor,133874.583333,1.0,1.894444,0.894444,0.0737,0.230821,0.022222,3.25,0.991124,0.991124,0.111111
aide,104538.0,1.0,3.533333,2.533333,0.203435,0.336605,0.1,3.666667,1.337144,1.337144,0.5


### Most changes filter:

In [87]:
groupped_by_word = output.groupby('original_word').mean()

q = 10  # Take the q% groups with the highest changes
no_changes_thresthold = np.percentile(groupped_by_word['#TransChanges-#SrcChanges'], 100-q)
bias_prediction = groupped_by_word['#TransChanges-#SrcChanges'] > no_changes_thresthold

bias_word_predicted = groupped_by_word[bias_prediction].index.values

output[
    output['original_word'].isin(bias_word_predicted) & \
    (output['#TransChanges-#SrcChanges'] > no_changes_thresthold)
].head(2)





Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,...,ChangesSpread/SentenceLength,TwoChunksChanged,ChunkDistance,is_same_subtree,changes_similarity,perturbed_trans_alignment,not_perturbed_TGT_change_type,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total
1876,1876,He was the first Australian and Oceanian referee at the World Cup.,referee,participant,He was the first Australian and Oceanian participant at the World Cup.,Er war der erste australische und ozeanische Schiedsrichter bei einer Weltmeisterschaft.,Er war der erste australische und ozeanische WM-Teilnehmer.,1,4,3,...,0.111111,False,,,,"{'He': 'Er', 'was': 'war', 'the': 'der', 'first': 'erste', 'Australian': 'australische', 'and': 'und', 'Oceanian': 'ozeanische', 'participant': 'WM-Teilnehmer', 'World': 'WM-Teilnehmer', '.': '.'}",,1.516575,1.516575,0
1876,1876,He was the first Australian and Oceanian referee at the World Cup.,referee,winner,He was the first Australian and Oceanian winner at the World Cup.,Er war der erste australische und ozeanische Schiedsrichter bei einer Weltmeisterschaft.,Er war der erste australische und ozeanische Weltcupsieger.,1,4,3,...,0.111111,False,,,,"{'He': 'Er', 'was': 'war', 'the': 'der', 'first': 'erste', 'Australian': 'australische', 'and': 'und', 'Oceanian': 'ozeanische', 'winner': 'Weltcupsieger', 'Cup': 'Weltcupsieger', '.': '.'}",,1.516575,1.516575,0


### Most-spreaded_changes filter

In [88]:
groupped_by_word = output.groupby('original_word').mean()

q = 10  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(groupped_by_word['ChangesSpread/SentenceLength'], 100-q)
bias_prediction = groupped_by_word['ChangesSpread/SentenceLength'] > spread_thresthold

bias_word_predicted = groupped_by_word[bias_prediction].index.values

output[
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChangesSpread/SentenceLength'] > spread_thresthold)
].head(2)


Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,...,ChangesSpread/SentenceLength,TwoChunksChanged,ChunkDistance,is_same_subtree,changes_similarity,perturbed_trans_alignment,not_perturbed_TGT_change_type,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total
97,97,This was unacceptable to the firm believers in the co-operative economic model.,model,sector,This was unacceptable to the firm believers in the co - operative economic sector.,Dies war für die fest an das genossenschaftliche Wirtschaftsmodell glaubenden Menschen inakzeptabel.,Dies war für die festen Gläubigen des genossenschaftlichen Wirtschaftssektors inakzeptabel.,1,7,6,...,0.454545,False,,,,"{'This': 'Dies', 'was': 'war', 'unacceptable': 'inakzeptabel', 'to': 'für', 'the': 'des', 'believers': 'Gläubigen', 'co': 'genossenschaftlichen', 'operative': 'genossenschaftlichen', 'economic': 'Wirtschaftssektors', 'sector': 'Wirtschaftssektors', '.': '.'}",,1.788854,1.788854,0
97,97,This was unacceptable to the firm believers in the co-operative economic model.,model,system,This was unacceptable to the firm believers in the co - operative economic system.,Dies war für die fest an das genossenschaftliche Wirtschaftsmodell glaubenden Menschen inakzeptabel.,Dies war für die festen Gläubigen des kooperativen Wirtschaftssystems inakzeptabel.,1,7,6,...,0.454545,False,,,,"{'This': 'Dies', 'was': 'war', 'unacceptable': 'inakzeptabel', 'to': 'für', 'the': 'des', 'believers': 'Gläubigen', 'co': 'kooperativen', 'operative': 'kooperativen', 'economic': 'Wirtschaftssystems', 'system': 'Wirtschaftssystems', '.': '.'}",,1.788854,1.788854,0


### Two-faraway-changes filter

ACTUALLY two-changes is not a bias filter. It's just an auxilary filter to avoid paraphrasing cases. Using this we will miss out on the cases where the model has both paraphrasing and 

Here we consider in each group: the number of sentences that has 2 changes

In [101]:
two_change_only_groupped_by_word = output[output["TwoChunksChanged"]].groupby('original_word').mean()


q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.percentile(two_change_only_groupped_by_word['ChunkDistance'], 100-q)
bias_prediction = two_change_only_groupped_by_word['ChunkDistance'] > distance_thresthold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChunkDistance'] > distance_thresthold)
].head(2)



Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,...,TwoChunksChanged,ChunkDistance,is_same_subtree,changes_similarity,perturbed_trans_alignment,not_perturbed_TGT_change_type,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total,similarity_not_perturbed
311,311,Some physicist then tried to stretch out this group.,physicist,men,Some men then tried to stretch out this group.,"Einige PHYSIKER versuchten dann , diese Gruppe AUSZUDEHNEN .","Einige MÄNNER versuchten dann , diese Gruppe AUSZUSTRECKEN .",1,2,1,...,True,5.0,False,"[{'original_word': 'Physiker', 'changed_word': 'Männer', 'semantic_similarity': 0.29928693, 'change_type': 'perturbed'}, {'original_word': 'auszudehnen', 'changed_word': 'auszustrecken', 'semantic_similarity': 0.52432793, 'change_type': 'not_perturbed'}]","{'Some': 'Einige', 'men': 'Männer', 'then': 'dann', 'tried': 'versuchten', 'to': ',', 'stretch': 'auszustrecken', 'out': 'auszustrecken', 'this': 'diese', 'group': 'Gruppe', '.': '.'}",VERB,1.0,1.0,1,0.524328
319,319,It is also a famous site for fisherman who are on foot or in a bark.,fisherman,people,It is also a famous site for people who are on foot or in a bark.,"Es ist auch ein berühmter Ort für FISCHER , die zu Fuß oder in DER Rinde sind .","Es ist auch ein berühmter Ort für MENSCHEN , die zu Fuß oder in EINER Rinde sind .",1,2,1,...,True,6.0,True,"[{'original_word': 'Fischer', 'changed_word': 'Menschen', 'semantic_similarity': 0.28160858, 'change_type': 'perturbed'}, {'original_word': 'der', 'changed_word': 'einer', 'semantic_similarity': 0.7654743, 'change_type': 'not_perturbed'}]","{'It': 'Es', 'is': 'ist', 'also': 'auch', 'a': 'einer', 'famous': 'berühmter', 'site': 'Ort', 'for': 'für', 'people': 'Menschen', 'who': 'die', 'are': 'sind', 'on': 'zu', 'foot': 'Fuß', 'or': 'oder', 'in': 'in', 'bark': 'Rinde', '.': '.'}",PRON,0.447214,0.447214,4,0.765474


### Two-changes-different-subtree filter

In [102]:
tmp = output[output["TwoChunksChanged"] & output['is_same_subtree'].notna()]
tmp['not_same_subtree'] = 1- tmp['is_same_subtree'].astype(int)
two_change_only_groupped_by_word = tmp.groupby('original_word').sum()

q = 20  # Take the q% groups with the highest number of different subtree changes
count_thresthold = np.percentile(two_change_only_groupped_by_word['not_same_subtree'], 100-q)
bias_prediction = two_change_only_groupped_by_word['ChunkDistance'] > count_thresthold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['is_same_subtree'] == 0)
].head(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['not_same_subtree'] = 1- tmp['is_same_subtree'].astype(int)


Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,...,TwoChunksChanged,ChunkDistance,is_same_subtree,changes_similarity,perturbed_trans_alignment,not_perturbed_TGT_change_type,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total,similarity_not_perturbed
311,311,Some physicist then tried to stretch out this group.,physicist,men,Some men then tried to stretch out this group.,"Einige PHYSIKER versuchten dann , diese Gruppe AUSZUDEHNEN .","Einige MÄNNER versuchten dann , diese Gruppe AUSZUSTRECKEN .",1,2,1,...,True,5.0,False,"[{'original_word': 'Physiker', 'changed_word': 'Männer', 'semantic_similarity': 0.29928693, 'change_type': 'perturbed'}, {'original_word': 'auszudehnen', 'changed_word': 'auszustrecken', 'semantic_similarity': 0.52432793, 'change_type': 'not_perturbed'}]","{'Some': 'Einige', 'men': 'Männer', 'then': 'dann', 'tried': 'versuchten', 'to': ',', 'stretch': 'auszustrecken', 'out': 'auszustrecken', 'this': 'diese', 'group': 'Gruppe', '.': '.'}",VERB,1.0,1.0,1,0.524328
319,319,It is also a famous site for fisherman who are on foot or in a bark.,fisherman,visitors,It is also a famous site for visitors who are on foot or in a bark.,"Es ist auch ein berühmter Ort für FISCHER , die zu Fuß oder in DER Rinde sind .","Es ist auch ein berühmter Ort für BESUCHER , die zu Fuß oder in EINER Rinde sind .",1,2,1,...,True,6.0,False,"[{'original_word': 'Fischer', 'changed_word': 'Besucher', 'semantic_similarity': 0.1882075, 'change_type': 'perturbed'}, {'original_word': 'der', 'changed_word': 'einer', 'semantic_similarity': 0.7654743, 'change_type': 'not_perturbed'}]","{'It': 'Es', 'is': 'ist', 'also': 'auch', 'a': 'einer', 'famous': 'berühmter', 'site': 'Ort', 'for': 'für', 'visitors': 'Besucher', 'who': 'die', 'are': 'sind', 'on': 'zu', 'foot': 'Fuß', 'or': 'oder', 'in': 'in', 'bark': 'Rinde', '.': '.'}",PRON,0.447214,0.447214,4,0.765474


### Two-change-dissimilar filter

In [104]:
output = output.join(analyse_df['similarity_not_perturbed'])
two_change_only_groupped_by_word = output[output["TwoChunksChanged"]].groupby('original_word').mean()


q = 20  # Take the q% sentences with the lowest similarity of the not-perturbed change
similiarity_threshold = np.nanpercentile(two_change_only_groupped_by_word['similarity_not_perturbed'], q)
bias_prediction = two_change_only_groupped_by_word['similarity_not_perturbed'] < similiarity_threshold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['similarity_not_perturbed'] < similiarity_threshold)
].head(2)






Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans,SRC-edit_distance,Trans-edit_distance,#TransChanges-#SrcChanges,...,TwoChunksChanged,ChunkDistance,is_same_subtree,changes_similarity,perturbed_trans_alignment,not_perturbed_TGT_change_type,Trans-edit_distance--SD,#TransChanges-#SrcChanges--SD,TwoChunksChanged--total,similarity_not_perturbed
2650,2650,These performances are enough for him to be elected Dutch athlete of the year.,athlete,artist,These performances are enough for him to be elected Dutch artist of the year.,"Diese LEISTUNGEN reichen ihm , um zum niederländischen SPORTLER des Jahres gewählt zu werden .","Diese AUFTRITTE reichen ihm , um zum niederländischen KÜNSTLER des Jahres gewählt zu werden .",1,2,1,...,True,6.0,False,"[{'original_word': 'Leistungen', 'changed_word': 'Auftritte', 'semantic_similarity': 0.4426266, 'change_type': 'not_perturbed'}, {'original_word': 'Sportler', 'changed_word': 'Künstler', 'semantic_similarity': 0.51654893, 'change_type': 'perturbed'}]","{'These': 'Diese', 'performances': 'Auftritte', 'are': 'reichen', 'him': 'ihm', 'to': 'zu', 'be': 'werden', 'elected': 'gewählt', 'Dutch': 'niederländischen', 'artist': 'Künstler', 'of': 'des', 'year': 'Jahres', '.': '.'}",NOUN,0.447214,0.447214,1,0.442627
4836,4836,It was named in honor of the American astronomer Annie Jump Cannon.,astronomer,author,It was named in honor of the American author Annie Jump Cannon.,SIE wurde zu Ehren der amerikanischen ASTRONOMIN Annie Jump Cannon benannt .,ES wurde zu Ehren der amerikanischen AUTORIN Annie Jump Cannon benannt .,1,2,1,...,True,5.0,False,"[{'original_word': 'Sie', 'changed_word': 'Es', 'semantic_similarity': 0.32995936, 'change_type': 'not_perturbed'}, {'original_word': 'Astronomin', 'changed_word': 'Autorin', 'semantic_similarity': 0.4326979, 'change_type': 'perturbed'}]","{'It': 'Es', 'was': 'wurde', 'named': 'benannt', 'in': 'zu', 'honor': 'Ehren', 'the': 'der', 'American': 'amerikanischen', 'author': 'Autorin', 'Annie': 'Annie', 'Jump': 'Jump', 'Cannon': 'Cannon', '.': '.'}",PRON,0.0,0.0,5,0.329959


## Find patterns

when a word A is replaced with B, then the change C happens

In [60]:
output.shape

(55775, 21)

In [61]:
output[output['TwoChunksChanged']].shape

(4219, 21)

In [None]:
# Statistical test to see if SRC_similarity is higher than Trans_similarity
print(output["Trans-edit_distance"].mean() - output["SRC-edit_distance"].mean())
stats.ttest_rel(output["SRC-edit_distance"], 
                output["Trans-edit_distance"], 
                alternative='less')

Tiny pvalue --> Indeed SRC-edit_distance is significantly lower than Trans-edit_distance


(Careful with this tho, bc with number of samples too large then statistical test does not make sense)

In [None]:
_ = plt.hist(output["#TransChanges-#SrcChanges"], bins=50)

In [None]:
print(output["ChangesSpread/SentenceLength"].describe())
output["ChangesSpread/SentenceLength"].plot.box()

Some changes seems to have the same meaning but different phrasing, e.g., noun index 24, 36, 47

Both for en-de and en-vi


Kind of bias: en-vi adjective sample 82

Should we cherry-pick examples? Or cherry-pick the replacement?


Or narrow down scope of perturbation? (e.g., on countries, jobs, gender, ...?)



Some cherry-picked examples anyway:

- He comes from England --> Ông ấy đến từ Anh
- He comes from Vietnam --> Hắn đến từ Việt Nam
- He comes from North Korea --> Hắn đến từ Bắc Triều Tiên



- He is european --> Hắn là người Châu Âu
- He is asian --> Anh ấy là người châu Á.



- He has black hair --> Hắn có tóc đen.
- He has blonde hair --> Anh ấy có tóc vàng


But if we limit this then it would hurt the model overal performance as well? 

*Jan: some kind of loss to minimize the number of changes, but not completely forbidden the changes*


# Translation quality vs #changes

In [None]:
from nltk.translate.gleu_score import sentence_gleu

output["OriginalTran_Quality"] = output.apply(
    lambda x: sentence_gleu([nltk.word_tokenize(x['REF'])], nltk.word_tokenize(x['OriginalSRC-Trans'])), axis=1)

In [None]:
output.plot.scatter(x='OriginalTran_Quality', y="#TransChanges-#SrcChanges/SentenceLength")

In [None]:
np.corrcoef(output['OriginalTran_Quality'], output["#TransChanges-#SrcChanges/SentenceLength"])

In [None]:
hist = plt.hist(output["OriginalTran_Quality"], bins='sturges')
bin_boundaries = hist[1]

In [None]:
# # Use bins with same number of samples instead of equal-sized bins

# results, bin_boundaries = pd.qcut(output["OriginalTran_Quality"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples
cut_point = 99999
for i, value in enumerate(hist[0]):
    if value < 5:
        cut_point = i
        break
        
bin_boundaries = bin_boundaries[:i]



In [None]:
bin_boundaries

X = output['OriginalTran_Quality']
Y = output["#TransChanges-#SrcChanges/SentenceLength"]

x_plot = [(bin_boundaries[i] + bin_boundaries[i+1])/2 for i in range(0, len(bin_boundaries)-1)]
y_plot = [stats.trim_mean(Y[(bin_boundaries[i] < X) & (X < bin_boundaries[i+1])], 0.1) for i in range(0, len(bin_boundaries)-1)]
plt.plot(x_plot, y_plot)
plt.xlabel('OriginalTrans_Quality')
plt.ylabel('Avg_changes')

Most of the time downward trend (not as clear for en-de with verb, adverb, pronoun; en-vi adverb, pronoun)

**Note**: the plot has outliers removed in both X and Y dimensions, by removing too small bins (X) and trimmed-mean (Y)

# #changes vs translation quality

In [None]:
hist = plt.hist(output["#TransChanges-#SrcChanges"], bins=20)
bin_boundaries = hist[1]
hist

In [None]:
# # Use bins with same number of samples instead of equal-sized bins
# results, bin_boundaries = pd.qcut(output["#TransChanges-#SrcChanges"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples
cut_point = 99999
for i, value in enumerate(hist[0]):
    if value < 10:
        cut_point = i
        break
        
bin_boundaries = bin_boundaries[:i]

In [None]:
bin_boundaries

X = output['#TransChanges-#SrcChanges']
Y = output["OriginalTran_Quality"]

x_plot = [(bin_boundaries[i] + bin_boundaries[i+1])/2 for i in range(0, len(bin_boundaries)-1)]

y_plot = [stats.trim_mean(Y[(bin_boundaries[i] <= X) & (X <= bin_boundaries[i+1])], 0.1) for i in range(0, len(bin_boundaries)-1)]
plt.plot(x_plot, y_plot)
plt.xlabel('Avg_changes')
plt.ylabel('OriginalTran_Quality')

# SentenceLength vs #changes

In [None]:
output['OriginalSRC-length'] = output.apply(
    lambda x: len(nltk.word_tokenize(x['SRC'])), axis=1
)

In [None]:
output.plot.scatter(x='OriginalSRC-length', y="#TransChanges-#SrcChanges")

In [None]:
np.corrcoef(output['OriginalSRC-length'], output["#TransChanges-#SrcChanges"])

In [None]:
hist = plt.hist(output["OriginalSRC-length"], bins=20)
bin_boundaries = hist[1]

In [None]:
# Remove bins with too few samples
cut_point = 99999
for i, value in enumerate(hist[0]):
    if value < 10:
        cut_point = i
        break
        
bin_boundaries = bin_boundaries[:i]

In [None]:
X = output['OriginalSRC-length']
Y = output["#TransChanges-#SrcChanges"]

x_plot = [(bin_boundaries[i] + bin_boundaries[i+1])/2 for i in range(0, len(bin_boundaries)-1)]
y_plot = [stats.trim_mean(Y[(bin_boundaries[i] < X) & (X < bin_boundaries[i+1])], 0.1) for i in range(0, len(bin_boundaries)-1)]
plt.plot(x_plot, y_plot)
plt.xlabel('OriginalSRC-length')
plt.ylabel('Avg_changes')

# Beam_size vs #changes

In [None]:
beam_dict = {}
beam_values = [1,2,3,4,5]
for beam in beam_values:
    beam_dict[beam] = read_output_df(dataset, perturb_type, beam, replacement_strategy)
    # Make sure the df all have the same index
    if beam > 1:
        assert beam_dict[beam].index.equals(beam_dict[beam].index)


In [None]:
plt.plot(beam_values,
              [stats.trim_mean(beam_dict[x]['#TransChanges-#SrcChanges'], 0.1) for x in beam_values])
plt.xlabel('beam')
plt.ylabel('mean_changes')

The mean might not saying anything

In [None]:
fig, ax = plt.subplots()
ax.boxplot([beam_dict[x]['#TransChanges-#SrcChanges'] for x in beam_values])
ax.set_xticklabels(beam_values)
ax.set_xlabel('beam')
ax.set_ylabel('#changes')

# Perturbed word type vs #changes

In [None]:
word_type_dict = {}
word_type_values = ["noun", "verb", "adjective", "adverb", "pronoun"]
for word_type in word_type_values:
    word_type_dict[word_type] = read_output_df(dataset, perturb_type=word_type, beam=beam, replacement_strategy=replacement_strategy)

    
print('--------------------------------')
print('word type    -   trimmed-mean #changes')

for word_type in word_type_values:
    print(f"{word_type} - {stats.trim_mean(word_type_dict[word_type]['#TransChanges-#SrcChanges'], 0.1)}")


In [None]:
fig, ax = plt.subplots()
ax.boxplot([word_type_dict[x]['#TransChanges-#SrcChanges'] for x in word_type_values])
ax.set_xticklabels(word_type_values)
ax.set_xlabel('word_type')
ax.set_ylabel('#changes')

# #Changes per sentence across word types

See if the chaos changes are sentence-specific. Excluding perturbing pronouns bc not many samples have pronoun

In [None]:
# Find sentences that has multiple word types perturbed
word_type_values = ["noun", "verb", "adjective", "adverb"]
index_intersection = word_type_dict[word_type_values[0]].index
for i in range(1, len(word_type_values)):
    index_intersection = \
        index_intersection.intersection(word_type_dict[word_type_values[i]].index)

len(index_intersection)

In [None]:
changes_per_word_type = pd.DataFrame()
for word_type in word_type_values:
    changes_per_word_type[word_type] = word_type_dict[word_type]["#TransChanges-#SrcChanges"].loc[index_intersection]
    
# Count the number of samples where the changes in trans always bigger than changes in SRC
changes_per_word_type[(changes_per_word_type['noun'] > 0) & (changes_per_word_type['verb'] > 0) & \
                      (changes_per_word_type['adjective'] > 0) & (changes_per_word_type['adverb'] > 0)]



Small portion of rows --> not sentence-specific

In [None]:
import spacy 
from spacy import displacy 
nlp = spacy.load("en_core_web_sm")
sentence = "He is from Vietnam"
doc = nlp(sentence)

print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

for token in doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))

In [None]:
for token in doc:
    print("------------------------------------------------")
    print(f"Token: {token.text}")
    print(f"Ancestors: {list(token.ancestors)}")
    print(f"Children: {list(token.children)}")

In [None]:
import spacy 
from spacy import displacy 
nlp = spacy.load("de_core_news_sm")
sentence = "Er kommt aus Vietnam"
doc = nlp(sentence)

print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

for token in doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))