In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.downloader as api
from gensim.models.fasttext import load_facebook_model
import random
from difflib import SequenceMatcher
from scipy import stats
import sacrebleu
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os
import spacy
import re
import edist.sed as sed
from sacremoses import MosesTokenizer, MosesDetokenizer


# Lift pandas' display limits so that full sentences, every row and every column
# are rendered when a DataFrame is displayed in this notebook.
# NOTE(review): with max_rows=None a bare display of a large frame renders all
# of it; the cells below rely on .head() to keep outputs small.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# nltk.download()

German fastText word-vector model from Facebook: https://fasttext.cc/docs/en/crawl-vectors.html (file `cc.de.300.bin`)

In [None]:
# Code taken from https://gitlab.ub.uni-bielefeld.de/bpaassen/python-edit-distances/-/blob/master/sed_demo.ipynb
def levenshtein(s1, s2):
    """
    Edit distance between two sequences.

    Thin wrapper that delegates to `edist.sed.standard_sed`; in this notebook it
    is called with two lists of tokens (original and perturbed translation).

    Params:
        s1: first sequence
        s2: second sequence
    Returns:
        whatever `sed.standard_sed(s1, s2)` returns
        (presumably the number of edits -- see the edist demo linked above)
    """
    return sed.standard_sed(s1, s2)

In [None]:
# https://docs.python.org/3/library/difflib.html
    
def changes_spread(original_tokenized, changed_tokenized, opcodes):
    """
    Width of the window (in original-sentence positions) that contains all changes.

    Params:
        original_tokenized: tokenized original sentence (not used in the computation)
        changed_tokenized: tokenized changed sentence (not used in the computation)
        opcodes: difflib-style (tag, i1, i2, j1, j2) tuples
    Returns:
        `i2` of the last non-'equal' opcode minus `i1` of the first non-'equal'
        opcode, floored at 0; 0 when nothing changed.
    """
    edited = [op for op in opcodes if op[0] != 'equal']
    if not edited:
        # No change at all: the spread is empty.
        return 0
    first_start = edited[0][1]
    last_end = edited[-1][2]
    return max(0, last_end - first_start)

    

In [None]:
def highlight_in_capital(sentence_tokenized, highlight_positions):
    """
    Upper-case selected token spans and return the sentence as one string.

    Params:
        sentence_tokenized: tokenized sentence (list of str)
        highlight_positions: list of (start, stop) tuples, in sentence order;
            tokens in sentence_tokenized[start:stop] are written in capitals
    Returns:
        the tokens joined by single spaces, with the requested spans upper-cased
    """
    pieces = []
    cursor = 0  # first token position not yet copied to `pieces`
    for start, stop in highlight_positions:
        untouched = sentence_tokenized[cursor:start]
        shouted = [token.upper() for token in sentence_tokenized[start:stop]]
        pieces.extend(untouched + shouted)
        cursor = stop
    # Copy whatever follows the last highlighted span (an empty slice is a no-op).
    pieces.extend(sentence_tokenized[cursor:])
    return ' '.join(pieces)

In [None]:
def two_chunk_changed(original_tokenized, changed_tokenized, opcodes, 
                      chunk_max_length=1, spacy_model=None, w2v_model=None):
    """
    Detect the "exactly two replaced chunks" pattern between two translations.

    Params:
        original_tokenized: tokens of the original translation
        changed_tokenized: tokens of the changed translation
        opcodes: difflib-style (tag, i1, i2, j1, j2) tuples leading from
            `original_tokenized` to `changed_tokenized`
        chunk_max_length: maximum length, measured on the original side (i2 - i1),
            allowed for each of the two replaced chunks
        spacy_model: optional callable mapping a sentence string to parsed tokens
            exposing `.text`, `.ancestors` and `.children`
        w2v_model: optional word-vector model exposing `key_to_index` and
            `similarity(word_a, word_b)`
    Returns:
        (is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity)
        - is_two_chunk_changed: True when the opcodes consist only of 'equal' and
          'replace' entries, with exactly two 'replace' entries whose original
          side is at most `chunk_max_length` tokens long
        - chunk_distance: number of original tokens in the 'equal' chunk between
          the two replacements (pd.NA when the pattern does not apply)
        - is_same_subtree: True/False when `spacy_model` is given and both
          replacements are single tokens on the changed side; pd.NA otherwise
        - changes_similarity: list of two dicts (original_word, changed_word,
          semantic_similarity) when `w2v_model` is given, both replacements are
          one-token-for-one-token and all four words are in the vocabulary;
          pd.NA otherwise
    """
    is_two_chunk_changed = False
    chunk_distance = pd.NA
    is_same_subtree = pd.NA
    changes_similarity = pd.NA

    changes_types = [o[0] for o in opcodes]

    # If not exactly two replacements (and nothing else but 'equal'), return
    only_replace_or_equal = all(
        changes_type == 'replace' or changes_type == 'equal' for changes_type in changes_types
    )
    if not (only_replace_or_equal and changes_types.count('replace') == 2):
        return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity

    # Find the positions of the two changed chunks
    i_replace = [i for i, change in enumerate(changes_types) if change == "replace"]
    first_op = opcodes[i_replace[0]]
    second_op = opcodes[i_replace[1]]

    # If either chunk is longer (on the original side) than chunk_max_length, return
    if not (first_op[2] - first_op[1] <= chunk_max_length and
            second_op[2] - second_op[1] <= chunk_max_length):
        return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity

    # At this point, this should be a valid two_chunk within length change
    is_two_chunk_changed = True

    # Check that there is indeed an equal chunk in between the two changed chunks;
    # the distance between the two chunks is the length of that equal chunk
    i_equal_in_between = (i_replace[1] + i_replace[0]) // 2
    assert opcodes[i_equal_in_between][0] == 'equal'
    chunk_distance = opcodes[i_equal_in_between][2] - opcodes[i_equal_in_between][1]

    # Both replacements produce exactly one token on the changed side
    single_changed_tokens = (first_op[4] - first_op[3] == 1 and
                             second_op[4] - second_op[3] == 1)

    if spacy_model is not None:
        # When only two words are changed compared to the original translation,
        # check if the two changed words are in the same sub tree of the dependency tree
        if single_changed_tokens:
            # Find the ancestors and children of the two changed words
            doc = spacy_model(' '.join(changed_tokenized))
            token1, token2 = None, None
            family1, family2 = None, None
            # NOTE(review): tokens are matched by text, not by position, so a
            # repeated word resolves to its last occurrence, and two identical
            # changed words leave token2 unset (result stays pd.NA).
            for token in doc:
                if token.text == changed_tokenized[first_op[3]]:
                    token1 = token.text
                    family1 = list(token.ancestors) + list(token.children)
                    family1 = [t.text for t in family1]
                elif token.text == changed_tokenized[second_op[3]]:
                    token2 = token.text
                    family2 = list(token.ancestors) + list(token.children)
                    family2 = [t.text for t in family2]

            if token1 is None or token2 is None:
                is_same_subtree = pd.NA
            else:
                is_same_subtree = bool(token1 in family2 or token2 in family1)

    # Calculate the semantic similarity of the two changed words (cosine similarity in [-1, 1])
    if w2v_model is not None:
        # Can only calculate when both replacements are one token for one token
        single_original_tokens = (first_op[2] - first_op[1] == 1 and
                                  second_op[2] - second_op[1] == 1)
        if single_changed_tokens and single_original_tokens:
            original_word_1 = original_tokenized[first_op[1]]
            changed_word_1 = changed_tokenized[first_op[3]]

            original_word_2 = original_tokenized[second_op[1]]
            changed_word_2 = changed_tokenized[second_op[3]]

            # `key_to_index` is a dict, so each membership test is a hash lookup
            # instead of a scan over the whole vocabulary list (`index_to_key`).
            vocabulary = w2v_model.key_to_index
            if all(word in vocabulary for word in (original_word_1, original_word_2,
                                                   changed_word_1, changed_word_2)):
                changes_similarity = [{'original_word': original_word_1, 
                                       'changed_word': changed_word_1, 
                                       'semantic_similarity': w2v_model.similarity(original_word_1, changed_word_1)},
                                      {'original_word': original_word_2,
                                       'changed_word': changed_word_2,
                                       'semantic_similarity': w2v_model.similarity(original_word_2, changed_word_2)}]

    return is_two_chunk_changed, chunk_distance, is_same_subtree, changes_similarity
    
    
def highlight_changes(original_tokenized, changed_tokenized, opcodes):
    """
    Write the changed parts of both sentences in capitals.

    Params:
        original_tokenized: tokenized original sentence
        changed_tokenized: tokenized changed sentence
        opcodes: changes to get from `original_tokenized` to `changed_tokenized`,
            as (tag, i1, i2, j1, j2) tuples
    Returns:
        (original_sentence, changed_sentence) as strings, where every span
        touched by a non-'equal' opcode is upper-cased
    """
    unpacked = [(op[0], op[1], op[2], op[3], op[4]) for op in opcodes]
    edited = [entry for entry in unpacked if entry[0] != 'equal']

    # (i1, i2) spans index the original sentence, (j1, j2) spans the changed one
    spans_in_original = [(i1, i2) for _, i1, i2, _, _ in edited]
    spans_in_changed = [(j1, j2) for _, _, _, j1, j2 in edited]

    original_highlighted = highlight_in_capital(
        sentence_tokenized=original_tokenized,
        highlight_positions=spans_in_original,
    )
    changed_highlighted = highlight_in_capital(
        sentence_tokenized=changed_tokenized,
        highlight_positions=spans_in_changed,
    )
    return original_highlighted, changed_highlighted
    
    
def calculate_change(original_tokenized, changed_tokenized):
    """
    Diff two tokenized sentences.

    Params:
        original_tokenized: tokenized original sentence
        changed_tokenized: tokenized changed sentence
    Returns:
        (opcodes, changes) where
        - opcodes: the `SequenceMatcher.get_opcodes()` result (token-index based)
        - changes: one (tag, original_text, changed_text) tuple per non-'equal'
          opcode, the texts being the affected tokens joined by spaces
    """
    matcher = SequenceMatcher(None, original_tokenized, changed_tokenized)
    opcodes = matcher.get_opcodes()

    # Convert the index-based opcodes into the words they affect
    changes = [
        (op[0],
         ' '.join(original_tokenized[op[1]:op[2]]),
         ' '.join(changed_tokenized[op[3]:op[4]]))
        for op in opcodes
        if op[0] != 'equal'
    ]
    return opcodes, changes


In [None]:
def load_alignment(path_prefix):
    """
    Read the word-alignment file that belongs to a translation output.

    The file `<path_prefix>_word_alignment.txt` holds one sentence per line; each
    line is a whitespace-separated list of `left<sep>right` pairs.

    Params:
        path_prefix: path of the translation output without suffix
    Returns:
        list with one dict per line, mapping the left word of every pair to the
        right word
    Raises:
        RuntimeError: when the alignment file does not exist
    """
    alignment_file_path = f"{path_prefix}_word_alignment.txt"
    if not os.path.isfile(alignment_file_path):
        raise RuntimeError("Alignment file not exist.")

    translation_alignment = []
    with open(alignment_file_path) as alignment_file:
        for raw_line in alignment_file:
            pairs = [pair.split('<sep>') for pair in raw_line.rstrip().split()]
            translation_alignment.append(dict(pairs))
    return translation_alignment

In [None]:
def add_reason_of_change(alignment, changes, perturbed_src_word):
    """
    Label which of two translation changes is aligned to the perturbed source word.

    Params:
        alignment: dict mapping source words to their aligned translation words
        changes: list of two dicts, each with a 'changed_word' key
            (anything that is not a list is treated as "no changes")
        perturbed_src_word: the word that replaced the original word in the source
    Returns:
        pd.NA when `changes` is not a list; otherwise the same `changes` list,
        modified in place so that each dict has a 'change_type' of "perturbed",
        "not_perturbed" or None. None is used when the perturbed word has no
        alignment, or when both / neither of the changes match its alignment.
    """
    if type(changes) != list:
        return pd.NA

    first, second = changes[0], changes[1]
    labels = (None, None)

    if perturbed_src_word in alignment.keys():
        aligned_word = alignment[perturbed_src_word]
        first_matches = aligned_word == first['changed_word']
        second_matches = aligned_word == second['changed_word']
        # Only an unambiguous match (exactly one of the two) gets labelled
        if first_matches and not second_matches:
            labels = ("perturbed", "not_perturbed")
        elif second_matches and not first_matches:
            labels = ("not_perturbed", "perturbed")

    first['change_type'], second['change_type'] = labels
    return changes
        
        

In [None]:
def pos_tag_not_perturbed_change(changes, spacy_model):
    """
    POS tag of the changed word that was labelled "not_perturbed".

    Params:
        changes: list of two dicts with 'change_type' and 'changed_word' keys
            (anything that is not a list is treated as "no changes")
        spacy_model: callable mapping a string to an iterable of tokens with `.pos_`
    Returns:
        the `.pos_` of the first token of the first change whose 'change_type'
        is "not_perturbed"; pd.NA when `changes` is not a list or no change
        carries that label
    """
    if type(changes) != list:
        return pd.NA

    for position in (0, 1):
        change = changes[position]
        if change['change_type'] == "not_perturbed":
            tags = [token.pos_ for token in spacy_model(change['changed_word'])]
            return tags[0]
    return pd.NA

In [None]:
def read_output_df(dataset, src_lang, tgt_lang, perturb_type, beam, replacement_strategy, analyse_feature=True, 
                   ignore_case=False, no_of_replacements=1, chunk_max_length=1, spacy_model=None, 
                   w2v_model=None, use_alignment=False, winoMT=False, ref_available=False,
                   two_chunks_analysis=False):
    """
    Load the translations of original/perturbed sources and derive change features.

    Params:
        dataset: dataset name, used to build the input paths
        src_lang, tgt_lang: language codes passed to the Moses tokenizers
        perturb_type, beam, replacement_strategy, no_of_replacements: used to
            build the path of the perturbed-translation output
        analyse_feature: when True, tokenize the sentences and add the change
            features (opcodes, highlighted sentences, edit distances, spread, ...)
        ignore_case: lower-case all sentence columns before analysis
        chunk_max_length, spacy_model, w2v_model: forwarded to `two_chunk_changed`
        use_alignment: also load the word alignments next to the translations
        winoMT: read the fixed winoMT output instead of the dataset-specific one
        ref_available: whether the output contains a 'REF' column
        two_chunks_analysis: add the columns produced by `two_chunk_changed`
    Returns:
        DataFrame with one row per perturbed sentence. With `analyse_feature`
        the 'SRC-Trans' and 'SRC_perturbed-Trans' columns are replaced by their
        highlighted (changes in capitals) versions.
    """
    if winoMT:
        path_prefix = "../output/winoMT_asmetric/wmt19_winoMT_perturbed"
        output_df = pd.read_csv('../output/winoMT_asmetric/wmt19_winoMT_perturbed_format.csv', index_col=0)  
    else:
        path_prefix = f"../output/{dataset}/{replacement_strategy}/beam{beam}_perturb{perturb_type}/{no_of_replacements}replacements/seed0/translations"
        output_df = pd.read_csv(f"{path_prefix}.csv", index_col=0)

        # Join to get the translation of the original sentences as well
        original_trans_path_prefix = \
            f"../output/{dataset}/original/translations"
        original_trans = pd.read_csv(
            f"{original_trans_path_prefix}.csv", index_col=0
        )['SRC-Trans']
        output_df = pd.merge(output_df, original_trans, how='left', left_on='SRC_original_idx',
                            right_on=original_trans.index)

    if 'mustSHE' in dataset:
        output_df = output_df.merge(pd.read_csv(
            f"../data/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv/MONOLINGUAL.fr_v1.2.tsv",
            sep='\t')[['ID', 'CATEGORY']],
            how='left', left_on='SRC_original_idx', right_on='ID'
        )

    # Convert columns with sentences to str type
    # NOTE(review): astype(str) turns missing sentences into the string 'nan', so
    # the dropna() below can only drop rows that are missing a non-sentence
    # column -- confirm this is intended.
    cols = ['SRC', 'REF', 'SRC_perturbed', 'SRC_perturbed-Trans', 'SRC-Trans']
    if not ref_available:
        cols.remove('REF')
    output_df[cols] = output_df[cols].astype(str)

    if ignore_case:
        # Column-wise .str.lower() instead of the deprecated DataFrame.applymap
        output_df[cols] = output_df[cols].apply(lambda col: col.str.lower())

    # Reorder the columns
    if winoMT:
        cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'SRC-Trans', 'SRC_perturbed-Trans', 'Bias_sample']
    elif no_of_replacements == 1:
        cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'SRC-Trans', 'SRC_perturbed-Trans', 'SRC_original_idx']
    else:
        cols = ['SRC_masked_index', 'SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed', 'SRC-Trans', 'SRC_perturbed-Trans', 'SRC_original_idx']
    if not ref_available:
        cols.remove('REF')
    if 'mustSHE' in dataset:
        cols.append('CATEGORY')
    # .copy() so that the column assignments below write to an independent frame
    output_df = output_df[cols].copy()

    if use_alignment:
        if not winoMT:
            original_alignment = load_alignment(original_trans_path_prefix)
            # NOTE(review): assumes every original sentence has exactly
            # `no_of_replacements` consecutive rows -- verify against the data.
            output_df['original_trans_alignment'] = [alignment for alignment in original_alignment for _ in range(no_of_replacements)]
        output_df['perturbed_trans_alignment'] = load_alignment(path_prefix)

    if analyse_feature:
        print(f"Original df shape: {output_df.shape}")
        output_df = output_df.dropna()
        print(f"After dropping none-perturbed sentences: {output_df.shape}")

        tgt_tokenizer = MosesTokenizer(lang=tgt_lang)
        src_tokenizer = MosesTokenizer(lang=src_lang)

        print("Tokenize everything ...")
        output_df['tokenized_SRC-Trans'] = output_df['SRC-Trans'].apply(
            lambda x: tgt_tokenizer.tokenize(x, escape=False, aggressive_dash_splits=False)
        )
        output_df['tokenized_SRC_perturbed-Trans'] = output_df['SRC_perturbed-Trans'].apply(
            lambda x: tgt_tokenizer.tokenize(x, escape=False, aggressive_dash_splits=False)
        )
        output_df['tokenized_SRC'] = output_df['SRC'].apply(
            lambda x: src_tokenizer.tokenize(x, escape=False, aggressive_dash_splits=False)
        )
        if 'REF' in output_df.columns:
            output_df['tokenized_REF'] = output_df['REF'].apply(
                lambda x: tgt_tokenizer.tokenize(x, escape=False, aggressive_dash_splits=False)
            )

        print('Calculating the changes between translations of original SRC and perturbed SRC ...')
        # Calculate the changes, i.e., how to get from the original trans sentence 
        # to the changed trans sentence
        output_df['opcodes'], output_df['changes'] \
            = zip(*output_df.apply(
                lambda x: calculate_change(x['tokenized_SRC-Trans'], 
                                           x['tokenized_SRC_perturbed-Trans']
                                          ),
                axis=1)
              )

        print('Highlighting the changes ...')
        # Highlight the changes in the trans sentences
        output_df["SRC-Trans"], output_df['SRC_perturbed-Trans'] \
            = zip(*output_df.apply(
                lambda x: highlight_changes(
                    x['tokenized_SRC-Trans'], 
                    x['tokenized_SRC_perturbed-Trans'], 
                    x['opcodes']), axis=1
            ))

        print('Calculating the edit distance ...')
        if replacement_strategy == 'word2vec_similarity':
            # SRC difference is the number of occurrences of the word we perturb.
            # The perturbed word is a source-side word, so it has to be counted in
            # the tokenized source sentence, not in the translation.
            output_df["SRC-edit_distance"] = output_df.apply(lambda x: x['tokenized_SRC'].count(x['original_word']), axis=1)
        else:
            output_df["SRC-edit_distance"] = 1
        output_df['Trans-edit_distance'] =  output_df.apply(
            lambda x: levenshtein(x['tokenized_SRC-Trans'], x['tokenized_SRC_perturbed-Trans']), axis=1)

#         output_df["#TransChanges-#SrcChanges"] = output_df['Trans-edit_distance'] - output_df['SRC-edit_distance']

        output_df["#TransChanges/SentenceLength"] = \
            output_df['Trans-edit_distance'] / output_df['tokenized_SRC-Trans'].apply(lambda x: len(x))

        output_df["ChangesSpread"] = output_df.apply(
            lambda x: changes_spread(x['tokenized_SRC-Trans'], 
                                     x['tokenized_SRC_perturbed-Trans'], 
                                     x['opcodes']), axis=1)

        output_df["ChangesSpread/SentenceLength"] = \
            output_df["ChangesSpread"] / output_df['tokenized_SRC-Trans'].apply(lambda x: len(x))

        if two_chunks_analysis:
            print("Two-chunks changed analysis")
            # See if only two chunks within given max size are changed, 
            # and do some analysis on this special case
            output_df['TwoChunksChanged'], output_df['ChunkDistance'], \
            output_df["is_same_subtree"], output_df['changes_similarity'] \
                = zip(*output_df.apply(
                    lambda x: two_chunk_changed(x['tokenized_SRC-Trans'],
                                                x['tokenized_SRC_perturbed-Trans'],
                                                x['opcodes'],
                                                chunk_max_length=chunk_max_length,
                                                spacy_model=spacy_model,
                                                w2v_model=w2v_model), axis=1
                ))

        print("Find out changes directly caused by perturbation using alignment")
        if use_alignment:
            if two_chunks_analysis:
                # In the case where two changes occurs and the two similarities is calculated, 
                # find out which change is due to the perturbation
                output_df['changes_similarity'] = output_df.apply(
                    lambda x: add_reason_of_change(
                        alignment=x['perturbed_trans_alignment'],
                        changes=x['changes_similarity'],
                        perturbed_src_word=x['perturbed_word']
                    ),
                    axis=1
                )

                if spacy_model is not None:
                    # Add POS tagging of the not-perturbed change
                    output_df['not_perturbed_TGT_change_type'] = output_df['changes_similarity'].apply(
                        lambda x: pos_tag_not_perturbed_change(x, spacy_model))

        print("Stats on some group changes")
        # Analyse on group of changes on the same sentence.
        # transform() broadcasts each group's statistic back onto the rows of that
        # group; joining the aggregated frame on the row index instead would pair
        # row label i with group key i.
        if no_of_replacements > 1:
            output_df['Trans-edit_distance--SD'] = \
                output_df.groupby('SRC_masked_index')['Trans-edit_distance'].transform('std')

            if two_chunks_analysis:
                output_df['TwoChunksChanged--total'] = \
                    output_df.groupby('SRC_masked_index')['TwoChunksChanged'].transform('sum')

    return output_df

    


In [None]:
# Experiment configuration: these values select which translation output
# read_output_df() loads (they are interpolated into its input paths).
perturb_type = 'MultiplePerSentence_allWords'
src_lang = 'en'
tgt_lang = 'de'
dataset = f'WMT21_DA_{src_lang}2{tgt_lang}'  # 'MuST-SHE-en2fr' 'IWSLT15-en2vi' 'wmt19-newstest2019-en2de'
beam = 5
replacement_strategy = 'masking_language_model'
no_of_replacements = 30
ignore_case = False  # Only Europarls needs ignore case
chunk_max_length=1
spacy_model = spacy.load("de_core_news_sm")
# Loading these models in is time consuming
# de_model = load_facebook_model("../data/cc.de.300.bin").wv
# vi_model = load_facebook_model("../data/cc.vi.300.bin").wv
winoMT = False

# # This overwrite the above params
# winoMT = True
# perturb_type = 'pronoun'
# no_of_replacements = 1

# NOTE(review): with two_chunks_analysis=False and use_alignment=False the
# columns 'TwoChunksChanged', 'ChunkDistance', 'is_same_subtree',
# 'changes_similarity' and 'not_perturbed_TGT_change_type' are not created, and
# 'Bias_sample' only exists when winoMT=True. The analysis cells further down
# read those columns, so they need this call to be made with the matching flags.
output = read_output_df(dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang, perturb_type=perturb_type, 
                        beam=beam, replacement_strategy=replacement_strategy, ignore_case=ignore_case,
                        no_of_replacements=no_of_replacements, chunk_max_length=chunk_max_length,
                        spacy_model=spacy_model, w2v_model=None, use_alignment=False, 
                        winoMT=winoMT, analyse_feature=True, two_chunks_analysis=False)

# print('BLEU score: ')
# sacrebleu.corpus_bleu(output['SRC-Trans'].tolist(), [output['REF'].tolist()]).score

In [None]:
# output.to_pickle(f'tmp_storages/analyse_{dataset}_{perturb_type}.pkl')

# Comments

- On `wmt19-newstest2019-en2de, chunk_max_length=2`
    - 902: change to 1 SRC word leads to fixed changes of an irrelevant word
    - In many cases, the form of the verb (e.g., present or past tense) is changed --> harmful in the sense that it hurts the performance score?
    - Word not being translated 
    - Spoken/written style
    - Time
    
    
- On `IWSLT15-en2vi, adjective`
    - 1003: a change of 1 word consistently leads to a change in the subject
    
    - 1003, 145, 990 noun: same
    - 236 noun: same, funny but not sure if it is wrong
    - 308 verb same 
    
--> Quantify the verb form change by stemming/lemmatization
    
Chúng, họ, gã, cô ấy, cô ta, anh ta, hắn

Changes in the word "you"


In [None]:
# output[output['#TransChanges-#SrcChanges'] > 10].head(5)
# output[output["ChangesSpread/SentenceLength"] > 0.85].head(20)



# Two chunks changed that consistently changed over the different replacement of a word


# output[(output["TwoChunksChanged"] == True) & (output["TwoChunksChanged--total"] == 5)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(1)
# output[(output["TwoChunksChanged"] == True)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(100)

# Two words changed that are not in the same subtree
# output[(output["TwoChunksChanged"] == True) & (output["is_same_subtree"] == False) & (output["TwoChunksChanged--total"] == 5)]




# IWSLT15-en2vi, noun
# output.loc[[1003, 145, 990, 236]]







Sort the samples by the least similarity in changed words

In [None]:
# Filter out the 2-word-changed cases and similarity can be calculated
def get_not_perturbed_change_similarity(changes):
    """
    Semantic similarity of the change that was not caused by the perturbation.

    Params:
        changes: iterable of dicts with 'change_type' and 'semantic_similarity' keys
    Returns:
        the 'semantic_similarity' of the first change whose 'change_type' is
        'not_perturbed', or pd.NA when there is none
    """
    not_perturbed_similarities = (
        change['semantic_similarity']
        for change in changes
        if change['change_type'] == 'not_perturbed'
    )
    return next(not_perturbed_similarities, pd.NA)

# Keep the rows where exactly two words changed, a similarity could be computed
# and the not-perturbed change is a content word or pronoun.
# .copy() makes analyse_df an independent frame, so adding a column below does
# not write into a slice of `output` (SettingWithCopyWarning).
analyse_df = output[
    (output["TwoChunksChanged"] == True) & output['changes_similarity'].notna() & output['not_perturbed_TGT_change_type'].isin(['NOUN', 'VERB', 'ADJ', 'PRON'])
].copy()
analyse_df['similarity_not_perturbed'] = analyse_df['changes_similarity'].apply(
    get_not_perturbed_change_similarity
)
# Show the samples whose not-perturbed change is least similar to the original word
analyse_df.sort_values(by='similarity_not_perturbed')[['SRC', 
                                                f'original_word', 
                                                f'perturbed_word',
                                                'SRC-Trans',
                                                f'SRC_perturbed-Trans',
                                                'ChunkDistance',
                                                'changes_similarity',
                                                'similarity_not_perturbed',
                                                'not_perturbed_TGT_change_type',
#                                                 'Bias_sample'
                                                      ]].head(50)


### Calculate metrics for detecting the bias samples

High precision --> higher chance that the returned samples are biased --> saves human time

High recall --> more biased samples are retrieved --> can detect more types of bias

We focus on precision then (save human cost)

In [None]:
from sklearn.metrics import classification_report


def print_filter_report(title, bias_prediction):
    """Print the classification report of one bias filter against `output['Bias_sample']`."""
    print(f' -------------------- {title} -------------------- ')
    print(classification_report(y_true=output['Bias_sample'], y_pred=bias_prediction))


# '#TransChanges-#SrcChanges' is not created by read_output_df (the assignment is
# commented out there), so derive it from the two edit distances when missing.
if '#TransChanges-#SrcChanges' in output.columns:
    extra_changes = output['#TransChanges-#SrcChanges']
else:
    extra_changes = output['Trans-edit_distance'] - output['SRC-edit_distance']

# Columns holding pd.NA are object-typed; convert them to float (NA -> NaN) so
# that the nan-aware percentiles and the comparisons below are well defined.
chunk_distance = pd.to_numeric(output['ChunkDistance'], errors='coerce')

q = 20  # Take the q% sentences with the highest changes
no_changes_thresthold = np.percentile(extra_changes, 100-q)
print_filter_report('Most-changes filter', extra_changes > no_changes_thresthold)

q = 20  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(output['ChangesSpread/SentenceLength'], 100-q)
print_filter_report('Most-spreaded_changes filter',
                    output['ChangesSpread/SentenceLength'] > spread_thresthold)

print_filter_report('Two-changes filter', output["TwoChunksChanged"])

q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.nanpercentile(chunk_distance, 100-q)
print_filter_report('Two-faraway-changes filter',
                    output["TwoChunksChanged"] & (chunk_distance > distance_thresthold))

print_filter_report('Two-changes-different-subtree filter',
                    output["TwoChunksChanged"] & (output["is_same_subtree"] == False))

q = 90  # Take the q% sentences with the lowest similarity of the not-perturbed change
# Join only once: joining the same column again raises on overlapping names,
# which made this cell (and the later per-word cell) fail on re-execution.
if 'similarity_not_perturbed' not in output.columns:
    output = output.join(analyse_df['similarity_not_perturbed'])
similarity_not_perturbed = pd.to_numeric(output['similarity_not_perturbed'], errors='coerce')
similiarity_threshold = np.nanpercentile(similarity_not_perturbed, q)
print_filter_report('Two-change-dissimilar filter',
                    output["TwoChunksChanged"] & (similarity_not_perturbed < similiarity_threshold))

# Analyse the same original_word across sentences

In [None]:
# Per-word mean of the numeric features.
summary_columns = [
    'SRC_masked_index', 'SRC', 'original_word', 'perturbed_word', 'SRC_perturbed',
    'SRC-Trans', 'SRC_perturbed-Trans', '#TransChanges-#SrcChanges',
    '#TransChanges-#SrcChanges/SentenceLength',
    'ChangesSpread/SentenceLength', 'TwoChunksChanged', 'ChunkDistance',
    'is_same_subtree', 'changes_similarity', 'perturbed_trans_alignment',
    'not_perturbed_TGT_change_type', 'Trans-edit_distance--SD',
    '#TransChanges-#SrcChanges--SD', 'TwoChunksChanged--total'
]
# Several of these columns only exist for some read_output_df() settings (and
# the '#TransChanges-#SrcChanges*' ones are not created by it at all), so keep
# the ones that are present instead of failing with a KeyError.
available_columns = [column for column in summary_columns if column in output.columns]
# numeric_only=True: text/list/dict columns cannot be averaged.
output[available_columns].groupby('original_word').mean(numeric_only=True).head()




### Most changes filter:

In [None]:
# '#TransChanges-#SrcChanges' is not created by read_output_df (the assignment is
# commented out there), so derive it from the two edit distances when missing.
changes_column = '#TransChanges-#SrcChanges'
if changes_column in output.columns:
    output_with_changes = output
else:
    output_with_changes = output.assign(
        **{changes_column: output['Trans-edit_distance'] - output['SRC-edit_distance']}
    )

# numeric_only=True: text/list columns cannot be averaged.
groupped_by_word = output_with_changes.groupby('original_word').mean(numeric_only=True)

q = 10  # Take the q% groups with the highest changes
no_changes_thresthold = np.percentile(groupped_by_word[changes_column], 100-q)
bias_prediction = groupped_by_word[changes_column] > no_changes_thresthold

bias_word_predicted = groupped_by_word[bias_prediction].index.values

# Sentences of the selected words whose own number of extra changes also
# exceeds the (group-level) threshold
output_with_changes[
    output_with_changes['original_word'].isin(bias_word_predicted) & \
    (output_with_changes[changes_column] > no_changes_thresthold)
].head(2)





### Most-spreaded_changes filter

In [None]:
# Mean of every numeric feature per original word
# (numeric_only=True: text/list columns cannot be averaged).
groupped_by_word = output.groupby('original_word').mean(numeric_only=True)

q = 10  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(groupped_by_word['ChangesSpread/SentenceLength'], 100-q)
bias_prediction = groupped_by_word['ChangesSpread/SentenceLength'] > spread_thresthold

bias_word_predicted = groupped_by_word[bias_prediction].index.values

# Sentences of the selected words whose own spread also exceeds the
# (group-level) threshold
output[
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChangesSpread/SentenceLength'] > spread_thresthold)
].head(2)


### Two-faraway-changes filter

ACTUALLY, two-changes is not a bias filter. It is just an auxiliary filter to avoid paraphrasing cases. Using it, we will miss the cases where the model has both paraphrasing and ... (TODO: this sentence was left unfinished)

Here we consider in each group: the number of sentences that has 2 changes

In [None]:
two_changes_only = output[output["TwoChunksChanged"]]
# 'ChunkDistance' is stored as an object column (it holds pd.NA for the other
# rows); convert it to a numeric dtype so that it survives the numeric-only mean.
two_change_only_groupped_by_word = (
    two_changes_only
    .assign(ChunkDistance=pd.to_numeric(two_changes_only['ChunkDistance']))
    .groupby('original_word')
    .mean(numeric_only=True)
)


q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.percentile(two_change_only_groupped_by_word['ChunkDistance'], 100-q)
bias_prediction = two_change_only_groupped_by_word['ChunkDistance'] > distance_thresthold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

# Two-change sentences of the selected words whose own chunk distance also
# exceeds the (group-level) threshold
output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChunkDistance'] > distance_thresthold)
].head(2)



### Two-changes-different-subtree filter

In [None]:
# .copy() so that the helper column is added to an independent frame, not to a
# slice of `output`.
tmp = output[output["TwoChunksChanged"] & output['is_same_subtree'].notna()].copy()
tmp['not_same_subtree'] = 1 - tmp['is_same_subtree'].astype(int)
# Number of different-subtree changes per original word; only this column is
# summed, as the text/list columns of the frame are not meaningful to add up.
two_change_only_groupped_by_word = tmp.groupby('original_word')[['not_same_subtree']].sum()

q = 20  # Take the q% groups with the highest number of different subtree changes
count_thresthold = np.percentile(two_change_only_groupped_by_word['not_same_subtree'], 100-q)
# The threshold is a count of different-subtree changes, so it has to be
# compared with that same count (not with 'ChunkDistance').
bias_prediction = two_change_only_groupped_by_word['not_same_subtree'] > count_thresthold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['is_same_subtree'] == 0)
].head(2)



### Two-change-dissimilar filter

In [None]:
# The metrics cell above may already have joined this column; joining it a
# second time raises because the column names overlap.
if 'similarity_not_perturbed' not in output.columns:
    output = output.join(analyse_df['similarity_not_perturbed'])

two_changes_only = output[output["TwoChunksChanged"]]
# The similarity column is object-typed (floats mixed with missing values);
# convert it to float so that it survives the numeric-only mean.
two_change_only_groupped_by_word = (
    two_changes_only
    .assign(similarity_not_perturbed=pd.to_numeric(
        two_changes_only['similarity_not_perturbed'], errors='coerce'))
    .groupby('original_word')
    .mean(numeric_only=True)
)


q = 20  # Take the q% sentences with the lowest similarity of the not-perturbed change
similiarity_threshold = np.nanpercentile(two_change_only_groupped_by_word['similarity_not_perturbed'], q)
bias_prediction = two_change_only_groupped_by_word['similarity_not_perturbed'] < similiarity_threshold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['similarity_not_perturbed'] < similiarity_threshold)
].head(2)






In [None]:
# List the columns currently available in `output`; which ones exist depends on
# the flags passed to read_output_df() and on the joins done in earlier cells.
output.columns

## Find patterns

when a word A is replaced with B, then the change C happens

In [None]:
# Preview the first rows of the analysed output.
output.head()

In [None]:
# Source, perturbation and translations next to the list of
# (tag, original_text, changed_text) changes computed by calculate_change().
output[['SRC_masked_index', 'SRC', 'original_word', 'perturbed_word', 'SRC_perturbed',
       'SRC-Trans', 'SRC_perturbed-Trans', 'changes']].head()

In [None]:
import string

def lower_remove_non_alphabet(input_str):
    """
    Replace every ASCII punctuation character by a space and lower-case the result.

    Params:
        input_str: the string to normalize
    Returns:
        a string of the same length; only the characters in `string.punctuation`
        are replaced, so digits and whitespace are kept
    """
    punctuation_to_space = {symbol: ' ' for symbol in string.punctuation}
    table = input_str.maketrans(punctuation_to_space)
    without_punctuation = input_str.translate(table)
    return without_punctuation.lower()

In [None]:
def is_due_to_perturbation(change, original_word, perturbed_word, 
                           perturbed_trans_alignment_dict, original_trans_alignment_dict):
    """
    Decide whether a change in translation is directly due to the perturbation.

    The change is attributed to the perturbation if the (aligned) translation of perturbed_word
    is in the changed part OR the (aligned) translation of original_word is in the original part.
    A word that shows up verbatim in the respective part (missed translation, or a name that is
    copied rather than translated) counts as well.
    
    Params:
        change: tuple of (change_type, original_trans_part, changed_trans_part)
        original_word: original word in the SRC that was perturbed
        perturbed_word: the replacement of the original word
        perturbed_trans_alignment_dict: {src_word1:trans_word1, src_word2:trans_word2,...} of the perturbed trans
        original_trans_alignment_dict: {src_word1:trans_word1, src_word2:trans_word2,...} of the original trans
    Return:
        True   if at least one of the two words is found in its part of the change
        False  if both words could be looked up and neither is found
        pd.NA  if nothing is found and at least one word is missing from its alignment (failed alignment)
    """
    def normalise_alignment(alignment_dict):
        # Lowercase keys and values and drop punctuation and spaces altogether
        return dict(
            (lower_remove_non_alphabet(k).replace(' ', ''), lower_remove_non_alphabet(v).replace(' ', ''))
            for k, v in alignment_dict.items()
        )

    def word_appears(word, alignment_dict, trans_tokens):
        # pd.NA means "unknown": the word has no alignment and is not copied verbatim
        appears = pd.NA
        if word in alignment_dict:
            appears = alignment_dict[word] in trans_tokens
        # Missed-translation, or name-specific case
        if word in trans_tokens:
            appears = True
        return appears

    # Turn everything to lowercase, and replace punctuation by spaces.
    # change[0] (the change type) is not needed for the decision.
    original_trans_tokens = lower_remove_non_alphabet(change[1]).split()
    changed_trans_tokens = lower_remove_non_alphabet(change[2]).split()
    perturbed_trans_alignment_dict = normalise_alignment(perturbed_trans_alignment_dict)
    original_trans_alignment_dict = normalise_alignment(original_trans_alignment_dict)
    # NOTE(review): unlike the alignment keys, the words keep the spaces that replace their
    # punctuation, so a word such as "Fort-de-France" can never match a key - confirm intended.
    original_word = lower_remove_non_alphabet(original_word)
    perturbed_word = lower_remove_non_alphabet(perturbed_word)

    perturbed_word_appears_in_new_trans = word_appears(
        perturbed_word, perturbed_trans_alignment_dict, changed_trans_tokens
    )
    original_word_appears_in_old_trans = word_appears(
        original_word, original_trans_alignment_dict, original_trans_tokens
    )

    # One positive finding is enough, whatever is known about the other word
    if perturbed_word_appears_in_new_trans is True or original_word_appears_in_old_trans is True:
        return True
    # Nothing found: only answer False when both look-ups were actually possible
    if pd.isnull(perturbed_word_appears_in_new_trans) or pd.isnull(original_word_appears_in_old_trans):
        return pd.NA
    return False
    
    
def filter_changes(group_df):
    """
    Collect the translation changes of a group that are worth inspecting.

    A change is dropped when
        - it is due to the perturbation itself, or that cannot be decided (failed alignment),
        - it only deletes or inserts an '< unk >' token,
        - none of its tokens is tagged NOUN, VERB, ADJ or PRON by spacy_model.

    Params:
        group_df: rows holding 'changes', 'original_word', 'perturbed_word',
            'perturbed_trans_alignment' and 'original_trans_alignment'
    Return: list of the remaining changes, in row order (duplicates are kept)
    """
    unk_artifacts = (('delete', '< unk >', ''), ('insert', '', '< unk >'))
    content_related_tags = 'NOUN', 'VERB', 'ADJ', 'PRON'
    kept_changes = []

    for _, row in group_df.iterrows():
        for change in row['changes']:
            verdict = is_due_to_perturbation(
                change,
                row['original_word'],
                row['perturbed_word'],
                row['perturbed_trans_alignment'],
                row['original_trans_alignment'],
            )
            # Skip changes caused by the perturbation, and those we cannot judge
            if pd.isnull(verdict) or verdict:
                continue

            # Skip the weird <unk>
            if change in unk_artifacts:
                continue

            # Skip the changes that are not content-related
            pos_tags = {
                token.pos_
                for part in (change[1], change[2])
                for token in spacy_model(part)
            }
            if pos_tags.isdisjoint(content_related_tags):
                continue

            kept_changes.append(change)

    return kept_changes



In [None]:
from collections import Counter


def find_max_freq_change(group_df):
    """
    Count how often the most frequent filtered change occurs within one group.

    Params: 
        group_df: the group of results that has the same original_word and perturbed_word
    Return: frequency of the most common change kept by filter_changes, 0 if none is kept
    """
    # This function is for a single group, so each key column holds one value only
    for key_column in ('original_word', 'perturbed_word'):
        assert group_df[key_column].value_counts().shape[0] == 1

    top_change = Counter(filter_changes(group_df)).most_common(1)
    return top_change[0][1] if top_change else 0

# For every (original_word, perturbed_word) pair, compute the frequency of its
# most common filtered change and sort the pairs by it, highest first.
# NOTE(review): the rename assumes the apply() result column is labelled None -
# confirm this holds for the installed pandas version.
change_freq = output.groupby(
    ['original_word', 'perturbed_word'], as_index=False
).apply(find_max_freq_change).rename(columns={None: 'max_change_freq'}).sort_values(
    by='max_change_freq', ascending=False)
    

# Keep only the pairs whose replacement word is purely alphabetic.
change_freq = change_freq[change_freq['perturbed_word'].apply(lambda x: x.isalpha())]

change_freq.head(10)

In [None]:
# Print, for every word pair in order of decreasing max_change_freq, its two
# most common filtered changes.
groups = output.groupby(['original_word', 'perturbed_word'])
groups_as_list = [(original_perturb, group) for original_perturb, group in groups]
# NOTE(review): uses the index labels of change_freq as positions into the
# groupby iteration order - presumably both follow the same group order; verify.
re_ordered_groupes = [groups_as_list[i] for i in change_freq.index.values]

for original_perturb, group in re_ordered_groupes:
    print("----------------------")
    print(f"original SRC word: {original_perturb[0]}")
    print(f"perturbed SRC word: {original_perturb[1]}")
    # Recomputed here because find_max_freq_change only returned the top count
    all_changes = filter_changes(group)
    freq_changes = Counter(all_changes)
    print(freq_changes.most_common(2))

# Comments

--> starts to make sense, yet have not seen bias (even gender bias)

--> A bigger dataset for inference could help?

Some correlations are good, some are bad. Is it a good idea to prevent these correlations?

# Filter per sentence with different replacements


**Note**: can use [sequence alignments](https://stackoverflow.com/questions/5055839/word-level-edit-distance-of-a-sentence) to align the sentences on the target side only. ([code](https://gist.github.com/slowkow/06c6dba9180d013dfd82bec217d22eb5))

Pros: could be easier than SRC-TGT alignment

Cons: cases where the outputs have a different sentence structure yet the same meaning. <br>
E.g., "Today I think the cat is nice" -- "I think the cat is nice today"
SRC-TGT alignment would probably treat these as the same, but edit distance cannot, because it only has delete, insert and substitute operations.

In [None]:

def cast_to_index(string_index):
    """
    Convert one side of an aligned tuple into an index.

    The side is either the index of a word, or the character '-'
    (presumably marking that the word has no aligned counterpart - confirm with edist).

    Return: int index, or pd.NA for '-'
    """
    # Surrounding blank spaces are not part of the index
    cleaned = string_index.strip()
    return pd.NA if cleaned == '-' else int(cleaned)

def edist_alignment(tokenized_sentence1, tokenized_sentence2):
    """
    Align two tokenized sentences with edit distance.

    The backtrace returned by sed.standard_sed_backtrace is parsed from its
    string form: brackets are removed, entries are separated by ', ' and the
    two sides of an entry by 'vs.'.

    Return: list of (index_in_sentence1, index_in_sentence2) tuples; a side is pd.NA when it is '-'
    """
    backtrace_text = str(sed.standard_sed_backtrace(tokenized_sentence1, tokenized_sentence2))
    for bracket in ('[', ']'):
        backtrace_text = backtrace_text.replace(bracket, '')

    aligned_pairs = []
    for entry in backtrace_text.split(', '):
        sides = entry.split('vs.')
        aligned_pairs.append((cast_to_index(sides[0]), cast_to_index(sides[1])))

    return aligned_pairs

def reorder_according_to_alignment(tokenized_sentence1, tokenized_sentence2, alignment):
    """
    Lay the tokens of the second sentence out on the positions of the first one.

    Params:
        tokenized_sentence1: reference sentence; only its length is used
        tokenized_sentence2: sentence whose tokens are re-ordered
        alignment: (index_in_sentence1, index_in_sentence2) tuples, either side may be pd.NA
    Return: list as long as tokenized_sentence1; positions without an aligned token hold pd.NA
    """
    reordered = [pd.NA for _ in range(len(tokenized_sentence1))]
    for idx1, idx2 in alignment:
        # A pair with a missing side gives nothing to place
        if pd.isnull(idx1) or pd.isnull(idx2):
            continue
        reordered[idx1] = tokenized_sentence2[idx2]
    return reordered



In [None]:
def nltk_pos_tag(word):
    """Return the tag that nltk.pos_tag assigns to a single word tagged on its own."""
    tagged_words = nltk.pos_tag([word])
    _, tag = tagged_words[0]
    return tag

def is_content_tag(nltk_pos):
    """
    Tell whether an NLTK POS tag belongs to a content word.

    Content tags are those starting with NN (noun), V (verb), JJ (adjective)
    or PRP (pronoun). Adverbs (RB) are deliberately not included.
    """
    content_tags_prefix = ('NN', 'V', 'JJ', 'PRP')
    return nltk_pos.startswith(content_tags_prefix)


def uniquify(df_columns):
    """
    Yield the column names with a suffix added to the duplicated ones.

    The first occurrence of a name is kept; later ones become name_2, name_3, ...
    using the first suffix that has not been yielded before.
    """
    used = set()

    for name in df_columns:
        candidate, suffix = name, 1
        while candidate in used:
            suffix += 1
            candidate = f"{name}_{suffix}"

        used.add(candidate)
        yield candidate
        
        
def align_src_tgt_translations(sentence_df):
    """
    Align the original and all perturbed translations with the source sentence.

    Params:
        sentence_df: rows for one original word of one sentence. Reads 'SRC', 'tokenized_SRC',
            'original_word', 'perturbed_word', 'original_trans_alignment' and
            'perturbed_trans_alignment' (the last two hold {src_word: trans_word} dicts).
    Return:
        DataFrame with one row for the original word and one per perturbed word. The columns are
        the source tokens, with the perturbed position named '[MASK]' and duplicated tokens
        suffixed by uniquify.
    Raises:
        ValueError if original_word is not among the tokens of 'tokenized_SRC'
    """
    # Convert everything to lowercase (on a copy, the caller's columns stay as they are)
    sentence_df = sentence_df.copy()
    sentence_df['SRC'] = sentence_df['SRC'].apply(lambda x: x.lower())
    for alignment_column in ('original_trans_alignment', 'perturbed_trans_alignment'):
        sentence_df[alignment_column] = sentence_df[alignment_column].apply(
            lambda x: dict(
                (k.lower(), v.lower()) for k,v in x.items()
            )
        )
    
    original_word = sentence_df['original_word'].values[0]
    original_trans_alignment = sentence_df['original_trans_alignment'].values[0]
    
    # DataFrame.copy() does not copy the Python objects stored in its cells, so the token list
    # is still the caller's list. Copy it before masking: otherwise the caller's data is
    # overwritten and a second call fails to find original_word.
    original_src_tokenized = list(sentence_df['tokenized_SRC'].values[0])
    original_word_index = original_src_tokenized.index(original_word)
    original_src_tokenized[original_word_index] = '[MASK]'

    result_df = pd.DataFrame(
        index=[original_word]+sentence_df['perturbed_word'].tolist(), 
        columns=original_src_tokenized
    )
    
    # Add the original translation 
    result_df.loc[original_word] = original_trans_alignment
    result_df.loc[original_word, '[MASK]'] = original_trans_alignment.get(original_word, pd.NA)
    
    # Add the perturbed translation
    for _, row in sentence_df.iterrows():
        perturbed_word = row['perturbed_word']
        perturbed_trans_alignment = row['perturbed_trans_alignment']
        result_df.loc[perturbed_word] = perturbed_trans_alignment
        result_df.loc[perturbed_word, '[MASK]'] = perturbed_trans_alignment.get(perturbed_word, pd.NA)
    
    # Fix columns with same name (due to word occurs twice in a sentence).
    # Materialise the generator before it replaces the columns it iterates over.
    result_df.columns = list(uniquify(result_df.columns))
    
    return result_df
    
    
def align_translations_tgt_only(sentence_df):
    """
    Align all perturbed translations with the original translation, on the target side only.

    Params:
        sentence_df: rows for one original word of one sentence. Reads 'original_word',
            'perturbed_word', 'tokenized_SRC-Trans' and 'tokenized_SRC_perturbed-Trans'.
    Return:
        DataFrame with one row for the original word and one per perturbed word. The columns are
        the tokens of the original translation (duplicates suffixed by uniquify); a cell holds
        the aligned token of that row's translation, or pd.NA.
    """
    reference_word = sentence_df['original_word'].values[0]
    reference_tokens = sentence_df['tokenized_SRC-Trans'].values[0]
    row_labels = [reference_word] + sentence_df['perturbed_word'].tolist()

    aligned_df = pd.DataFrame(index=row_labels, columns=reference_tokens)

    # The original translation is trivially aligned with itself
    aligned_df.loc[reference_word] = reference_tokens

    # Each perturbed translation is laid out on the positions of the original one
    perturbed_pairs = zip(sentence_df['perturbed_word'], sentence_df['tokenized_SRC_perturbed-Trans'])
    for perturbed_word, perturbed_tokens in perturbed_pairs:
        token_alignment = edist_alignment(reference_tokens, perturbed_tokens)
        aligned_df.loc[perturbed_word] = reorder_according_to_alignment(
            reference_tokens, perturbed_tokens, token_alignment
        )

    # Fix columns with same name (due to word occurs twice in a sentence)
    aligned_df.columns = uniquify(aligned_df.columns)

    return aligned_df
        
        
def align_translations(sentence_df, align_type="src-trans"):
    """
    Dispatch to the requested alignment of the translations of one masked sentence.

    Params:
        sentence_df: df containing the different unmasking results of a masked sentence, along with the translations
        align_type: "src-trans" align the translations with the source sentence, using awesome-align
                    "trans-only" align the translations with each other, using edit distance
    Returns:
        result_df: the aligned translations of different perturbed src
    Raises:
        AssertionError if sentence_df covers more than one original word
        RuntimeError for any other align_type
    """
    # This function is for a single group, i.e. a single original word
    distinct_original_words = sentence_df['original_word'].value_counts()
    assert distinct_original_words.shape[0] == 1

    if align_type not in ("src-trans", "trans-only"):
        raise RuntimeError('Invalid alignment type')
    if align_type == "trans-only":
        return align_translations_tgt_only(sentence_df)
    return align_src_tgt_translations(sentence_df)
    
    
def analyse_single_sentence_perturbed_word(sentence_perturbed_word_df, align_type="trans-only", filter_content_word=True, return_tgt_word_index=False):
    """
    Group the aligned words of a sentence by how many distinct translations they
    receive across the different replacements of one perturbed word.

    Params:
        sentence_perturbed_word_df: df containing the different unmasking results of a masked sentence, along with the translations
        align_type: passed on to align_translations ("src-trans" or "trans-only")
        filter_content_word: whether to only look at content words (NLTK POS tags). Ignored for
            align_type "trans-only", because the NLTK tagger is only available for English.
        return_tgt_word_index: whether to report the column position rather than the column name
        
    Returns:
        dict with
            'perturbed_or_noise_words': list of columns with 5 or more distinct translations
            'words_with_clustered_trans': {column: {translation: count}} for 2 to 4 distinct translations
            'words_with_single_trans': {column: translation} for exactly 1 distinct translation
        Columns without any aligned translation are not reported.
    """
    # Honour the requested alignment type (it used to be hard-coded to "trans-only")
    aligned_trans = align_translations(sentence_perturbed_word_df, align_type=align_type)
    
    # NLTK pos tag only available for English, skip filtering content words on the target side
    if align_type == "trans-only":
        filter_content_word = False
    
    result = {'perturbed_or_noise_words': [], 
              'words_with_clustered_trans': {}, 
              'words_with_single_trans': {}
             }
    
    for col_idx, col in enumerate(aligned_trans.columns):
        # Strip the suffix that uniquify adds to duplicated columns
        word = col.split('_')[0]
        
        if filter_content_word and not is_content_tag(nltk_pos_tag(word)):
            continue
            
        # value_counts ignores the missing (not aligned) cells
        count_unique_translated_words = aligned_trans[col].value_counts()
        nr_unique_words = count_unique_translated_words.shape[0]
        key = col_idx if return_tgt_word_index else col
        
        if nr_unique_words >= 5:
            # If number of unique translations are large,
            # then this is the column of the perturbed word or noise
            result['perturbed_or_noise_words'].append(key)
        elif nr_unique_words >= 2:
            # TODO: similarity between the pair-wise unique translations is temporarily left out
            result['words_with_clustered_trans'][key] = count_unique_translated_words.to_dict()
        elif nr_unique_words == 1:
            result['words_with_single_trans'][key] = count_unique_translated_words.index[0]
            
    return result
    
    

In [None]:
def analyse_single_sentence(sentence_df, 
                            align_type="trans-only", 
                            filter_content_word=True, 
                            return_tgt_word_index=False):
    """
    For every word with ambiguous translations, list the perturbed source words
    that do and do not affect its translation.

    Params:
        sentence_df: df containing the different translations of different perturbed version of a sentence
        align_type: whether to align the perturbed translations to the original translation, or to the source sentence
        filter_content_word: whether to filter out the non-content words
        return_tgt_word_index: whether to return the word index rather than the actual word itself
    Returns:
        {ambiguous_word: {'no_effecting_words': [...], 'effecting_words': [...]}} where the lists
        hold the original (perturbed) source words, and ambiguous_word is a word with 2 to 4
        distinct translations for at least one of them
    """
    count_original_sentence_idx = sentence_df['SRC_original_idx'].value_counts()
    assert count_original_sentence_idx.shape[0] == 1  # Because this function is for a single group
    
    # One analysis per source word that was perturbed
    collect_results = {}
    for original_word, group_by_original_word in sentence_df.groupby("original_word", as_index=False):
        collect_results[original_word] = analyse_single_sentence_perturbed_word(
            group_by_original_word, 
            align_type=align_type,  # it used to be hard-coded to "trans-only"
            filter_content_word=filter_content_word,
            return_tgt_word_index=return_tgt_word_index
        )
    
    # For ambiguous words, find the perturbed words that makes its trans ambiguous,
    # and the perturbed words that makes its trans consistent.
    # dict.fromkeys de-duplicates while keeping a reproducible (first seen) order.
    ambiguous_words = dict.fromkeys(
        word
        for collected_result in collect_results.values()
        for word in collected_result['words_with_clustered_trans']
    )
    
    result = {}
    for ambiguous_word in ambiguous_words:
        no_effect_words = []
        effect_words = []
        
        for original_word, collected_result in collect_results.items():
            if ambiguous_word in collected_result['words_with_clustered_trans']:
                effect_words.append(original_word)
            elif ambiguous_word in collected_result['words_with_single_trans']:
                no_effect_words.append(original_word)
        
        result[ambiguous_word] = {'no_effecting_words': no_effect_words,
                                  'effecting_words': effect_words}
    
    return result
        
        





In [None]:
# analysis = output.groupby('SRC_masked_index').apply(lambda x: analyse_single_sentence_perturbed_word(x))  #.rename(columns={None: 'influenced_words'})


In [None]:
# output.to_pickle('tmp_storages/analyse_winoMT.pkl')
# output = pd.read_pickle('tmp_storages/analyse_winoMT.pkl')

In [None]:
# Some missing info samples from mustSHE
# output[output['CATEGORY']=='1F'][['SRC', 'SRC_original_idx']].drop_duplicates() #.head()

In [None]:
import pprint


# Show full cell contents and (practically) all rows when displaying frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 9999999)

# Sentence to inspect; also read by the next cell.
sentence_idx = 0
print(f"Original SRC sentence: \n {output[['SRC', 'SRC_original_idx']].drop_duplicates().set_index('SRC_original_idx').loc[sentence_idx]}")
print()

# Per ambiguous target word (reported by position), the source words that do / do not affect it.
pprint.pprint(analyse_single_sentence(output[output['SRC_original_idx'] == sentence_idx], align_type="trans-only", return_tgt_word_index=True))


In [None]:
# Source word whose replacements are inspected.
original_word = 'general'


# Uses `sentence_idx` set in the previous cell.
sentence_df = output[output['SRC_original_idx'] == sentence_idx]
sentence_single_perturbed_word_df = sentence_df[sentence_df['original_word'] == original_word]


# Print the grouping of target words, then display the aligned translations it is based on.
pprint.pprint(analyse_single_sentence_perturbed_word(sentence_single_perturbed_word_df, align_type="trans-only"))
align_translations(sentence_single_perturbed_word_df, align_type="trans-only")


# Quality analysis

- Have to use WMT21 data, because models for 2021 are available. They also have a clear evaluation script
- Have to do some manual fixes so that the translation tokenization matches the tokenization of the labeled data exactly

### Word-level

A translated word is uncertain if changing other words in the SRC sentence affects its translation. The assumption is that the translation of one word should depend on only a few other words, not on many.

E.g., My mother, who had a difficult childhood, is a great doctor.

The gender form of "doctor" should only change if we change the word "mother".

Hyperparam: The number of SRC words that affect the translation of the target word

Currently: If a translated word has 3 or more affecting SRC words, mark it as "BAD"

Metrics: Matthews correlation coefficient 
"It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient value between -1 and +1. A coefficient of +1 represents a perfect prediction, 0 an average random prediction and -1 an inverse prediction."

Current score: 0.267


Score of WMT21 shared task: baseline 0.370, best 0.510

WMT21 baseline: multilingual transformer-based Predictor-Estimator approach for both sentence level and word level


**How to do hyperparameter tuning if our goal is not to use the training data? Or is just using the dev set OK?**



In [None]:
def fix_tokenization(tokenized_sentence):
    """
    Split off a final dot that is still glued to the last token.

    Some of the sentences are tokenized differently in the labeled data, i.e., the last dot is
    not tokenized. This brings such a sentence in line with the labeled data:
    ['Das', 'Haus.'] -> ['Das', 'Haus', '.'].

    Params:
        tokenized_sentence: list of tokens
    Return:
        the tokens with the final dot as a token of its own. The input is returned unchanged if
        it is empty, already ends with '.', or does not end with a dot at all.
    """
    # Empty sentence, or the final dot is already a token of its own
    if len(tokenized_sentence) == 0 or tokenized_sentence[-1] == '.':
        return tokenized_sentence
    # Only detach a dot that is really there. Unconditionally cutting the last character
    # would turn e.g. ['Hello', 'world'] into ['Hello', 'worl', '.'].
    if not tokenized_sentence[-1].endswith('.'):
        return tokenized_sentence

    str_sentence = ' '.join(tokenized_sentence)
    str_sentence = str_sentence[:-1] + ' .'
    return str_sentence.split()


In [None]:
# Analysed translations of the WMT21 en-de data; the file name is also used
# further down to pick the matching gold scores.
trans_analysed_file = "tmp_storages/analyse_WMT21_DA_en2de_MultiplePerSentence_allWords.pkl"
output = pd.read_pickle(trans_analysed_file)

# Fix tokenization: sentences (by SRC_original_idx) that are passed through fix_tokenization
errornous_idxs = [80,86,109,122,143,285,306,314,427,430,528,560,760,884,908,924,940]

# The same repair is applied to the original and to the perturbed translations
for translation_column in ('tokenized_SRC-Trans', 'tokenized_SRC_perturbed-Trans'):
    output[translation_column] = output.apply(
        lambda x, column=translation_column: (
            fix_tokenization(x[column]) if x['SRC_original_idx'] in errornous_idxs else x[column]
        ),
        axis=1
    )


In [None]:
# Load in the gold labels (OK/BAD word-level tags)

# Alternative loader for the WMT22 tags file, kept for reference:
# with open("../data/wmt-qe-2022-data/test_data-gold_labels/task1_word-level/en-de/test.2022.en-de.tags", 'r') as f:
#     gold_labels = f.readlines()
#     gold_labels = [line.replace('\n', '').split() for line in gold_labels]
    
# Tab-separated file without header; quoting=3 is csv.QUOTE_NONE, so quote
# characters in the file are read as ordinary text.
gold_labels = pd.read_csv(
    '../data/wmt-qe-2021-data/en-de-test21/goldlabels/task2_wordlevel_mt.tags',
    header=None, sep='\t', quoting=3
)
gold_labels.head()

If a translated word has 3 or more affecting SRC words, mark it as "BAD"

In [None]:
def find_bad_word(tgt_src_effects, max_effecting_words=2):
    """
    Select the target words whose translation is affected by too many source words.

    Params:
        tgt_src_effects: {tgt_word: {'effecting_words': [...], ...}}, as returned by
            analyse_single_sentence
        max_effecting_words: a target word is bad if MORE than this many source words affect it.
            The default of 2 marks words with 3 or more affecting source words.
    Return: list of the bad target words, in the order of tgt_src_effects
    """
    return [
        tgt_word
        for tgt_word, src_effects in tgt_src_effects.items()
        if len(src_effects['effecting_words']) > max_effecting_words
    ]

In [None]:
# Predict one OK/BAD tag per token of the original translation, sentence by sentence.
word_tag = []
# NOTE(review): sentences are visited in order of first appearance in `output`;
# presumably this matches the sentence order of the gold labels - verify.
SRC_original_indices = output['SRC_original_idx'].unique()

for SRC_original_idx in SRC_original_indices:
    sentence_df = output[output['SRC_original_idx'] == SRC_original_idx]
    original_trans_length = len(sentence_df['tokenized_SRC-Trans'].values[0])
    # With return_tgt_word_index=True the target words are reported as token positions
    tgt_src_effects = analyse_single_sentence(sentence_df, 
                                              align_type="trans-only", return_tgt_word_index=True)
    bad_words = find_bad_word(tgt_src_effects)
    sentence_word_tags = ['BAD' if x in bad_words else 'OK' for x in range(0, original_trans_length)]
    word_tag.append(sentence_word_tags)





In [None]:
from sklearn.metrics import matthews_corrcoef

# Gold tags are taken from the last column of the tags file; predictions are
# flattened over all sentences.
# NOTE(review): both sequences must have the same length and token order - see
# the manual tokenization fix above.
flat_gold_labels = gold_labels.iloc[:, -1]
flat_pred_labels = [item for sublist in word_tag for item in sublist]

matthews_corrcoef(y_true=flat_gold_labels, y_pred=flat_pred_labels)

## Sentence level

Approximations:
- Negative corr with DA:
    - Changes edit distance 
    - Changes edit distance / length
    - Changes spread
    - Changes spread / length
    - Number of BAD tokens
   
Metrics: Pearson correlation coefficient: "Correlations of -1 or +1 imply an exact linear relationship"

DA scores:
Scores: highest 0.18
WMT21 scores: baseline 0.403, best 0.584


HTER scores: 
Scores: highest 0.28
WMT21 scores: baseline 0.529, best 0.653

**Can try to apply some function to the prediction, but then that's not unsupervised anymore**

**Can use as feature for QE model, but again not unsupervised anymore**

In [None]:
# trans_analysed_file = "analyse_WMT22_MQM_en2de.pkl"
# output = pd.read_pickle(trans_analysed_file)

In [None]:
from scipy.stats import zscore
from scipy.stats import pearsonr, spearmanr


# Sentence-level approximations: average each per-perturbation metric over all
# perturbations of the same original sentence.
approximations = output[
    ["SRC_original_idx", 
     "Trans-edit_distance", 
     "#TransChanges/SentenceLength",
     "ChangesSpread",
     "ChangesSpread/SentenceLength"
    ]
].groupby("SRC_original_idx").mean()

# Number of tokens tagged BAD per sentence.
# NOTE(review): `word_tag` follows the order of first appearance in `output`,
# while the groupby index is sorted - presumably `output` is sorted by
# SRC_original_idx so both agree; verify.
approximations['word_level_agg'] = [x.count('BAD') for x in word_tag]


for col in approximations.columns:
    # Normalize the approximations, invert the sign
    approximations[col] = -approximations[col]
    approximations[col] = zscore(approximations[col].values)
    
    
approximations.head()

Gold labels:

In [None]:
def _read_sentence_scores(score_file):
    """Read one float score per line; blank lines (e.g. a trailing newline) are skipped."""
    with open(score_file, 'r') as f:
        return [float(line) for line in f if line.strip()]


# Pick the sentence-level gold scores that belong to the analysed dataset.
# The name `gold_lables` (sic) is kept because other cells read it, and
# `gold_labels` already names the word-level tags.
if 'WMT21_DA' in trans_analysed_file:
    # NOTE(review): this branch reads the .hter file into `da_scores` - confirm
    # which of the two WMT21 scores (DA or HTER) is meant to be evaluated.
    da_scores = _read_sentence_scores("../data/wmt-qe-2021-data/en-de-test21/goldlabels/test21.hter")
    gold_lables = da_scores
elif 'WMT22_MQM' in trans_analysed_file:
    mqm_scores = _read_sentence_scores(
        "../data/wmt-qe-2022-data/test_data-gold_labels/task1_mqm/en-de/test.2022.en-de.mqm_z_score"
    )
    gold_lables = mqm_scores
else:
    # Fail here instead of leaving `gold_lables` undefined, or stale from an earlier run
    raise ValueError(f"No gold scores known for analysed file: {trans_analysed_file}")

    

Evaluation on gold labels:

In [None]:
# Pearson correlation (statistic and p-value) between the gold scores and each
# sign-inverted, z-scored approximation.
for col in approximations.columns:
    print(f"-----------------{col}-----------------")
    print(pearsonr(gold_lables, approximations[col].values))


In [None]:
# For every approximation: histogram of the predicted values, then the trimmed
# mean of the gold score within each bin of the prediction.
for col in approximations.columns:
    plot_df = pd.DataFrame({'true': gold_lables, 'pred': approximations[col].values})
#     plot_df = plot_df.sort_values('pred')
    
    X = plot_df['pred']
    Y = plot_df['true']
    
    # The bins are used to group the samples by X below, so they have to come from
    # the histogram of X. (They used to be taken from Y, whose value range is unrelated.)
    plt.figure()
    hist = plt.hist(X, bins=20)
    bin_boundaries = hist[1]
    plt.xlabel(col)
    plt.ylabel('count')
    
#     # Remove bins with too few samples
#     cut_point = 99999
#     for i, value in enumerate(hist[0]):
#         if value < 5:
#             cut_point = i
#             break

#     bin_boundaries = bin_boundaries[:cut_point]
    
    nr_bins = len(bin_boundaries) - 1
    x_plot = []
    y_plot = []
    for i in range(0, nr_bins):
        lower, upper = bin_boundaries[i], bin_boundaries[i+1]
        # Same convention as the histogram: bins are [lower, upper), the last one is
        # [lower, upper]. Strict inequalities on both sides would drop every sample
        # lying on an edge, among them the smallest and the largest X.
        below_upper = (X <= upper) if i == nr_bins - 1 else (X < upper)
        in_bin = (lower <= X) & below_upper
        x_plot.append((lower + upper)/2)
        # 10% trimmed mean, to limit the influence of outliers in Y
        y_plot.append(stats.trim_mean(Y[in_bin], 0.1))
    
    plt.figure()
    plt.plot(x_plot, y_plot)
    plt.xlabel(col)
    plt.ylabel('gold_lables')


    
    

In [None]:
# toks = output.groupby('SRC_original_idx').first()['tokenized_SRC-Trans'].tolist()
# toks = [' '.join(tok) for tok in toks]

# with open('test/tmp.txt', 'w') as f:
#     for x in toks:
#         f.writelines(x + '\n')


In [None]:
# Statistical test to see if SRC_similarity is higher than Trans_similarity
print(output["Trans-edit_distance"].mean() - output["SRC-edit_distance"].mean())
stats.ttest_rel(output["SRC-edit_distance"], 
                output["Trans-edit_distance"], 
                alternative='less')

Tiny p-value --> SRC-edit_distance is indeed significantly lower than Trans-edit_distance


(Careful with this though: with a very large number of samples, even a negligible difference becomes statistically significant)

In [None]:
# Distribution of "#TransChanges-#SrcChanges"; assigning to `_` hides the returned arrays.
_ = plt.hist(output["#TransChanges-#SrcChanges"], bins=50)

In [None]:
# Summary statistics and box plot of the length-normalised spread of the changes.
print(output["ChangesSpread/SentenceLength"].describe())
output["ChangesSpread/SentenceLength"].plot.box()

Some changes seem to have the same meaning but different phrasing, e.g., noun index 24, 36, 47

Both for en-de and en-vi


Kind of bias: en-vi adjective sample 82

Should we cherry-pick examples? Or cherry-pick the replacement?


Or narrow down scope of perturbation? (e.g., on countries, jobs, gender, ...?)



Some cherry-picked examples anyway:

- He comes from England --> Ông ấy đến từ Anh
- He comes from Vietnam --> Hắn đến từ Việt Nam
- He comes from North Korea --> Hắn đến từ Bắc Triều Tiên



- He is european --> Hắn là người Châu Âu
- He is asian --> Anh ấy là người châu Á.



- He has black hair --> Hắn có tóc đen.
- He has blonde hair --> Anh ấy có tóc vàng


But if we limit this, would it hurt the model's overall performance as well?

*Jan: some kind of loss to minimize the number of changes, without completely forbidding them*


# Translation quality vs #changes

In [None]:
from nltk.translate.gleu_score import sentence_gleu


def _gleu_of_row(row):
    """Sentence-level GLEU of the row's 'tokenized_SRC-Trans' against its single 'tokenized_REF'."""
    references = [row['tokenized_REF']]
    hypothesis = row['tokenized_SRC-Trans']
    return sentence_gleu(references, hypothesis)


# One GLEU score per row, stored as a new column.
output["OriginalTran_Quality"] = output.apply(_gleu_of_row, axis=1)


In [None]:
# Scatter plot: original translation quality (x) against the
# "#TransChanges-#SrcChanges/SentenceLength" column (y).
output.plot(
    kind='scatter',
    x='OriginalTran_Quality',
    y="#TransChanges-#SrcChanges/SentenceLength",
)

In [None]:
# 2x2 Pearson correlation matrix between translation quality and the
# "#TransChanges-#SrcChanges/SentenceLength" column.
quality_scores = output['OriginalTran_Quality']
relative_changes = output["#TransChanges-#SrcChanges/SentenceLength"]
np.corrcoef(quality_scores, relative_changes)

In [None]:
# Histogram of translation quality with the bin count chosen by Sturges' rule.
# `hist` keeps the full (counts, edges, patches) result for the following cells.
quality_scores = output["OriginalTran_Quality"]
hist = plt.hist(quality_scores, bins='sturges')
_bin_counts, bin_boundaries, _bin_patches = hist

In [None]:
# # Use bins with same number of samples instead of equal-sized bins

# results, bin_boundaries = pd.qcut(output["OriginalTran_Quality"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples: keep only the leading run of bins that each
# hold at least `min_bin_count` samples, and stop at the first sparse bin.
min_bin_count = 5
bin_counts = hist[0]
cut_point = len(bin_counts)  # index of the first sparse bin; default: no sparse bin
for i, value in enumerate(bin_counts):
    if value < min_bin_count:
        cut_point = i
        break

# Bin k is delimited by edges k and k+1, so keeping bins 0..cut_point-1 requires
# edges 0..cut_point, i.e. cut_point + 1 values. Slicing with [:cut_point] would
# also throw away the last well-populated bin.
# The edges are taken from `hist` so that counts and edges always come from the
# same histogram and re-running this cell gives the same result.
bin_boundaries = hist[1][:cut_point + 1]



In [None]:
# Trimmed-mean (10% cut on each tail) of the relative number of changes within
# each retained quality bin, plotted against the bin centre.
X = output['OriginalTran_Quality']
Y = output["#TransChanges-#SrcChanges/SentenceLength"]

x_plot = []
y_plot = []
for lower, upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
    # Use the same membership rule as plt.hist / np.histogram: bins are
    # half-open [lower, upper), except the histogram's final bin, which also
    # contains its right edge. Strict comparisons on both sides would drop every
    # sample lying exactly on an edge (including the minimum value).
    if upper == hist[1][-1]:
        in_bin = (X >= lower) & (X <= upper)
    else:
        in_bin = (X >= lower) & (X < upper)
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('OriginalTrans_Quality')
plt.ylabel('Avg_changes')

Most of the time there is a downward trend (not as clear for en-de with verb, adverb, pronoun; en-vi with adverb, pronoun)

**Note**: the plot has outliers removed in both X and Y dimensions, by removing too small bins (X) and trimmed-mean (Y)

# #changes vs translation quality

In [None]:
# 20-bin histogram of "#TransChanges-#SrcChanges"; `hist` keeps the full
# (counts, edges, patches) result and is echoed as the cell output.
change_diff = output["#TransChanges-#SrcChanges"]
hist = plt.hist(change_diff, bins=20)
_bin_counts, bin_boundaries, _bin_patches = hist
hist

In [None]:
# # Use bins with same number of samples instead of equal-sized bins
# results, bin_boundaries = pd.qcut(output["#TransChanges-#SrcChanges"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples: keep only the leading run of bins that each
# hold at least `min_bin_count` samples, and stop at the first sparse bin.
min_bin_count = 10
bin_counts = hist[0]
cut_point = len(bin_counts)  # index of the first sparse bin; default: no sparse bin
for i, value in enumerate(bin_counts):
    if value < min_bin_count:
        cut_point = i
        break

# Slice with `cut_point`, not the loop index `i`: when no bin is sparse, `i` is
# merely the last bin index and slicing with it silently discards valid bins.
# Bin k is delimited by edges k and k+1, so keeping bins 0..cut_point-1 requires
# edges 0..cut_point, i.e. cut_point + 1 values.
bin_boundaries = hist[1][:cut_point + 1]

In [None]:
# Trimmed-mean (10% cut on each tail) of translation quality within each
# retained "#TransChanges-#SrcChanges" bin, plotted against the bin centre.
X = output['#TransChanges-#SrcChanges']
Y = output["OriginalTran_Quality"]

x_plot = []
y_plot = []
for lower, upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
    # Use the same membership rule as plt.hist / np.histogram: bins are
    # half-open [lower, upper), except the histogram's final bin, which also
    # contains its right edge. Inclusive comparisons on both sides would count a
    # sample lying exactly on an interior edge in two adjacent bins.
    if upper == hist[1][-1]:
        in_bin = (X >= lower) & (X <= upper)
    else:
        in_bin = (X >= lower) & (X < upper)
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('Avg_changes')
plt.ylabel('OriginalTran_Quality')

# SentenceLength vs #changes

In [None]:
# Number of tokens in each tokenized source sentence.
output['SRC-length'] = output['tokenized_SRC'].map(len)

In [None]:
# Scatter plot: source sentence length (x) against "#TransChanges-#SrcChanges" (y).
output.plot(kind='scatter', x='SRC-length', y="#TransChanges-#SrcChanges")

In [None]:
# 2x2 Pearson correlation matrix between source length and "#TransChanges-#SrcChanges".
src_lengths = output['SRC-length']
change_diff = output["#TransChanges-#SrcChanges"]
np.corrcoef(src_lengths, change_diff)

In [None]:
# 20-bin histogram of source sentence length; `hist` keeps the full
# (counts, edges, patches) result for the following cells.
src_lengths = output["SRC-length"]
hist = plt.hist(src_lengths, bins=20)
_bin_counts, bin_boundaries, _bin_patches = hist

In [None]:
# Remove bins with too few samples: keep only the leading run of bins that each
# hold at least `min_bin_count` samples, and stop at the first sparse bin.
min_bin_count = 10
bin_counts = hist[0]
cut_point = len(bin_counts)  # index of the first sparse bin; default: no sparse bin
for i, value in enumerate(bin_counts):
    if value < min_bin_count:
        cut_point = i
        break

# Slice with `cut_point`, not the loop index `i`: when no bin is sparse, `i` is
# merely the last bin index and slicing with it silently discards valid bins.
# Bin k is delimited by edges k and k+1, so keeping bins 0..cut_point-1 requires
# edges 0..cut_point, i.e. cut_point + 1 values.
bin_boundaries = hist[1][:cut_point + 1]

In [None]:
# Trimmed-mean (10% cut on each tail) of "#TransChanges-#SrcChanges" within each
# retained source-length bin, plotted against the bin centre.
X = output['SRC-length']
Y = output["#TransChanges-#SrcChanges"]

x_plot = []
y_plot = []
for lower, upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
    # Use the same membership rule as plt.hist / np.histogram: bins are
    # half-open [lower, upper), except the histogram's final bin, which also
    # contains its right edge. Strict comparisons on both sides would drop every
    # sentence whose (integer) length falls exactly on a bin edge.
    if upper == hist[1][-1]:
        in_bin = (X >= lower) & (X <= upper)
    else:
        in_bin = (X >= lower) & (X < upper)
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('SRC-length')
plt.ylabel('Avg_changes')

# Beam_size vs #changes

In [None]:
# Load one result frame per beam size, keyed by beam size.
beam_dict = {}
beam_values = [1,2,3,4,5]
for beam in beam_values:
    beam_dict[beam] = read_output_df(dataset, perturb_type, beam, replacement_strategy)
    # Make sure the df all have the same index. Each frame is compared against
    # the first one loaded; comparing a frame's index with itself can never fail.
    reference_index = beam_dict[beam_values[0]].index
    assert beam_dict[beam].index.equals(reference_index), \
        f"Index of beam={beam} output differs from beam={beam_values[0]} output"


In [None]:
# Trimmed mean (10% cut on each tail) of "#TransChanges-#SrcChanges" per beam size.
trimmed_mean_changes = [
    stats.trim_mean(beam_dict[beam_size]['#TransChanges-#SrcChanges'], 0.1)
    for beam_size in beam_values
]
plt.plot(beam_values, trimmed_mean_changes)
plt.xlabel('beam')
plt.ylabel('mean_changes')

The mean alone might not say much

In [None]:
# One box per beam size showing the distribution of "#TransChanges-#SrcChanges".
changes_by_beam = [
    beam_dict[beam_size]['#TransChanges-#SrcChanges'] for beam_size in beam_values
]

fig, ax = plt.subplots()
ax.boxplot(changes_by_beam)
ax.set_xticklabels(beam_values)
ax.set_xlabel('beam')
ax.set_ylabel('#changes')

# Perturbed word type vs #changes

In [None]:
# Load one result frame per perturbed word type, keyed by word type.
# NOTE(review): `beam` is not assigned in this cell; it is whatever value an
# earlier cell left behind (the beam-size loop ends on 5) — confirm this is the
# intended beam size.
word_type_values = ["noun", "verb", "adjective", "adverb", "pronoun"]
word_type_dict = {}
for word_type in word_type_values:
    word_type_dict[word_type] = read_output_df(
        dataset,
        perturb_type=word_type,
        beam=beam,
        replacement_strategy=replacement_strategy,
    )

print('--------------------------------')
print('word type    -   trimmed-mean #changes')

# Trimmed mean (10% cut on each tail) of the change difference per word type.
for word_type in word_type_values:
    changes = word_type_dict[word_type]['#TransChanges-#SrcChanges']
    print(f"{word_type} - {stats.trim_mean(changes, 0.1)}")


In [None]:
# One box per word type showing the distribution of "#TransChanges-#SrcChanges".
changes_by_word_type = [
    word_type_dict[name]['#TransChanges-#SrcChanges'] for name in word_type_values
]

fig, ax = plt.subplots()
ax.boxplot(changes_by_word_type)
ax.set_xticklabels(word_type_values)
ax.set_xlabel('word_type')
ax.set_ylabel('#changes')

# #Changes per sentence across word types

See if the chaotic changes are sentence-specific. Perturbed pronouns are excluded because not many samples contain a pronoun.

In [None]:
# Find sentences that have every one of these word types perturbed: intersect
# the indexes of the per-word-type result frames.
word_type_values = ["noun", "verb", "adjective", "adverb"]
first_type, *remaining_types = word_type_values

index_intersection = word_type_dict[first_type].index
for other_type in remaining_types:
    other_index = word_type_dict[other_type].index
    index_intersection = index_intersection.intersection(other_index)

len(index_intersection)

In [None]:
# One column per word type holding "#TransChanges-#SrcChanges" for the
# sentences shared by all word types.
changes_per_word_type = pd.DataFrame()
for word_type in word_type_values:
    all_changes = word_type_dict[word_type]["#TransChanges-#SrcChanges"]
    changes_per_word_type[word_type] = all_changes.loc[index_intersection]

# Rows where the translation changed more than the source for every one of the
# four word types.
more_trans_changes = changes_per_word_type[['noun', 'verb', 'adjective', 'adverb']] > 0
changes_per_word_type[more_trans_changes.all(axis=1)]



Small portion of rows --> not sentence-specific

In [None]:
# Print the dependency parse of an English example sentence, one row per token:
# head text, relation label, token text.
import spacy
from spacy import displacy

sentence = "He is from Vietnam"
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

for token in doc:
    head_text = str(token.head.text)
    relation = str(token.dep_)
    token_text = str(token.text)
    print(f"{head_text:<15} {relation:^10} {token_text:>15}")

In [None]:
# For every token of the parsed sentence, list its ancestors and children in
# the dependency tree.
for token in doc:
    print(
        "------------------------------------------------",
        f"Token: {token.text}",
        f"Ancestors: {list(token.ancestors)}",
        f"Children: {list(token.children)}",
        sep="\n",
    )

In [None]:
# Print the dependency parse of a German example sentence, one row per token:
# head text, relation label, token text.
import spacy
from spacy import displacy

sentence = "Er kommt aus Vietnam"
nlp = spacy.load("de_core_news_sm")
doc = nlp(sentence)

print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

for token in doc:
    head_text = str(token.head.text)
    relation = str(token.dep_)
    token_text = str(token.text)
    print(f"{head_text:<15} {relation:^10} {token_text:>15}")