# Word alignment using [awesome-align](https://github.com/neulab/awesome-align)


### Prepare the src-tgt pairs in the correct format

"Inputs should be *tokenized* and each line is a source language sentence and its target language translation, separated by (`|||`)"

In [1]:
def read_output_df(dataset, perturb_type, beam, replacement_strategy, analyse_feature=True, 
                   ignore_case=False, no_of_replacements=1, chunk_max_length=1, spacy_model=None, 
                   w2v_model=None, use_alignment=False, winoMT=False, ref_available=False):
    """Load the translations of perturbed sentences, optionally with change-analysis features.

    The CSV of perturbed-source translations is read from ``output/``. For
    non-WinoMT runs the translation of the unperturbed source
    (``OriginalSRC-Trans``) is joined in from the ``perturbNone`` run.
    Sentence columns are cast to ``str`` and the columns are put in a fixed
    order.

    Args:
        dataset: Dataset directory name under ``output/``.
        perturb_type: Perturbation name used in the
            ``beam{beam}_perturb{perturb_type}`` directory.
        beam: Beam size used in the directory name.
        replacement_strategy: Strategy directory name. The value
            ``'word2vec_similarity'`` also changes how ``SRC-edit_distance``
            is computed.
        analyse_feature: When true, rows containing NaN are dropped and the
            edit-distance / chunk-change feature columns are added.
        ignore_case: When true, the sentence columns are lower-cased.
        no_of_replacements: ``1`` reads ``translations.csv``; any other value
            reads ``translations_5replacements.csv`` (the file name is
            hard-coded to 5) and expects a ``SRC_index`` column.
        chunk_max_length: Forwarded to ``two_chunk_changed``.
        spacy_model: Forwarded to ``two_chunk_changed``; when not ``None``
            and ``use_alignment`` is set, also used for POS tagging.
        w2v_model: Forwarded to ``two_chunk_changed``.
        use_alignment: When true, word alignments are loaded via
            ``load_alignment(path_prefix)`` and used to annotate
            ``changes_similarity``.
        winoMT: When true, the fixed WinoMT CSV is read instead of the
            dataset-specific one; that file is expected to already contain
            ``OriginalSRC-Trans`` and ``Bias_sample``.
        ref_available: Whether the data has a ``REF`` column to keep.

    Returns:
        pandas.DataFrame with the ordered columns, plus the feature columns
        when ``analyse_feature`` is true.
    """
    if winoMT:
        path_prefix = "output/winoMT_asmetric/wmt19_winoMT_perturbed"
        output_df = pd.read_csv(f"{path_prefix}_format.csv", index_col=0)
    else:
        strategy_dir = f"output/{dataset}/{replacement_strategy}"
        run_dir = f"{strategy_dir}/beam{beam}_perturb{perturb_type}/seed0"
        if no_of_replacements == 1:
            path_prefix = f"{run_dir}/translations"
        else:
            path_prefix = f"{run_dir}/translations_5replacements"

        output_df = pd.read_csv(f"{path_prefix}.csv", index_col=0)

        # Join to get the translation of the original sentences as well
        original_translations = pd.read_csv(
            f"{strategy_dir}/beam{beam}_perturbNone/seed0/translations.csv", index_col=0
        )['OriginalSRC-Trans']
        output_df = output_df.join(original_translations)

    # Convert columns with sentences to str type
    sentence_cols = ['SRC', 'REF', 'SRC_perturbed', 'SRC_perturbed-Trans', 'OriginalSRC-Trans']
    if not ref_available:
        sentence_cols.remove('REF')
    output_df[sentence_cols] = output_df[sentence_cols].astype(str)

    if ignore_case:
        # Column-wise .str.lower() instead of the deprecated DataFrame.applymap
        output_df[sentence_cols] = output_df[sentence_cols].apply(lambda col: col.str.lower())

    # Reorder the columns
    if winoMT:
        ordered_cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed',
                        'OriginalSRC-Trans', 'SRC_perturbed-Trans', 'Bias_sample']
    elif no_of_replacements == 1:
        ordered_cols = ['SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed',
                        'OriginalSRC-Trans', 'SRC_perturbed-Trans']
    else:
        ordered_cols = ['SRC_index', 'SRC', 'REF', 'original_word', 'perturbed_word', 'SRC_perturbed',
                        'OriginalSRC-Trans', 'SRC_perturbed-Trans']
    if not ref_available:
        ordered_cols.remove('REF')
    output_df = output_df[ordered_cols]

    if analyse_feature:
        print(f"Original df shape: {output_df.shape}")
        # NOTE(review): the sentence columns were cast to str above, so this
        # presumably relies on NaN in the word columns to spot non-perturbed
        # rows -- confirm against the data.
        output_df = output_df.dropna()
        print(f"After dropping none-perturbed sentences: {output_df.shape}")

        if replacement_strategy == 'word2vec_similarity':
            # SRC difference is the number of occurrences of the word we perturb
            output_df["SRC-edit_distance"] = output_df.apply(
                lambda x: nltk.word_tokenize(x['SRC']).count(x['original_word']), axis=1)
        else:
            output_df["SRC-edit_distance"] = 1
        output_df['Trans-edit_distance'] = output_df.apply(
            lambda x: levenshtein(nltk.word_tokenize(x['OriginalSRC-Trans']),
                                  nltk.word_tokenize(x['SRC_perturbed-Trans'])),
            axis=1)
        output_df["#TransChanges-#SrcChanges"] = output_df['Trans-edit_distance'] - output_df['SRC-edit_distance']

        # Same difference, normalised by the number of source tokens
        src_lengths = output_df['SRC'].apply(lambda x: len(nltk.word_tokenize(x)))
        output_df["#TransChanges-#SrcChanges/SentenceLength"] = output_df["#TransChanges-#SrcChanges"] / src_lengths

        output_df["ChangesSpread/SentenceLength"] = output_df.apply(
            lambda x: changes_spread(x['SRC_perturbed-Trans'], x['OriginalSRC-Trans']), axis=1)

        # two_chunk_changed returns one 6-tuple per row; zip(*) transposes the
        # rows into six column-wise sequences.
        # NOTE(review): the call passes (perturbed, original) while the first
        # two results are stored as (original, perturbed) -- confirm the
        # return order of two_chunk_changed.
        output_df["OriginalSRC-Trans"], output_df['SRC_perturbed-Trans'], \
        output_df['TwoChunksChanged'],  output_df['ChunkDistance'], \
        output_df["is_same_subtree"], output_df['changes_similarity'] = zip(*output_df.apply(
            lambda x: two_chunk_changed(
                x['SRC_perturbed-Trans'], 
                x['OriginalSRC-Trans'], 
                chunk_max_length=chunk_max_length, 
                spacy_model=spacy_model,
                w2v_model=w2v_model), 
            axis=1))

        if use_alignment:
            # In the case where two changes occur and the two similarities are calculated,
            # find out which change is due to the perturbation
            output_df['perturbed_trans_alignment'] = load_alignment(path_prefix)
            output_df['changes_similarity'] = output_df.apply(
                lambda x: add_reason_of_change(
                    alignment=x['perturbed_trans_alignment'],
                    changes=x['changes_similarity'],
                    perturbed_src_word=x['perturbed_word']
                ),
                axis=1
            )

            if spacy_model is not None:
                # Add POS tagging of the not-perturbed change
                output_df['not_perturbed_TGT_change_type'] = output_df['changes_similarity'].apply(
                    lambda x: pos_tag_not_perturbed_change(x, spacy_model))

        if no_of_replacements > 1:
            print(output_df.columns)
            # Per-source-sentence aggregates over all of its replacements
            per_source = output_df.groupby(by="SRC_index")
            additional_col_1 = per_source[['Trans-edit_distance', '#TransChanges-#SrcChanges']].std()
            additional_col_2 = per_source[['TwoChunksChanged']].sum()

            # Join on the SRC_index column (the aggregates are indexed by it)
            # so the result does not depend on the row index matching SRC_index.
            output_df = output_df.join(additional_col_1, on="SRC_index", rsuffix='--SD')
            output_df = output_df.join(additional_col_2, on="SRC_index", rsuffix='--total')

    return output_df

    


In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
import gensim.downloader as api
from gensim.models.fasttext import load_facebook_model
import random
from difflib import SequenceMatcher
from scipy import stats
import sacrebleu
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats


# Experiment configuration: selects which translation output is loaded.
perturb_type = 'content'
dataset = 'masked_covost2_for_en2de'  # 'MuST-SHE-en2fr' 'IWSLT15-en2vi' 'wmt19-newstest2019-en2de'
beam = 5
replacement_strategy = 'masking_language_model'
no_of_replacements = 5
ignore_case = False  # Only Europarl needs ignore case
chunk_max_length = 1
spacy_model = None
# Loading these models is time consuming
# de_model = load_facebook_model("data/cc.de.300.bin").wv
# vi_model = load_facebook_model("data/cc.vi.300.bin").wv

winoMT = False

# # Uncomment to analyse WinoMT instead; this overrides the params above
# winoMT = True
# perturb_type = 'pronoun'
# no_of_replacements = 1

# winoMT is forwarded so the loaded frame matches the path prefix that the
# next cell derives from the same flag.
output = read_output_df(dataset=dataset, perturb_type=perturb_type, beam=beam, 
                        replacement_strategy=replacement_strategy, ignore_case=ignore_case,
                        no_of_replacements=no_of_replacements, chunk_max_length=chunk_max_length,
                        spacy_model=spacy_model, analyse_feature=False, winoMT=winoMT)

output.head()

Unnamed: 0,SRC_index,SRC,original_word,perturbed_word,SRC_perturbed,OriginalSRC-Trans,SRC_perturbed-Trans
0,0,So the family will need to relearn everything.,need,have,So the family will have to relearn everything.,Also wird die Familie alles neu lernen müssen.,Also wird die Familie alles neu lernen müssen.
0,0,So the family will need to relearn everything.,need,try,So the family will try to relearn everything.,Also wird die Familie alles neu lernen müssen.,"Also wird die Familie versuchen, alles neu zu ..."
0,0,So the family will need to relearn everything.,need,want,So the family will want to relearn everything.,Also wird die Familie alles neu lernen müssen.,Also wird die Familie alles neu lernen wollen.
0,0,So the family will need to relearn everything.,need,continue,So the family will continue to relearn everyth...,Also wird die Familie alles neu lernen müssen.,Also wird die Familie weiterhin alles neu lernen.
0,0,So the family will need to relearn everything.,need,get,So the family will get to relearn everything.,Also wird die Familie alles neu lernen müssen.,Also wird die Familie alles neu lernen.


In [4]:
# Rebuild the path prefix of the translation file that `output` was loaded from.
if winoMT:
    path_prefix = "output/winoMT_asmetric/wmt19_winoMT_perturbed"
elif no_of_replacements == 1:
    path_prefix = f"output/{dataset}/{replacement_strategy}/beam{beam}_perturb{perturb_type}/seed0/translations"
else:
    path_prefix = f"output/{dataset}/{replacement_strategy}/beam{beam}_perturb{perturb_type}/seed0/translations_5replacements"

output_df_path = f"{path_prefix}_reformatted.txt"

# One "tokenized source ||| tokenized translation" pair per line.
# The encoding is explicit because the translations contain non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(output_df_path, 'w', encoding='utf-8') as file:
    for _, row in output.iterrows():
        tokenized_src = ' '.join(nltk.word_tokenize(row['SRC_perturbed']))
        tokenized_tgt = ' '.join(nltk.word_tokenize(row['SRC_perturbed-Trans']))
        file.write(f"{tokenized_src} ||| {tokenized_tgt}\n")


### Load in the alignments

In [21]:
# Read the word-alignment file, one entry per line with trailing whitespace
# stripped. The encoding is explicit because the aligned German tokens
# contain non-ASCII characters.
with open(f"{path_prefix}_word_alignment.txt", encoding='utf-8') as f:
    lines = [line.rstrip() for line in f]

print(lines)

FileNotFoundError: [Errno 2] No such file or directory: 'output/wmt19-newstest2019-en2de/masking_language_model/beam5_perturbverb/seed0/translations_5replacements_word_alignment.txt'

In [15]:
# Every alignment line is a whitespace-separated run of word pairs whose two
# halves are joined by '<sep>'; turn each line into a list of split pairs.
translation_alignment = [
    [aligned_pair.split('<sep>') for aligned_pair in alignment_line.split()]
    for alignment_line in lines
]


In [16]:
translation_alignment

[[['Welsh', 'Welsh'],
  ['was', 'hatte'],
  ['worried', 'Angst'],
  ["about'looking", ','],
  ['like', 'wie'],
  ['muppets', 'Muppets'],
  ["'", "''"]],
 [['Welsh', 'Welsh'],
  ['is', 'macht'],
  ['worried', 'Sorgen'],
  ["about'looking", ','],
  ['like', 'wie'],
  ['muppets', 'Muppets'],
  ["'", "''"]],
 [['Welshman', 'Waliser'],
  ['worried', 'besorgt'],
  ["about'looking", 'über'],
  ["about'looking", 'Aussehen'],
  ['like', 'wie'],
  ['muppets', 'Muppets'],
  ["'", "''"]],
 [['Welsh', 'Waliser'],
  ['were', 'machten'],
  ['worried', 'Sorgen'],
  ["about'looking", ','],
  ['like', 'wie'],
  ['muppets', 'Muppets'],
  ["'", "''"]],
 [['Welsh', 'Waliser'],
  [',', ','],
  ['worried', 'Sorge'],
  ['worried', 'Waliser'],
  ["about'looking", '``'],
  ["about'looking", 'auszusehen'],
  ['like', 'wie'],
  ['muppets', 'Muppets'],
  ["'", "''"]],
 [['There', 'herrscht'],
  ['is', 'herrscht'],
  ['consternation', 'Bestürzung'],
  ['among', 'Unter'],
  ['some', 'einigen'],
  ['AMs', 'AMs'],
  [