In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.downloader as api
from gensim.models.fasttext import load_facebook_model
import random
from difflib import SequenceMatcher
from scipy import stats
import sacrebleu
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os
import spacy
import re
import edist.sed as sed
from sacremoses import MosesTokenizer, MosesDetokenizer
import sys  
sys.path.insert(0, '../')

from read_and_analyse_df import read_output_df
from align_and_analyse_ambiguous_trans import align_translations, analyse_single_sentence_single_perturbed_word, analyse_single_sentence


pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# nltk.download()

In [3]:
# ---- Experiment configuration ------------------------------------------------
# These values are forwarded to read_output_df below to select which stored
# perturbation results are loaded and analysed.
mask_type = 'MultiplePerSentence_allWords'
src_lang = 'en'
tgt_lang = 'de'
dataset = f'WMT21_DA_dev_{src_lang}2{tgt_lang}'  # 'MuST-SHE-en2fr' 'IWSLT15-en2vi' 'wmt19-newstest2019-en2de'
beam = 5
replacement_strategy = 'masking_language_model'
no_of_replacements = 30
ignore_case = False  # Only Europarls needs ignore case
chunk_max_length=1
# NOTE(review): the spaCy model is hard-coded to German while tgt_lang is
# configurable -- presumably it has to be swapped when tgt_lang changes; confirm.
spacy_model = spacy.load("de_core_news_sm")
# Loading these models in is time consuming
# de_model = load_facebook_model("../data/cc.de.300.bin").wv
# vi_model = load_facebook_model("../data/cc.vi.300.bin").wv
winoMT = False


# NOTE(review): perturb_type is only bound when winoMT is True, but the
# "Beam_size vs #changes" cell further down reads it unconditionally.
if winoMT:
    perturb_type = 'pronoun'
    no_of_replacements = 1

# Load the stored results and compute the requested per-sample features.
# The result is bound to `output`, which every later cell reads.
output = read_output_df(df_root_path='../output', dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang, mask_type=mask_type, 
                        beam=beam, replacement_strategy=replacement_strategy, ignore_case=ignore_case,
                        no_of_replacements=no_of_replacements, chunk_max_length=chunk_max_length,
                        spacy_model=spacy_model, w2v_model=None, use_src_tgt_alignment=False, 
                        winoMT=winoMT, tokenize_sentences=True, 
                        analyse_feature=[
                                    'highlight_changes',   # Highlight the changes in the translation in capital
                                    'edit_distance',
                                    'change_spread',       # Longest distance between 2 changes
                                    ])

# print('BLEU score: ')
# sacrebleu.corpus_bleu(output['SRC-Trans'].tolist(), [output['REF'].tolist()]).score

Tokenize everything ...
Original df shape: (426720, 15)
After dropping none-perturbed sentences: (426720, 15)
Calculating the changes between translations of original SRC and perturbed SRC ...
Highlighting the changes ...
Calculating the edit distance ...


In [5]:
# Cache the analysed frame on disk so later sessions can skip the slow
# read_output_df step. Create the target directory first: to_pickle does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('tmp_storages', exist_ok=True)
output.to_pickle(f'tmp_storages/analyse_{dataset}_{mask_type}.pkl')

# Comments

- On `wmt19-newstest2019-en2de, chunk_max_length=2`
    - 902: change to 1 SRC word leads to fixed changes of an irrelevant word
    - In many cases, the form of the verb (e.g., present or past tense) is changed --> harmful in the sense that it hurts the performance score?
    - Word not being translated 
    - Spoken/written style
    - Time
    
    
- On `IWSLT15-en2vi, adjective`
    - 1003: a change of 1 word consistently leads to a change in the subject
    
    - 1003, 145, 990 noun: same
    - 236 noun: same, funny but not sure if it is wrong
    - 308 verb same 
    
--> Quantify the verb form change by stemming/lemmatization
    
Chúng, họ, gã, cô ấy, cô ta, anh ta, hắn

Changes in the word "you"


In [None]:
# output[output['#TransChanges-#SrcChanges'] > 10].head(5)
# output[output["ChangesSpread/SentenceLength"] > 0.85].head(20)



# Two chunks changed that consistently changed over the different replacement of a word


# output[(output["TwoChunksChanged"] == True) & (output["TwoChunksChanged--total"] == 5)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(1)
# output[(output["TwoChunksChanged"] == True)].sort_values(by='ChunkDistance', axis=0, ascending=False).head(100)

# Two words changed that are not in the same subtree
# output[(output["TwoChunksChanged"] == True) & (output["is_same_subtree"] == False) & (output["TwoChunksChanged--total"] == 5)]




# IWSLT15-en2vi, noun
# output.loc[[1003, 145, 990, 236]]







Sort the samples by the least similarity in changed words

In [None]:
# Filter out the 2-word-changed cases and similarity can be calculated
def get_not_perturbed_change_similarity(changes):
    """
    Return the 'semantic_similarity' of the first change whose 'change_type'
    is 'not_perturbed'.

    Params:
        changes: iterable of dict-like records, each read through the keys
                 'change_type' and 'semantic_similarity'
    Return: the similarity of the first matching record, or pd.NA when no
            record has change_type == 'not_perturbed'
    """
    matching_similarities = (
        record['semantic_similarity']
        for record in changes
        if record['change_type'] == 'not_perturbed'
    )
    # The generator is lazy, so the scan stops at the first match.
    return next(matching_similarities, pd.NA)

# Keep the samples where exactly two chunks changed, a similarity could be
# computed, and the change outside the perturbed word is a content word.
# .copy() makes analyse_df an independent frame: adding a column to a filtered
# view of `output` triggers SettingWithCopyWarning and is not guaranteed to stick.
analyse_df = output[
    (output["TwoChunksChanged"] == True)
    & output['changes_similarity'].notna()
    & output['not_perturbed_TGT_change_type'].isin(['NOUN', 'VERB', 'ADJ', 'PRON'])
].copy()

# get_not_perturbed_change_similarity may return pd.NA, which would make this an
# object column; coerce to float (NA -> NaN) so sorting and percentiles work.
analyse_df['similarity_not_perturbed'] = pd.to_numeric(
    analyse_df['changes_similarity'].apply(get_not_perturbed_change_similarity),
    errors='coerce',
)

# Show the 50 samples whose not-perturbed change is least similar.
analyse_df.sort_values(by='similarity_not_perturbed')[['SRC', 
                                                f'original_word', 
                                                f'perturbed_word',
                                                'SRC-Trans',
                                                f'SRC_perturbed-Trans',
                                                'ChunkDistance',
                                                'changes_similarity',
                                                'similarity_not_perturbed',
                                                'not_perturbed_TGT_change_type',
#                                                 'Bias_sample'
                                                      ]].head(50)


### Calculate metrics for detecting the bias samples

High precision --> higher chance that the returned samples are biased --> saves human time

High recall --> more biased samples are retrieved --> can detect more types of bias

We focus on precision then (save human cost)

In [None]:
from sklearn.metrics import classification_report

def report_bias_filter(title, bias_prediction):
    """
    Print the classification report of one heuristic bias filter.

    Params:
        title: filter name shown in the banner line
        bias_prediction: boolean Series aligned with `output`, True where the
                         filter flags the sample; it is scored against
                         output['Bias_sample']
    """
    print(f' -------------------- {title} -------------------- ')
    print(classification_report(y_true=output['Bias_sample'], y_pred=bias_prediction))


q = 20  # Take the q% sentences with the highest changes
no_changes_thresthold = np.percentile(output['#TransChanges-#SrcChanges'], 100-q)
report_bias_filter('Most-changes filter',
                   output['#TransChanges-#SrcChanges'] > no_changes_thresthold)

q = 20  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(output['ChangesSpread/SentenceLength'], 100-q)
report_bias_filter('Most-spreaded_changes filter',
                   output['ChangesSpread/SentenceLength'] > spread_thresthold)

report_bias_filter('Two-changes filter', output["TwoChunksChanged"])

q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.nanpercentile(output['ChunkDistance'], 100-q)
report_bias_filter('Two-faraway-changes filter',
                   output["TwoChunksChanged"] & (output['ChunkDistance'] > distance_thresthold))

report_bias_filter('Two-changes-different-subtree filter',
                   output["TwoChunksChanged"] & (output["is_same_subtree"] == False))

q = 90  # Take the q% sentences with the lowest similarity of the not-perturbed change
# Attach the similarity to a local frame instead of rebinding `output`: joining
# into `output` here made the cell fail when re-run and made the identical join
# in the per-word "Two-change-dissimilar filter" cell raise on column overlap.
if 'similarity_not_perturbed' in output.columns:
    output_with_similarity = output
else:
    output_with_similarity = output.join(analyse_df['similarity_not_perturbed'])
# Coerce to float so pd.NA entries become NaN and are skipped by nanpercentile.
similarity_not_perturbed = pd.to_numeric(
    output_with_similarity['similarity_not_perturbed'], errors='coerce')
similiarity_threshold = np.nanpercentile(similarity_not_perturbed, q)
report_bias_filter('Two-change-dissimilar filter',
                   output["TwoChunksChanged"] & (similarity_not_perturbed < similiarity_threshold))

# Analyse the same original_word across sentences

In [None]:
# Average the per-sample statistics over all samples that perturb the same SRC
# word. The selection also contains text/structured columns (sentences,
# alignments, change lists); numeric_only=True restricts the mean to the numeric
# ones. Without it pandas >= 2.0 raises TypeError instead of silently dropping
# the non-numeric columns.
output[[
    'SRC_masked_index', 'SRC', 'original_word', 'perturbed_word', 'SRC_perturbed',
    'SRC-Trans', 'SRC_perturbed-Trans', '#TransChanges-#SrcChanges',
    '#TransChanges-#SrcChanges/SentenceLength',
    'ChangesSpread/SentenceLength', 'TwoChunksChanged', 'ChunkDistance',
    'is_same_subtree', 'changes_similarity', 'perturbed_trans_alignment',
    'not_perturbed_TGT_change_type', 'Trans-edit_distance--SD',
    '#TransChanges-#SrcChanges--SD', 'TwoChunksChanged--total'
]].groupby('original_word').mean(numeric_only=True).head()




### Most changes filter:

In [None]:
# Per-word averages of the numeric statistics. numeric_only=True keeps the
# aggregation working on pandas >= 2.0, where averaging the text columns of
# `output` raises TypeError.
groupped_by_word = output.groupby('original_word').mean(numeric_only=True)

q = 10  # Take the q% groups with the highest changes
no_changes_thresthold = np.percentile(groupped_by_word['#TransChanges-#SrcChanges'], 100-q)
bias_prediction = groupped_by_word['#TransChanges-#SrcChanges'] > no_changes_thresthold

# Words whose average number of extra translation changes is above the threshold
bias_word_predicted = groupped_by_word[bias_prediction].index.values

# Show example samples of the flagged words that are themselves above the threshold
output[
    output['original_word'].isin(bias_word_predicted) & \
    (output['#TransChanges-#SrcChanges'] > no_changes_thresthold)
].head(2)





### Most-spreaded_changes filter

In [None]:
# Per-word averages of the numeric statistics. numeric_only=True keeps the
# aggregation working on pandas >= 2.0, where averaging the text columns of
# `output` raises TypeError.
groupped_by_word = output.groupby('original_word').mean(numeric_only=True)

q = 10  # Take the q% sentences with the highest spread
spread_thresthold = np.percentile(groupped_by_word['ChangesSpread/SentenceLength'], 100-q)
bias_prediction = groupped_by_word['ChangesSpread/SentenceLength'] > spread_thresthold

# Words whose average change spread is above the threshold
bias_word_predicted = groupped_by_word[bias_prediction].index.values

# Show example samples of the flagged words that are themselves above the threshold
output[
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChangesSpread/SentenceLength'] > spread_thresthold)
].head(2)


### Two-faraway-changes filter

ACTUALLY two-changes is not a bias filter. It's just an auxiliary filter to avoid paraphrasing cases. Using this we will miss out on the cases where the model has both paraphrasing and ... (TODO: this sentence was left unfinished)

Here we consider in each group: the number of sentences that has 2 changes

In [None]:
# Per-word averages restricted to the samples where exactly two chunks changed.
# numeric_only=True keeps the aggregation working on pandas >= 2.0, where
# averaging the text columns of `output` raises TypeError.
two_change_only_groupped_by_word = (
    output[output["TwoChunksChanged"]].groupby('original_word').mean(numeric_only=True)
)


q = 20  # Take the q% sentences with the furthest distance between 2 changes 
distance_thresthold = np.percentile(two_change_only_groupped_by_word['ChunkDistance'], 100-q)
bias_prediction = two_change_only_groupped_by_word['ChunkDistance'] > distance_thresthold


# Words whose two changes are, on average, furthest apart
bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['ChunkDistance'] > distance_thresthold)
].head(2)



### Two-changes-different-subtree filter

In [None]:
# Two-chunk-change samples for which the subtree relation is known.
# .copy() so the helper column below is added to an independent frame rather
# than to a filtered view of `output` (SettingWithCopyWarning).
tmp = output[output["TwoChunksChanged"] & output['is_same_subtree'].notna()].copy()
tmp['not_same_subtree'] = 1 - tmp['is_same_subtree'].astype(int)
# Per word: how many samples have their two changes in different subtrees.
two_change_only_groupped_by_word = tmp.groupby('original_word').sum(numeric_only=True)

q = 20  # Take the q% groups with the highest number of different subtree changes
count_thresthold = np.percentile(two_change_only_groupped_by_word['not_same_subtree'], 100-q)
# Compare the same quantity the threshold was computed from. The original
# compared the summed 'ChunkDistance' against this count threshold, which mixed
# two unrelated columns.
bias_prediction = two_change_only_groupped_by_word['not_same_subtree'] > count_thresthold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    output["TwoChunksChanged"] & \
    output['original_word'].isin(bias_word_predicted) & \
    (output['is_same_subtree'] == 0)
].head(2)



### Two-change-dissimilar filter

In [None]:
# Attach the similarity of the not-perturbed change to `output`, but only once:
# joining a column that is already present raises "columns overlap but no suffix
# specified" (e.g. when the metrics cell above has already joined it, or when
# this cell is re-run).
if 'similarity_not_perturbed' not in output.columns:
    output = output.join(analyse_df['similarity_not_perturbed'])

# The column may hold pd.NA (object dtype); coerce to float so that means,
# percentiles and comparisons treat missing values as NaN.
similarity_not_perturbed = pd.to_numeric(output['similarity_not_perturbed'], errors='coerce')
two_chunks_changed = output["TwoChunksChanged"]

# Mean similarity per original word over the two-chunk-change samples only
two_change_only_groupped_by_word = (
    similarity_not_perturbed[two_chunks_changed]
    .to_frame()
    .groupby(output.loc[two_chunks_changed, 'original_word'])
    .mean()
)


q = 20  # Take the q% sentences with the lowest similarity of the not-perturbed change
similiarity_threshold = np.nanpercentile(two_change_only_groupped_by_word['similarity_not_perturbed'], q)
bias_prediction = two_change_only_groupped_by_word['similarity_not_perturbed'] < similiarity_threshold


bias_word_predicted = two_change_only_groupped_by_word[bias_prediction].index.values

output[
    two_chunks_changed & \
    output['original_word'].isin(bias_word_predicted) & \
    (similarity_not_perturbed < similiarity_threshold)
].head(2)






In [None]:
# List every column currently available on the analysed frame
output.columns

## Find patterns

when a word A is replaced with B, then the change C happens

In [None]:
# Preview the first rows of the analysed frame
output.head()

In [None]:
# Preview the columns needed for pattern mining: the source sentence, the
# perturbation, both translations and the 'changes' column
output[['SRC_masked_index', 'SRC', 'original_word', 'perturbed_word', 'SRC_perturbed',
       'SRC-Trans', 'SRC_perturbed-Trans', 'changes']].head()

In [None]:
import string

def lower_remove_non_alphabet(input_str):
    """
    Replace every ASCII punctuation character (string.punctuation) in
    input_str with a single space and lower-case the result.

    Despite the name, digits and non-ASCII characters are left untouched, and
    punctuation is replaced by a space rather than deleted.

    Params:
        input_str: str to normalise
    Return: the normalised str
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    without_punctuation = input_str.translate(punctuation_to_space)
    return without_punctuation.lower()

In [None]:
def is_due_to_perturbation(change, original_word, perturbed_word, 
                           perturbed_trans_alignment_dict, original_trans_alignment_dict):
    """
    Decide whether a change in the translation is a direct consequence of the perturbation.

    A change is attributed to the perturbation if the (aligned) translation of perturbed_word
    is in the changed part OR the (aligned) translation of original_word is in the original part.
    A word that appears verbatim in the corresponding part (missed translation, proper name)
    also counts as a match.
    
    Params:
        change: tuple of (change_type, original_trans_part, changed_trans_part)
        original_word: original word in the SRC that was perturbed
        perturbed_word: the replacement of the original word
        perturbed_trans_alignment_dict: {src_word1:trans_word1, src_word2:trans_word2,...} of the perturbed trans
        original_trans_alignment_dict: {src_word1:trans_word1, src_word2:trans_word2,...} of the original trans
    Return:
        True  if at least one of the two words is found in its part of the change,
        False if both words have an alignment and neither is found,
        pd.NA otherwise (at least one word could not be aligned and nothing matched)
    """
    def _normalise_token(text):
        # Lower-case, turn punctuation into spaces, then drop the spaces. The
        # alignment keys/values AND the looked-up words go through the same
        # function; previously the words kept their spaces, so a word carrying
        # punctuation (e.g. "word.") could never be found among the keys.
        return lower_remove_non_alphabet(text).replace(' ', '')

    def _appears_in_part(word, alignment_dict, part_tokens):
        # True/False when `word` has an alignment (is its translation among the
        # tokens?), pd.NA when it has none; a verbatim occurrence always wins.
        word = _normalise_token(word)
        if not word:
            # Punctuation-only word: nothing meaningful to look up
            return pd.NA
        aligned = {_normalise_token(k): _normalise_token(v) for k, v in alignment_dict.items()}
        appears = pd.NA
        if word in aligned:
            appears = aligned[word] in part_tokens
        # Missed-translation, or name-specific case
        if word in part_tokens:
            appears = True
        return appears

    # Tokens of the two sides of the change, lower-cased and split on punctuation
    original_part_tokens = lower_remove_non_alphabet(change[1]).split()
    changed_part_tokens = lower_remove_non_alphabet(change[2]).split()

    perturbed_word_appears_in_new_trans = _appears_in_part(
        perturbed_word, perturbed_trans_alignment_dict, changed_part_tokens)
    original_word_appears_in_old_trans = _appears_in_part(
        original_word, original_trans_alignment_dict, original_part_tokens)

    # One positive match is enough to blame the perturbation
    if perturbed_word_appears_in_new_trans is True or original_word_appears_in_old_trans is True:
        return True
    # Without a match, a definite "no" needs both alignments to exist
    if pd.isnull(perturbed_word_appears_in_new_trans) or pd.isnull(original_word_appears_in_old_trans):
        return pd.NA
    return False
    
    
def filter_changes(group_df):
    """
    Collect the translation changes of a group of samples that are NOT
    explained by the perturbation itself.

    A change from row['changes'] is kept only if all of the following hold:
      * is_due_to_perturbation returns False for it (True and pd.NA are dropped),
      * it is not the insertion or deletion of '< unk >',
      * spaCy tags at least one token of its original or changed part as
        NOUN, VERB, ADJ or PRON.

    Params:
        group_df: DataFrame with the columns 'changes', 'original_word',
                  'perturbed_word', 'perturbed_trans_alignment' and
                  'original_trans_alignment'
    Return: list of the retained changes, in row order
    """
    content_pos_tags = {'NOUN', 'VERB', 'ADJ', 'PRON'}
    unk_only_changes = (('delete', '< unk >', ''), ('insert', '', '< unk >'))

    kept_changes = []
    for _, sample in group_df.iterrows():
        for change in sample['changes']:
            # Filter out the changes caused by perturbation
            caused_by_perturbation = is_due_to_perturbation(
                change,
                sample['original_word'],
                sample['perturbed_word'],
                sample['perturbed_trans_alignment'],
                sample['original_trans_alignment'],
            )
            if pd.isnull(caused_by_perturbation) or caused_by_perturbation:
                continue

            # Filter out the weird <unk>
            if change in unk_only_changes:
                continue

            # Filter out the changes that are not content-related
            pos_tags = {
                token.pos_
                for part in (change[1], change[2])
                for token in spacy_model(part)
            }
            if pos_tags.isdisjoint(content_pos_tags):
                continue

            kept_changes.append(change)

    return kept_changes



In [None]:
from collections import Counter


def find_max_freq_change(group_df):
    """
    Return how often the most frequent perturbation-unrelated change occurs
    within one (original_word, perturbed_word) group.

    Params: 
        group_df: the group of results that has the same original_word and perturbed_word
    Return: count of the most common change kept by filter_changes, 0 if none is kept
    """
    # This function is for a single group, so both words must be unique in it
    assert group_df['original_word'].nunique() == 1
    assert group_df['perturbed_word'].nunique() == 1

    # Only changes that are not directly due to the perturbation are counted
    change_counts = Counter(filter_changes(group_df))
    return max(change_counts.values(), default=0)

# For every (original_word, perturbed_word) pair compute the frequency of its
# most common perturbation-unrelated change and sort the pairs by it.
# NOTE(review): the rename relies on apply() naming the result column None when
# as_index=False is used -- this looks pandas-version dependent; confirm.
change_freq = output.groupby(
    ['original_word', 'perturbed_word'], as_index=False
).apply(find_max_freq_change).rename(columns={None: 'max_change_freq'}).sort_values(
    by='max_change_freq', ascending=False)
    

# Keep only the pairs whose replacement word is purely alphabetic
change_freq = change_freq[change_freq['perturbed_word'].apply(lambda x: x.isalpha())]

change_freq.head(10)

In [None]:
# Materialise the (original_word, perturbed_word) groups so they can be picked
# by position, then visit them in the order given by change_freq (its index
# values are used as positions into the group list).
groups = output.groupby(['original_word', 'perturbed_word'])
groups_as_list = list(groups)
re_ordered_groupes = [groups_as_list[position] for position in change_freq.index.values]

for original_perturb, group in re_ordered_groupes:
    print("----------------------")
    print(f"original SRC word: {original_perturb[0]}")
    print(f"perturbed SRC word: {original_perturb[1]}")
    # Two most frequent changes that are not explained by the perturbation
    all_changes = filter_changes(group)
    freq_changes = Counter(all_changes)
    top_two_changes = freq_changes.most_common(2)
    print(top_two_changes)

# Comments

--> starts to make sense, yet have not seen bias (even gender bias)

--> A bigger dataset for inference could help?

Some correlation is good, some correlation is bad. Is it a good idea to prevent these correlation??

# Filter per sentence with different replacements


**Note**: can use [sequence alignments](https://stackoverflow.com/questions/5055839/word-level-edit-distance-of-a-sentence) to align the sentences on the target side only. ([code](https://gist.github.com/slowkow/06c6dba9180d013dfd82bec217d22eb5))

Pros: could be easier than SRC-TGT alignment

Cons: fails in the case where the outputs have a different sentence structure yet the same meaning. <br>
E.g., "Today I think the cat is nice" -- "I think the cat is nice today"
SRC-TGT alignment would probably see these as the same, but edit distance cannot, because it only has delete, insert and substitute operations.

In [None]:
# analysis = output.groupby('SRC_masked_index').apply(lambda x: analyse_single_sentence_perturbed_word(x))  #.rename(columns={None: 'influenced_words'})


In [None]:
# output.to_pickle('tmp_storages/analyse_winoMT.pkl')
# output = pd.read_pickle('tmp_storages/analyse_winoMT.pkl')

In [None]:
# Some missing info samples from mustSHE
# output[output['CATEGORY']=='1F'][['SRC', 'SRC_original_idx']].drop_duplicates() #.head()

In [None]:
import pprint


# Show complete cell contents and (practically) all rows for this inspection
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 9999999)

# Index of the SRC sentence to inspect
sentence_idx = 0
print(f"Original SRC sentence: \n {output[['SRC', 'SRC_original_idx']].drop_duplicates().set_index('SRC_original_idx').loc[sentence_idx]}")
print()

# Analyse all perturbations of that sentence, aligning the translations only
sentence_rows = output[output['SRC_original_idx'] == sentence_idx]
sentence_analysis = analyse_single_sentence(
    sentence_rows, align_type="trans-only", return_tgt_word_index=True
)
pprint.pprint(sentence_analysis)


In [None]:
# SRC word of the selected sentence whose replacements are inspected
original_word = 'general'


# All perturbations of sentence `sentence_idx` that replace `original_word`
sentence_df = output[output['SRC_original_idx'] == sentence_idx]
sentence_single_perturbed_word_df = sentence_df[sentence_df['original_word'] == original_word]


# Use the helper that is actually imported at the top of the notebook; the
# previous name `analyse_single_sentence_perturbed_word` is not defined anywhere
# here and raised NameError.
pprint.pprint(analyse_single_sentence_single_perturbed_word(sentence_single_perturbed_word_df, align_type="trans-only"))
align_translations(sentence_single_perturbed_word_df, align_type="trans-only")


In [None]:
# Statistical test to see if SRC_similarity is higher than Trans_similarity
# Paired, one-sided t-test: is the SRC edit distance lower than the edit
# distance between the corresponding translations?
src_edit_distance = output["SRC-edit_distance"]
trans_edit_distance = output["Trans-edit_distance"]

# Mean difference (translation minus source) for reference
print(trans_edit_distance.mean() - src_edit_distance.mean())
stats.ttest_rel(src_edit_distance, trans_edit_distance, alternative='less')

Tiny p-value --> SRC-edit_distance is indeed significantly lower than Trans-edit_distance


(Careful with this though: with a very large number of samples almost any difference becomes statistically significant, so the test says little about how large the effect is.)

In [None]:
# Distribution of '#TransChanges-#SrcChanges' over all samples; the result is
# assigned to `_` only to keep the returned arrays out of the cell output
_ = plt.hist(output["#TransChanges-#SrcChanges"], bins=50)

In [None]:
# Summary statistics and box plot of the 'ChangesSpread/SentenceLength' column
print(output["ChangesSpread/SentenceLength"].describe())
output["ChangesSpread/SentenceLength"].plot.box()

Some changes seem to have the same meaning but different phrasing, e.g., noun index 24, 36, 47

Both for en-de and en-vi


Kind of bias: en-vi adjective sample 82

Should we cherry-pick examples? Or cherry-pick the replacement?


Or narrow down scope of perturbation? (e.g., on countries, jobs, gender, ...?)



Some cherry-picked examples anyway:

- He comes from England --> Ông ấy đến từ Anh
- He comes from Vietnam --> Hắn đến từ Việt Nam
- He comes from North Korea --> Hắn đến từ Bắc Triều Tiên



- He is european --> Hắn là người Châu Âu
- He is asian --> Anh ấy là người châu Á.



- He has black hair --> Hắn có tóc đen.
- He has blonde hair --> Anh ấy có tóc vàng


But if we limit this then it would hurt the model's overall performance as well? 

*Jan: some kind of loss to minimize the number of changes, without completely forbidding the changes*


# Translation quality vs #changes

In [None]:
from nltk.translate.gleu_score import sentence_gleu

def _gleu_of_original_translation(row):
    """Sentence-level GLEU of the tokenized original translation against the tokenized reference."""
    return sentence_gleu([row['tokenized_REF']], row['tokenized_SRC-Trans'])


# Quality of the translation of the unperturbed SRC sentence, one score per row
output["OriginalTran_Quality"] = output.apply(_gleu_of_original_translation, axis=1)


In [None]:
# One point per sample: original translation quality vs. length-normalised extra changes
output.plot.scatter(x='OriginalTran_Quality', y="#TransChanges-#SrcChanges/SentenceLength")

In [None]:
# Correlation matrix between translation quality and length-normalised extra changes
np.corrcoef(output['OriginalTran_Quality'], output["#TransChanges-#SrcChanges/SentenceLength"])

In [None]:
# Histogram of the quality scores; plt.hist returns (counts, bin edges, patches)
hist = plt.hist(output["OriginalTran_Quality"], bins='sturges')
# Bin edges, reused by the next cells to average the changes per quality bin
bin_boundaries = hist[1]

In [None]:
# # Use bins with same number of samples instead of equal-sized bins

# results, bin_boundaries = pd.qcut(output["OriginalTran_Quality"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples: keep the leading bins up to (excluding) the
# first one holding fewer than 5 samples.
bin_counts, bin_edges = hist[0], hist[1]
cut_point = next(
    (position for position, count in enumerate(bin_counts) if count < 5),
    len(bin_counts),  # no sparse bin: keep every bin
)

# n bins are delimited by n + 1 edges, so keeping bins 0..cut_point-1 requires
# edges 0..cut_point. The previous slice [:cut_point] dropped the last dense bin.
# Slicing hist[1] (not bin_boundaries) keeps the cell idempotent when re-run.
bin_boundaries = bin_edges[:cut_point + 1]



In [None]:
X = output['OriginalTran_Quality']
Y = output["#TransChanges-#SrcChanges/SentenceLength"]

# 10%-trimmed mean of Y inside every X bin, plotted at the bin centre.
# Bins are half-open [lower, upper) as in np.histogram; only a bin ending at the
# overall maximum is closed on the right. The previous strict comparison on both
# sides dropped every sample lying exactly on an edge, in particular all samples
# with the minimum quality.
bin_edges = np.asarray(bin_boundaries)
x_plot, y_plot = [], []
for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
    in_bin = (X >= lower) & ((X < upper) | ((X == upper) & (upper >= X.max())))
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('OriginalTrans_Quality')
plt.ylabel('Avg_changes')

Most of the time downward trend (not as clear for en-de with verb, adverb, pronoun; en-vi adverb, pronoun)

**Note**: the plot has outliers removed in both X and Y dimensions, by removing too small bins (X) and trimmed-mean (Y)

# #changes vs translation quality

In [None]:
# Histogram of '#TransChanges-#SrcChanges'; plt.hist returns (counts, bin edges, patches)
hist = plt.hist(output["#TransChanges-#SrcChanges"], bins=20)
# Bin edges, reused by the next cells to average the quality per bin
bin_boundaries = hist[1]
hist

In [None]:
# # Use bins with same number of samples instead of equal-sized bins
# results, bin_boundaries = pd.qcut(output["#TransChanges-#SrcChanges"], q=5, retbins=True)
# bin_boundaries


# Remove bins with too few samples: keep the leading bins up to (excluding) the
# first one holding fewer than 10 samples.
bin_counts, bin_edges = hist[0], hist[1]
cut_point = next(
    (position for position, count in enumerate(bin_counts) if count < 10),
    len(bin_counts),  # no sparse bin: keep every bin
)

# The previous slice used the loop index `i` instead of cut_point, so when no
# bin was sparse it still cut off the final bins. n bins need n + 1 edges, hence
# the + 1. Slicing hist[1] keeps the cell idempotent when re-run.
bin_boundaries = bin_edges[:cut_point + 1]

In [None]:
X = output['#TransChanges-#SrcChanges']
Y = output["OriginalTran_Quality"]

# 10%-trimmed mean of Y inside every X bin, plotted at the bin centre.
# Bins are half-open [lower, upper) as in np.histogram; only a bin ending at the
# overall maximum is closed on the right. The previous `<=` on both sides counted
# every sample lying exactly on an edge in two neighbouring bins.
bin_edges = np.asarray(bin_boundaries)
x_plot, y_plot = [], []
for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
    in_bin = (X >= lower) & ((X < upper) | ((X == upper) & (upper >= X.max())))
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('Avg_changes')
plt.ylabel('OriginalTran_Quality')

# SentenceLength vs #changes

In [None]:
# Number of tokens in every tokenized SRC sentence
output['SRC-length'] = output['tokenized_SRC'].map(len)

In [None]:
# One point per sample: SRC length (tokens) vs. extra translation changes
output.plot.scatter(x='SRC-length', y="#TransChanges-#SrcChanges")

In [None]:
# Correlation matrix between SRC length and extra translation changes
np.corrcoef(output['SRC-length'], output["#TransChanges-#SrcChanges"])

In [None]:
# Histogram of the SRC lengths; plt.hist returns (counts, bin edges, patches)
hist = plt.hist(output["SRC-length"], bins=20)
# Bin edges, reused by the next cells to average the changes per length bin
bin_boundaries = hist[1]

In [None]:
# Remove bins with too few samples: keep the leading bins up to (excluding) the
# first one holding fewer than 10 samples.
bin_counts, bin_edges = hist[0], hist[1]
cut_point = next(
    (position for position, count in enumerate(bin_counts) if count < 10),
    len(bin_counts),  # no sparse bin: keep every bin
)

# The previous slice used the loop index `i` instead of cut_point, so when no
# bin was sparse it still cut off the final bins. n bins need n + 1 edges, hence
# the + 1. Slicing hist[1] keeps the cell idempotent when re-run.
bin_boundaries = bin_edges[:cut_point + 1]

In [None]:
X = output['SRC-length']
Y = output["#TransChanges-#SrcChanges"]

# 10%-trimmed mean of Y inside every X bin, plotted at the bin centre.
# Bins are half-open [lower, upper) as in np.histogram; only a bin ending at the
# overall maximum is closed on the right. The previous strict comparison on both
# sides dropped every sentence whose (integer) length fell exactly on an edge,
# including all the shortest sentences.
bin_edges = np.asarray(bin_boundaries)
x_plot, y_plot = [], []
for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
    in_bin = (X >= lower) & ((X < upper) | ((X == upper) & (upper >= X.max())))
    x_plot.append((lower + upper) / 2)
    y_plot.append(stats.trim_mean(Y[in_bin], 0.1))

plt.plot(x_plot, y_plot)
plt.xlabel('SRC-length')
plt.ylabel('Avg_changes')

# Beam_size vs #changes

In [None]:
# Load one result frame per beam size.
beam_dict = {}
beam_values = [1,2,3,4,5]
# The loop variable is deliberately not called `beam`, so the `beam` setting of
# the configuration cell is not overwritten for the cells that follow.
for beam_size in beam_values:
    # NOTE(review): this positional call differs from the keyword-only call in
    # the configuration cell, and `perturb_type` is only defined when winoMT is
    # True -- confirm against the current read_output_df signature.
    beam_dict[beam_size] = read_output_df(dataset, perturb_type, beam_size, replacement_strategy)
    # Make sure the df all have the same index. The previous assert compared a
    # frame's index with itself and therefore could never fail.
    assert beam_dict[beam_size].index.equals(beam_dict[beam_values[0]].index)


In [None]:
# 10%-trimmed mean of the extra translation changes for every beam size
plt.plot(beam_values,
              [stats.trim_mean(beam_dict[x]['#TransChanges-#SrcChanges'], 0.1) for x in beam_values])
plt.xlabel('beam')
plt.ylabel('mean_changes')

The mean might not say anything

In [None]:
# One box per beam size showing the distribution of the extra translation changes
fig, ax = plt.subplots()
ax.boxplot([beam_dict[x]['#TransChanges-#SrcChanges'] for x in beam_values])
ax.set_xticklabels(beam_values)
ax.set_xlabel('beam')
ax.set_ylabel('#changes')

# Perturbed word type vs #changes

In [None]:
# Load one result frame per perturbed word type.
# NOTE(review): the keyword set used here (perturb_type=...) differs from the
# read_output_df call in the configuration cell -- confirm it matches the
# current signature. `beam` is whatever value is bound when this cell runs.
word_type_dict = {}
word_type_values = ["noun", "verb", "adjective", "adverb", "pronoun"]
for word_type in word_type_values:
    word_type_dict[word_type] = read_output_df(dataset, perturb_type=word_type, beam=beam, replacement_strategy=replacement_strategy)

    
print('--------------------------------')
print('word type    -   trimmed-mean #changes')

# 10%-trimmed mean of the extra translation changes for every word type
for word_type in word_type_values:
    print(f"{word_type} - {stats.trim_mean(word_type_dict[word_type]['#TransChanges-#SrcChanges'], 0.1)}")


In [None]:
# One box per word type showing the distribution of the extra translation changes
fig, ax = plt.subplots()
ax.boxplot([word_type_dict[x]['#TransChanges-#SrcChanges'] for x in word_type_values])
ax.set_xticklabels(word_type_values)
ax.set_xlabel('word_type')
ax.set_ylabel('#changes')

# #Changes per sentence across word types

See if the chaotic changes are sentence-specific. Perturbed pronouns are excluded because not many samples contain a pronoun.

In [None]:
# Find sentences that has multiple word types perturbed
# Sentences (index labels) that were perturbed for every one of these word types
word_type_values = ["noun", "verb", "adjective", "adverb"]
index_intersection = word_type_dict[word_type_values[0]].index
for other_word_type in word_type_values[1:]:
    other_index = word_type_dict[other_word_type].index
    index_intersection = index_intersection.intersection(other_index)

len(index_intersection)

In [None]:
# One column per word type holding '#TransChanges-#SrcChanges' for the
# sentences shared by all word types
changes_per_word_type = pd.DataFrame({
    word_type: word_type_dict[word_type]["#TransChanges-#SrcChanges"].loc[index_intersection]
    for word_type in word_type_values
})

# Count the number of samples where the changes in trans always bigger than changes in SRC
positive_for_every_type = (
    changes_per_word_type[['noun', 'verb', 'adjective', 'adverb']] > 0
).all(axis=1)
changes_per_word_type[positive_for_every_type]



Small portion of rows --> not sentence-specific

In [None]:
import spacy 
from spacy import displacy 
# Demo: print the dependency relations spaCy finds in a short English sentence
nlp = spacy.load("en_core_web_sm")
sentence = "He is from Vietnam"
doc = nlp(sentence)

# Table header
print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

# One row per token: its head, the dependency label, and the token itself
for token in doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))

In [None]:
# For every token of the parsed sentence, list its ancestors and direct children
for token in doc:
    print("------------------------------------------------")
    print(f"Token: {token.text}")
    print(f"Ancestors: {list(token.ancestors)}")
    print(f"Children: {list(token.children)}")

In [None]:
import spacy 
from spacy import displacy 
# Demo: print the dependency relations spaCy finds in a short German sentence
nlp = spacy.load("de_core_news_sm")
sentence = "Er kommt aus Vietnam"
doc = nlp(sentence)

# Table header
print(f"{'Node (from)-->':<15} {'Relation':^10} {'-->Node (to)':>15}\n")

# One row per token: its head, the dependency label, and the token itself
for token in doc:
    print("{:<15} {:^10} {:>15}".format(str(token.head.text), str(token.dep_), str(token.text)))