In [None]:
import pandas as pd
import numpy as np
import nltk
import os
import tarfile
import sys
from sacremoses import MosesTokenizer, MosesDetokenizer

# Notebook-wide display configuration: show full cell text and (practically)
# every row when a DataFrame is rendered.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 9999999)



# Print NumPy arrays in full instead of summarising them with "...".
np.set_printoptions(threshold=sys.maxsize)
# Seed NumPy's global random generator.
np.random.seed(0)


# Use EN samples from covost2 data 

(all data except for the language pairs that include DE, just in case)

In [None]:
src_lang = 'en'

data_dir = "data/covost2/EN-translations"
archive_suffix = ".tar.gz"

# Sum of the per-file unique sentence counts (before cross-file deduplication)
deduplicated_each_dataset = 0
# One single-column ('SRC') frame per archive; concatenated once after the loop
# instead of growing a DataFrame inside the loop (which is quadratic).
per_file_frames = []

# os.listdir returns entries in arbitrary order; sorting keeps the final row
# order (and which copy of a duplicate survives) stable across machines/runs.
for filename in sorted(os.listdir(data_dir)):
    # Skip non-archives and every archive whose name contains 'de'
    # (plain substring test on the file name).
    if not filename.endswith(archive_suffix) or 'de' in filename:
        continue

    unzipped_file_name = filename[:-len(archive_suffix)]
    unzipped_path = os.path.join(data_dir, unzipped_file_name)

    # Extract the file if not yet done so. The context manager closes the
    # archive even when extraction raises.
    if not os.path.exists(unzipped_path):
        with tarfile.open(os.path.join(data_dir, filename)) as tar:
            tar.extractall(data_dir)

    # NOTE(review): read_csv is used with its default quoting and NA handling.
    # If the TSV contains unbalanced '"' characters, fields can get merged into
    # very long "sentences" -- consider quoting=csv.QUOTE_NONE / na_filter=False
    # after checking against the data.
    tmp_df = pd.read_csv(unzipped_path, sep='\t')[['translation']].rename(
        columns={'translation': 'SRC'}
    )

    deduplicated_each_dataset += tmp_df.drop_duplicates(subset='SRC').shape[0]
    per_file_frames.append(tmp_df)

if per_file_frames:
    original_data = pd.concat(per_file_frames, axis=0, ignore_index=True)
else:
    # No matching archive found: keep the expected (empty) schema
    original_data = pd.DataFrame(columns=['SRC'])

print(f"all: {original_data.shape}")
print(f"deduplicated_each_dataset: {deduplicated_each_dataset}")

# Deduplicate across all files once, then report the resulting shape
original_data = original_data.drop_duplicates(subset='SRC')
print(f"deduplicated all: {original_data.shape}")


Filter out the erroneously long sentences (keep only those shorter than the 99th-percentile length)

In [None]:
# Character length of every sentence. `.str.len()` yields NaN for missing
# values, whereas calling len() on a NaN cell raises TypeError.
sentence_lengths = original_data['SRC'].str.len()

length_stats = sentence_lengths.describe(percentiles=[.25, .5, .75, .99])

# Keep sentences strictly shorter than the 99th-percentile length.
# Comparisons with NaN are False, so rows with a missing sentence are dropped.
original_data = original_data[sentence_lengths < length_stats['99%']]

original_data.shape

Remove empty sentences

In [None]:
# Boolean mask: True where the source sentence is not the empty string
is_non_empty = original_data['SRC'].ne("")
original_data = original_data.loc[is_non_empty]
original_data.shape

Remove the beginning and end quotes for consistency

In [None]:
def prepare_sentence(sentence):
    """
    Remove one pair of quotes that wraps the whole sentence.

    Handles straight double quotes ("...") and curly double quotes (“...”).
    The quotes are only removed when they form a single wrapping pair:
    if the same quote characters also occur inside the sentence
    (e.g. '"Yes," he said, "no."'), the outer quotes belong to separate
    quotations and stripping them would leave unbalanced quotes, so the
    sentence is returned unchanged.

    sentence: the sentence string to clean
    returns: the sentence without its wrapping quotes, or the sentence as-is
    """
    # A wrapping pair needs at least an opening and a closing character;
    # this also keeps a lone quote character from becoming an empty string.
    if len(sentence) < 2:
        return sentence

    inner = sentence[1:-1]
    for opening, closing in (('"', '"'), ('“', '”')):
        if sentence.startswith(opening) and sentence.endswith(closing):
            if opening in inner or closing in inner:
                return sentence
            return inner
    return sentence


# Strip the wrapping quotes from every source sentence
original_data['SRC'] = original_data['SRC'].apply(prepare_sentence)

Reindex after filtering the data

In [None]:
# Replace the gappy index left by the filters with a fresh 0..n-1 range
original_data = original_data.reset_index(drop=True)

In [None]:
# Preview the first few cleaned source sentences
original_data['SRC'].head()

# Use EN samples from winoMT data 

In [None]:
# Alternative data source: running this cell replaces `original_data`.
# Note that `data_name` is not set here.
src_lang = 'en'

# The first CSV column is used as the index.
# presumably the file provides an 'SRC' column, which the masking cell reads -- TODO confirm
original_data = pd.read_csv('data/winoMT_src.csv', index_col=0)
original_data.head()

# Use EN samples from Must-SHE data 

In [None]:
# Alternative data source: running this cell replaces `original_data`.
src_lang = 'en'

must_she_path = "data/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv/MONOLINGUAL.fr_v1.2.tsv"
# Tab-separated file; the first column becomes the index
must_she_table = pd.read_csv(must_she_path, sep='\t', index_col=0)
# Only the source sentence and its category are kept
original_data = must_she_table[['SRC', 'CATEGORY']]
print(original_data.shape)
original_data.head()

# Use EN samples from WMT21 DA test data

In [None]:
# Alternative data source: running this cell replaces `original_data`.
src_lang = 'en'
data_name = 'WMT21_DA'

# One source sentence per line. The encoding is given explicitly so decoding
# does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data.
with open("data/wmt-qe-2021-data/en-de-test21/test21.src", encoding="utf-8") as f:
    # Drop the trailing newline (and any other trailing whitespace) of each line
    en_sentences = [line.rstrip() for line in f]

original_data = pd.DataFrame(data={'SRC': en_sentences})
original_data.head()

# Use EN samples from WMT22 MQM test data

Same as WMT22 word-level test data

In [None]:
# Alternative data source: running this cell replaces `original_data`.
src_lang = 'en'
data_name = 'WMT22_MQM'

# One source sentence per line. The encoding is given explicitly so decoding
# does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data.
with open(
    "data/wmt-qe-2022-data/test_data-gold_labels/task1_mqm/en-de/test.2022.src",
    encoding="utf-8",
) as f:
    # Drop the trailing newline (and any other trailing whitespace) of each line
    en_sentences = [line.rstrip() for line in f]

original_data = pd.DataFrame(data={'SRC': en_sentences})
original_data.head()

# Create input data where we mask a set of words

In [None]:
import string


def is_valid_word(word):
    """
    Tell whether a token is a real word rather than pure punctuation.

    Returns True when at least one character of `word` is neither an ASCII
    punctuation character nor an em dash; returns False for tokens made up
    only of such characters, and for the empty string.
    """
    punctuation_chars = string.punctuation + '—'
    return any(char not in punctuation_chars for char in word)

def is_content_tag(nltk_pos):
    """
    Tell whether a POS tag string denotes a content word.

    A tag counts as a content tag when it starts with one of the prefixes
    for nouns (NN), verbs (V), adjectives (JJ), adverbs (RB) or pronouns (PRP).
    """
    content_tag_prefixes = ('NN', 'V', 'JJ', 'RB', 'PRP')
    # str.startswith accepts a tuple and checks every prefix in it
    return nltk_pos.startswith(content_tag_prefixes)





In [None]:
import re
import copy


def mask_sentence(sentence, tokenized_sentence, masked_word_tokenized_index):
    """
    Replace one token of `sentence` with '[MASK]', keeping the rest of the
    original string (casing, spacing, punctuation) untouched.

    sentence: the original sentence without preprocessing
    tokenized_sentence: the tokens of `sentence`, in order; matching against
        the sentence is case-insensitive, so lowercased tokens work as well
    masked_word_tokenized_index: position in `tokenized_sentence` of the token
        to be masked

    returns: `sentence` with the characters of the selected token replaced
        by '[MASK]'

    raises: AssertionError when the selected token cannot be located in the
        sentence (including an index outside `tokenized_sentence`)
    """
    # Case-insensitive matching: search lowercased tokens in the lowercased
    # sentence. NOTE(review): assumes lowercasing does not change the string
    # length, otherwise the offsets would not line up with `sentence`.
    lowercased_sentence = sentence.lower()

    masked_word = None
    masked_word_location = None
    # Where the search for the next token starts; advancing it past each
    # token makes repeated words resolve to the right occurrence.
    search_from = 0
    for token_index, token in enumerate(tokenized_sentence):
        token = token.lower()
        token_location = lowercased_sentence.find(token, search_from)
        if token_index == masked_word_tokenized_index:
            masked_word = token
            masked_word_location = token_location
            break
        # Tokens that cannot be found are skipped without moving the cursor
        if token_location != -1:
            search_from = token_location + len(token)

    assert masked_word_location is not None and masked_word_location != -1, \
        "the word to mask could not be located in the sentence"

    return (
        sentence[:masked_word_location]
        + '[MASK]'
        + sentence[masked_word_location + len(masked_word):]
    )


In [None]:
import re
import copy

# NOTE: `src_lang` and `original_data` come from whichever data-loading cell
# was run last.
tokenizer = MosesTokenizer(lang=src_lang)
detokenizer = MosesDetokenizer(lang=src_lang)

masked_columns = ['SRC_original_idx', 'SRC', 'SRC_masked', 'original_word', 'original_word_tag']

# One record per (sentence, masked content word). The records are collected in
# a plain list and turned into a DataFrame once, instead of concatenating a
# small DataFrame per sentence (which copies the whole result every time).
masked_rows = []

for src_idx, src_sentence in original_data['SRC'].items():
    tokenized_src = tokenizer.tokenize(src_sentence, escape=False, aggressive_dash_splits=False)
    pos_tags = nltk.pos_tag(tokenized_src)
    for i, (word, pos_tag) in enumerate(pos_tags):
        # Only mask the valid content word
        if not (is_valid_word(word) and is_content_tag(pos_tag)):
            continue
        # Tokens are immutable strings, so a shallow copy is enough
        masked_tokenized = list(tokenized_src)
        masked_tokenized[i] = '[MASK]'
        masked_rows.append({
            'SRC_original_idx': src_idx,
            'SRC': src_sentence,
            'SRC_masked': detokenizer.detokenize(masked_tokenized),
            'original_word': word,
            'original_word_tag': pos_tag,
        })

# Passing `columns` fixes the column order and keeps the schema even when no
# word was masked at all.
masked_data = pd.DataFrame(masked_rows, columns=masked_columns)
    
    
    

In [None]:
# Remember the size before dropping: a bare `masked_data.shape` that is not the
# last line of the cell is never displayed, so it is printed explicitly.
shape_before_dropna = masked_data.shape
masked_data = masked_data.dropna()
print(f"before dropna: {shape_before_dropna}")
masked_data.shape

In [None]:
masked_data.head(5)

In [None]:
# masked_data.to_csv('data/masked_content_covost2_for_en2de_no_sentence_group.csv')

In [None]:
# masked_data.to_csv('data/masked_content_winoMT.csv')

In [None]:
# masked_data.to_csv(f'data/masked_content_{data_name}.csv')