In [1]:
import pandas as pd
import numpy as np
import nltk
import os
import tarfile
import sys



np.set_printoptions(threshold=sys.maxsize)
np.random.seed(0)


# Use EN samples from covost2 data 

(all data except for the pairs that include DE, just in case)

In [None]:
# Read the 'translation' column of every .tar.gz archive in data_dir whose
# file name does not contain 'de', and stack the results in a single 'SRC' column.
data_dir = "data/covost2/EN-translations"

# Sum of the per-file unique sentence counts (compared below with the global count).
deduplicated_each_dataset = 0

# Per-file frames are collected here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration. The leading empty frame keeps the 'SRC' column in place
# even when no archive matches.
loaded_frames = [pd.DataFrame(columns=['SRC'])]

for filename in os.listdir(data_dir):
    if filename.endswith(".tar.gz") and ('de' not in filename):
        unzipped_file_name = filename.replace(".tar.gz", "")
        unzipped_path = os.path.join(data_dir, unzipped_file_name)

        # Extract the file if not yet done so
        if not os.path.exists(unzipped_path):
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(os.path.join(data_dir, filename)) as tar:
                tar.extractall(data_dir)

        tmp_df = pd.DataFrame()
        tmp_df['SRC'] = pd.read_csv(unzipped_path, sep='\t')['translation']

        deduplicated_each_dataset = deduplicated_each_dataset + tmp_df.drop_duplicates(subset='SRC').shape[0]

        loaded_frames.append(tmp_df)

original_data = pd.concat(loaded_frames, axis=0, ignore_index=True)

print(f"all: {original_data.shape}")
print(f"deduplicated_each_dataset: {deduplicated_each_dataset}")
print(f"deduplicated all: {original_data.drop_duplicates(subset='SRC').shape}")

# Keep only the first occurrence of every sentence across all files.
original_data = original_data.drop_duplicates(subset='SRC')


Filter out the erroneously long sentences

In [None]:
# Character length of every sentence.
sentence_lengths = original_data['SRC'].map(len)

# Summary statistics; the '99%' entry is used as the cut-off below.
length_stats = sentence_lengths.describe(percentiles=[0.25, 0.5, 0.75, 0.99])

# Keep only the sentences strictly shorter than the 99th-percentile length.
original_data = original_data.loc[sentence_lengths.lt(length_stats['99%'])]

original_data.shape

Remove empty sentences

In [None]:
# Drop the rows whose sentence is exactly the empty string.
original_data = original_data.loc[original_data['SRC'].ne("")]
original_data.shape

Remove the beginning and end quotes for consistency

In [None]:
def prepare_sentence(sentence):
    """
    Strip one pair of enclosing quotes from a sentence.

    If the sentence starts and ends with a straight double quote ("), or
    starts with “ and ends with ”, its first and last characters are removed.
    Any other sentence is returned unchanged.
    """
    quote_pairs = (('"', '"'), ('“', '”'))
    for opening, closing in quote_pairs:
        if sentence.startswith(opening) and sentence.endswith(closing):
            return sentence[1:-1]
    return sentence


# Strip the enclosing quotes from every sentence.
original_data['SRC'] = original_data['SRC'].map(prepare_sentence)

Reindex after filtering the data

In [None]:
# Replace the index with a fresh default one: drop=True discards the old index
# instead of keeping it as a column, and inplace=True modifies original_data itself.
original_data.reset_index(drop=True, inplace=True)

In [None]:
original_data['SRC'].head()

# Use EN samples from winoMT data 

In [None]:
original_data = pd.read_csv('data/winoMT_src.csv', index_col=0)
original_data.head()

# Use EN samples from MuST-SHE data

In [70]:
# Read the MuST-SHE monolingual TSV (first column as index) and keep only the
# 'SRC' and 'CATEGORY' columns.
must_she_path = "data/MuST-SHE_v1.2/MuST-SHE-v1.2-data/tsv/MONOLINGUAL.fr_v1.2.tsv"
must_she_table = pd.read_csv(must_she_path, sep='\t', index_col=0)
original_data = must_she_table[['SRC', 'CATEGORY']]
print(original_data.shape)
original_data.head()

(1108, 2)


Unnamed: 0_level_0,SRC,CATEGORY
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
fr-0001,"Now, I thought, ""How could I really capture this?",1M
fr-0002,"I mean, from this entry, it would seem that I was born into a world that perceived someone like me to have nothing positive whatsoever going for them, when in fact, today I'm celebrated for the opportunities and adventures my life has procured.",1F
fr-0003,"So, I immediately went to look up the 2009 online edition, expecting to find a revision worth noting.",1F
fr-0004,"His name was Dr. Pizzutillo, an Italian American, whose name, apparently, was too difficult for most Americans to pronounce, so he went by Dr. P. And Dr. P always wore really colorful bow ties and had the very perfect disposition to work with children.",2M
fr-0005,"And, one day, he came in to my session — exhaustive and unforgiving, these sessions — and he said to me, ""Wow.",2M


### Perform stemming on the data

This would help reduce the vocab size, making it easier to choose the word to perturb later on.

ABORTED: it only reduces the vocab from 88066 to 71362, which is not much, so it is not worth it. Also, stemming makes the words invalid, so POS tagging cannot be used afterwards to filter them out.

Lemmatization would require defining POS --> not preferable, since we would want chinese and china to be the same

In [71]:
# def stem_sentence(stemmer, sentence):
#     """ 
#     Return the stemmed sentence and 
#     a dictionary mapping the stem to the original word in the sentence
#     """
#     tokenized_sentence = nltk.word_tokenize(sentence)
#     stem_word_dict = {}
#     stemmed_tokenized_sentence = []
    
#     for word in tokenized_sentence:
#         stem = stemmer.stem(word)
#         stemmed_tokenized_sentence.append(stem)
#         stem_word_dict[stem] = word
        
#     return ' '.join(stemmed_tokenized_sentence), stem_word_dict
    

In [72]:
# from nltk.stem.snowball import SnowballStemmer

# stemmer = SnowballStemmer(language='english')

# original_data['StemSRC'], original_data['StemDict'] = \
#     zip(*original_data.apply(lambda x: stem_sentence(stemmer, x['SRC']), axis=1))




### Investigate the frequencies of words across sentences

Count the number of occurrences of each word in each sentence. Here we **only use the sentences where the word occurs exactly once**, which makes it convenient to analyse the influence of the word on the sentence.

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import find, csr_matrix

corpus = original_data['SRC'].values
# token_pattern=None: the default regex is not used once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn warn about the ignored parameter.
vectorizer = CountVectorizer(tokenizer=nltk.word_tokenize, lowercase=True, token_pattern=None)
count_fit = vectorizer.fit_transform(corpus)

# Only consider the single occurrence of a word in a sentence
count_fit[count_fit > 1] = 0
# The assignment above keeps the zeroed entries as explicitly stored values;
# prune them so the sparse matrix only stores the remaining counts.
count_fit.eliminate_zeros()

count_fit.shape

(1108, 4596)

In [74]:
import spacy
import time 


# One row per vocabulary entry: the token and the column sum of count_fit for it.
per_word_totals = np.asarray(count_fit.sum(axis=0)).flatten()
word_df = pd.DataFrame({
    'word': vectorizer.get_feature_names_out(),
    'freq': per_word_totals,
})


spacy_model = spacy.load("en_core_web_sm")

def spacy_pos_tag(word, spacy_model):
    """
    Run `spacy_model` on `word` and return the `pos_` attribute of the first
    token it produces.
    """
    tags = [token.pos_ for token in spacy_model(word)]
    return tags[0]

def nltk_pos_tag(word):
    """Tag `word` on its own with `nltk.pos_tag` and return the resulting tag."""
    tagged = nltk.pos_tag([word])
    first_pair = tagged[0]
    return first_pair[1]

def get_entity_name(word, spacy_model):
    """
    Return the label of the first entity that `spacy_model` finds in `word`,
    or None when it finds no entity.

    Labels and their descriptions can be listed with:
    ```
    nlp = spacy.load("en_core_web_sm")
    labels = nlp.get_pipe('ner').labels
    for label in labels:
        print(f'{label}: {spacy.explain(label)}')
    ```
        CARDINAL: Numerals that do not fall under another type
        DATE: Absolute or relative dates or periods
        EVENT: Named hurricanes, battles, wars, sports events, etc.
        FAC: Buildings, airports, highways, bridges, etc.
        GPE: Countries, cities, states
        LANGUAGE: Any named language
        LAW: Named documents made into laws.
        LOC: Non-GPE locations, mountain ranges, bodies of water
        MONEY: Monetary values, including unit
        NORP: Nationalities or religious or political groups
        ORDINAL: "first", "second", etc.
        ORG: Companies, agencies, institutions, etc.
        PERCENT: Percentage, including "%"
        PERSON: People, including fictional
        PRODUCT: Objects, vehicles, foods, etc. (not services)
        QUANTITY: Measurements, as of weight or distance
        TIME: Times smaller than a day
        WORK_OF_ART: Titles of books, songs, etc.
    """
    entities = spacy_model(word).ents
    first_entity = next(iter(entities), None)
    if first_entity is None:
        return None
    return first_entity.label_

# Tag every vocabulary entry in isolation and report how long it took.
start = time.time()
word_df['POS'] = word_df['word'].map(nltk_pos_tag)
elapsed_seconds = time.time() - start
print(f"POS tagging execution time: {elapsed_seconds} seconds")


POS tagging execution time: 0.4570791721343994 seconds


Have a look at the most frequent content words

In [75]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 9999999)

def is_content_tag(nltk_pos):
    """
    Return True when the POS tag starts with one of the content-word prefixes
    (noun 'NN', verb 'V', adjective 'JJ', pronoun 'PRP'), False otherwise.
    Adverbs ('RB') are deliberately not included.
    """
    content_tags_prefix = ('NN', 'V', 'JJ', 'PRP')
    return nltk_pos.startswith(content_tags_prefix)

def is_stopword(word):
    """
    Return True when `word` is in the manually curated stop list: words that
    carry little content, or tokens that are erroneous.
    """
    stopwords = (
        # forms of "to be"
        'is', 'are', 'was', 'were', 'am', 'be',
        'not', 'let',
        # forms of "to have"
        'have', 'has', 'had',
        'de', 'la', 'du', 're', 'sur', 'des', 'le', 'll',
        'oh', 'lot', 'les', 'ah', 'en', 've',
        'didn', 'bois',
    )
    return word in stopwords


# A word counts as a content word when its POS tag is a content tag and it is
# not in the manual stop list.
content_word_bool = word_df['POS'].map(is_content_tag) & ~word_df['word'].map(is_stopword)

word_df.loc[content_word_bool].sort_values(by='freq', ascending=False).head(10)



Unnamed: 0,word,freq,POS
2070,i,399,NN
2763,my,219,PRP$
2599,me,156,PRP
8,'m,143,VBP
2229,it,142,PRP
4580,you,132,PRP
1948,he,111,PRP
4448,we,89,PRP
3707,she,85,PRP
1257,do,81,VB


### Create input data where we mask a set of words

#### Set of regional words 

In [76]:
# def is_regional_tag(spacy_ner):
#     regional_tags = ['GPE', 'LANGUAGE', 'NORP']
#     return spacy_ner in regional_tags


# start = time.time()
# word_df['NER'] = word_df['word'].apply(lambda x: get_entity_name(x, spacy_model))
# print(f"NER execution time: {time.time() - start} seconds")


# regional_word_bool = word_df['NER'].apply(lambda x: is_regional_tag(x)) \
#     & (~word_df['word'].apply(lambda x: is_stopword(x)))

#### Set of all content words that is frequent in the inference data

We select the words that appear in more than 50 sentences.

When we don't need to group sentences with the same masked word, we keep the word-frequency threshold lower (10), just to filter out the weird rare words.

In [77]:
NR_OF_SENTENCES = 0

In [78]:
# Combine both conditions into one mask. Chaining two boolean selections applies
# the second mask to an already-filtered frame, which makes pandas warn that the
# "Boolean Series key will be reindexed to match DataFrame index".
word_df[content_word_bool & (word_df['freq'] > NR_OF_SENTENCES)].sort_values('freq', ascending=False).head()


  word_df[content_word_bool][word_df['freq'] > NR_OF_SENTENCES].sort_values('freq', ascending=False).head()


Unnamed: 0,word,freq,POS
2070,i,399,NN
2763,my,219,PRP$
2599,me,156,PRP
8,'m,143,VBP
2229,it,142,PRP


Filter out the strange words

In [79]:
import string


def is_valid_word(word):
    """
    Decide whether a vocabulary entry looks like a real word.

    Returns False when every character of `word` is punctuation (this includes
    the empty string), or when `word` contains any punctuation character other
    than the apostrophe, full stop, hyphen or em dash. Returns True otherwise.
    """
    all_puncts = string.punctuation + '—'

    # Reject tokens that consist of punctuation only
    if all(char in all_puncts for char in word):
        return False

    # Reject tokens containing a "strange" character,
    # i.e. any punctuation mark except ' . - —
    tolerated = "'.-—"
    strange_characters = ''.join(p for p in all_puncts if p not in tolerated)
    return not any(char in word for char in strange_characters)

valid_word_bool = word_df['word'].apply(is_valid_word)

In [80]:
filtered_word_bool = content_word_bool & (word_df['freq'] > NR_OF_SENTENCES) & valid_word_bool

In [81]:
sum(filtered_word_bool)

4115

In [82]:
import re

def mask_sentence(sentence, masked_word):
    """
    Replace the first standalone occurrence of `masked_word` in `sentence`
    with the literal token '[MASK]'.

        sentence: the original sentence without preprocessing
        masked_word: the word to be masked (in lowercase)

    The search runs on the lower-cased sentence and treats `masked_word` as
    literal text. An occurrence is standalone when the characters directly
    before and after it (if any) are not alphabetic.

    Returns the masked sentence, or pd.NA when there is no standalone occurrence.
    """
    lowered = sentence.lower()
    word_length = len(masked_word)

    # Escape the word so that characters such as '.' are matched literally
    # instead of being interpreted as regular-expression syntax
    # (otherwise 'a.b' would also match 'axb').
    literal_pattern = re.escape(masked_word)

    for match in re.finditer(literal_pattern, lowered):
        begin = match.start()
        end = begin + word_length

        # Make sure that it is actually a standalone word (e.g., 'HE' and not 'tHE')
        preceded_by_letter = begin > 0 and lowered[begin - 1].isalpha()
        followed_by_letter = end < len(lowered) and lowered[end].isalpha()
        if not preceded_by_letter and not followed_by_letter:
            return sentence[:begin] + '[MASK]' + sentence[end:]

    # Not an independent word, pass
    return pd.NA


In [83]:
filtered_word_df = word_df[filtered_word_bool]

# Transposed count matrix (one row per word), built once instead of on every iteration.
word_by_sentence = count_fit.transpose()

# Per-word frames are gathered in a list and concatenated once after the loop;
# concatenating inside the loop re-copies all accumulated rows on every iteration.
# The leading empty frame fixes the order of the first three output columns.
per_word_frames = [pd.DataFrame(columns=['SRC', 'SRC_masked', 'original_word'])]

# NOTE(review): word_index is used as a row number of the transposed count
# matrix, which assumes word_df still has its default 0..n-1 index.
for word_index, filtered_word_row in filtered_word_df.iterrows():
    current_word = filtered_word_row['word']

    # Indices of the sentences that contain the word
    sentence_indices = original_data.index[word_by_sentence[word_index].nonzero()[1]]

#     # Randomly select a fixed number of sentences
#     sentence_indices = np.random.choice(a=sentence_indices,
#                                         size=NR_OF_SENTENCES,
#                                         replace=False)

    sentences = original_data.loc[sentence_indices, 'SRC']

    # Create a temporary df to store the sentences for this word
    tmp_df = pd.DataFrame()
    tmp_df['SRC_original_idx'] = sentence_indices
    tmp_df['SRC'] = sentences.values
    tmp_df['original_word'] = current_word

    # Mask the word in those sentences (pd.NA where it is not a standalone word)
    tmp_df['SRC_masked'] = sentences.apply(
        lambda x: mask_sentence(sentence=x, masked_word=current_word)
    ).values

    per_word_frames.append(tmp_df)

# Concat everything to the whole df in one go
masked_data = pd.concat(per_word_frames, axis=0, ignore_index=True)
    
    

In [84]:
masked_data = masked_data.dropna()
masked_data.shape

(12240, 4)

In [85]:
masked_data.head()

Unnamed: 0,SRC,SRC_masked,original_word,SRC_original_idx
0,"And I suppose I could mention from one of the very earliest computer scientists, whose name was Norbert Wiener, and he wrote a book back in the '50s, from before I was even born, called ""The Human Use of Human Beings."" And in the book, he described the potential to create a computer system that would be gathering data from people and providing feedback to those people in real time in order to put them kind of partially, statistically, in a Skinner box, in a behaviorist system, and he has this amazing line where he says, one could imagine, as a thought experiment — and I'm paraphrasing, this isn't a quote — one could imagine a global computer system where everybody has devices on them all the time, and the devices are giving them feedback based on what they did, and the whole population is subject to a degree of behavior modification.","And I suppose I could mention from one of the very earliest computer scientists, whose name was Norbert Wiener, and he wrote a book back in the [MASK], from before I was even born, called ""The Human Use of Human Beings."" And in the book, he described the potential to create a computer system that would be gathering data from people and providing feedback to those people in real time in order to put them kind of partially, statistically, in a Skinner box, in a behaviorist system, and he has this amazing line where he says, one could imagine, as a thought experiment — and I'm paraphrasing, this isn't a quote — one could imagine a global computer system where everybody has devices on them all the time, and the devices are giving them feedback based on what they did, and the whole population is subject to a degree of behavior modification.",'50s,fr-0383
1,"And I suppose I could mention from one of the very earliest computer scientists, whose name was Norbert Wiener, and he wrote a book back in the '50s, from before I was even born, called ""The Human Use of Human Beings."" And in the book, he described the potential to create a computer system that would be gathering data from people and providing feedback to those people in real time in order to put them kind of partially, statistically, in a Skinner box, in a behaviorist system, and he has this amazing line where he says, one could imagine, as a thought experiment — and I'm paraphrasing, this isn't a quote — one could imagine a global computer system where everybody has devices on them all the time, and the devices are giving them feedback based on what they did, and the whole population is subject to a degree of behavior modification.","And I suppose I could mention from one of the very earliest computer scientists, whose name was Norbert Wiener, and he wrote a book back in the [MASK], from before I was even born, called ""The Human Use of Human Beings."" And in the book, he described the potential to create a computer system that would be gathering data from people and providing feedback to those people in real time in order to put them kind of partially, statistically, in a Skinner box, in a behaviorist system, and he has this amazing line where he says, one could imagine, as a thought experiment — and I'm paraphrasing, this isn't a quote — one could imagine a global computer system where everybody has devices on them all the time, and the devices are giving them feedback based on what they did, and the whole population is subject to a degree of behavior modification.",'50s,fr-0384
2,"I had a thumb, I had 85 dollars, and I ended up in San Francisco, California — met a lover — and back in the '80s, found it necessary to begin work on AIDS organizations.","I had a thumb, I had 85 dollars, and I ended up in San Francisco, California — met a lover — and back in the [MASK], found it necessary to begin work on AIDS organizations.",'80s,fr-0145
146,"I think you do have too much trouble about this flag. """" I think that the artist should be returned to his heritage, i.e., the jungles of Africa, and then he can shovel manure in his artistic way. """" This flag I'm standing on stands for everything oppressive in this system: the murder of the Indians and all the oppressed around the world, including my brother who was shot by a pig, who kicked over his body to 'make sure the nigger was dead.' That pig was wearing the flag.","I think you do have too much trouble about this flag. """" I think that the artist should be returned to his heritage, i.e., the jungles of Africa, and then he can shovel manure in his artistic way. """" This flag I'm standing on stands for everything oppressive in this system: the murder of the Indians and all the oppressed around the world, including my brother who was shot by a pig, who kicked over his body to [MASK] sure the nigger was dead.' That pig was wearing the flag.",'make,fr-0725
147,"I got a phone call from a 30-something friend, a woman, she said, ""So, my partner and I were in the middle of doing some things and I was like, 'I want you right now.' And he said, 'No, you're still dry, you're just being nice.' And I was so ready.","I got a phone call from a 30-something friend, a woman, she said, ""So, my partner and I were in the middle of doing some things and I was like, 'I want you right now.' And he said, [MASK], you're still dry, you're just being nice.' And I was so ready.",'no,fr-0506


In [86]:
# masked_data.to_csv('data/masked_content_covost2_for_en2de_no_sentence_group.csv')

In [87]:
# masked_data.to_csv('data/masked_content_winoMT.csv')

In [88]:
masked_data.to_csv('data/masked_content_mustSHE.csv')