# Create occupation-masked input data

Load in the [job title list](https://www.enchantedlearning.com/wordlist/jobs.shtml)

In [3]:
# Read the job-title vocabulary, one title per line.
# - The encoding is explicit so the result does not depend on the platform locale.
# - Blank lines are skipped: an empty string is a useless vocabulary entry.
# - Duplicates are dropped (first occurrence wins, order preserved) because
#   CountVectorizer rejects a vocabulary that contains repeated terms.
with open('data/JobTitles.txt', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    job_titles = list(dict.fromkeys(title for title in stripped_lines if title))

Use the English (EN) translations from the CoVoST 2 fr→en data as the source sentences.

In [4]:
import pandas as pd
import numpy as np
import nltk

# Keep only the 'translation' column of the CoVoST 2 fr->en table and
# expose it under the name 'SRC' (the source text that will be masked).
original_data = (
    pd.read_csv("data/covost2/covost_v2.fr_en.tsv", sep='\t')[['translation']]
    .rename(columns={'translation': 'SRC'})
)


Count the occurrences of the job titles in the corpus (each title is counted at most once per sentence)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Sentence-by-title indicator matrix: the vocabulary is fixed to the job
# titles, and binary=True records presence (1) rather than a raw count.
corpus = original_data['SRC'].to_numpy()
vectorizer = CountVectorizer(
    binary=True,
    vocabulary=job_titles,
)
count_fit = vectorizer.fit_transform(corpus)

# (number of sentences, number of job titles)
count_fit.shape

(264897, 285)

In [7]:
import spacy
import time 


# One row per vocabulary entry: the job title and the column sum of the
# indicator matrix (i.e. how many sentences contain that title).
word_df = pd.DataFrame({
    'word': vectorizer.get_feature_names_out(),
    'freq': np.asarray(count_fit.sum(axis=0)).ravel(),
})

In [8]:
# Disable pandas' display truncation (rows and cell width) for the tables below
pd.set_option('display.max_rows', 9999999)
pd.set_option('display.max_colwidth', None)

# The 100 job titles found in the most sentences, most frequent first
word_df.sort_values('freq', ascending=False).iloc[:100]



Unnamed: 0,word,freq
100,general,1068
184,president,998
52,captain,388
5,author,322
202,private,290
165,police,274
239,teacher,273
93,emperor,249
217,student,247
201,prince,241


### Create input data where we mask the job titles

In [9]:
import re

def mask_sentence(sentence, masked_word):
    """
        Replace the first standalone occurrence of a word with '[MASK]'.

        The search is case-insensitive and runs on the original sentence, so
        the match offsets always index into the text that is sliced. An
        occurrence is standalone when the characters directly before and
        after it are not alphabetic (e.g., 'HE' and not 'tHE').

        sentence: the original sentence without preprocessing
        masked_word: the word to be masked (in lowercase); it is matched
            literally, so regex metacharacters in it carry no special meaning

        Returns the sentence with that occurrence replaced by '[MASK]'.
        Raises AssertionError if there is no standalone occurrence.
    """

    # Escape the word so that titles containing characters such as '.' or '+'
    # are matched literally instead of being interpreted as a regex.
    # IGNORECASE replaces the former `sentence.lower()`: lowercasing can change
    # the length of a string (e.g. 'İ' becomes two code points), which would
    # shift every offset relative to the original sentence.
    pattern = re.compile(re.escape(masked_word), flags=re.IGNORECASE)

    final_span = None
    for match in pattern.finditer(sentence):
        start, end = match.span()
        # Make sure the character before and after the word is not alphabet
        preceded_by_letter = start > 0 and sentence[start - 1].isalpha()
        followed_by_letter = end < len(sentence) and sentence[end].isalpha()
        if not preceded_by_letter and not followed_by_letter:
            final_span = (start, end)
            break

    assert final_span is not None, \
        f"{masked_word!r} does not occur as a standalone word in {sentence!r}"

    start, end = final_span
    return sentence[:start] + '[MASK]' + sentence[end:]


In [11]:
# Start with every row unmasked; rows that contain no job title stay NA.
original_data['SRC_masked'] = pd.NA
original_data['original_word'] = pd.NA

# Convert the sentence-by-title matrix to column-oriented storage once, so
# that selecting the sentences of one title is a cheap column slice instead
# of a transpose plus row slice repeated on every iteration.
count_fit_by_word = count_fit.tocsc()

# NOTE: a sentence containing several job titles is visited once per title.
# Each visit rebuilds SRC_masked from the untouched SRC column, so only the
# title that comes last in word_df ends up masked and recorded for that row.
for word_index, occupation_word in zip(word_df.index, word_df['word']):
    # Indices of the sentences that contain the word
    sentence_indices = count_fit_by_word[:, word_index].nonzero()[0]
    if len(sentence_indices) == 0:
        # Title never occurs in the corpus: nothing to mask
        continue

    # Mask the word in those sentences
    original_data.loc[sentence_indices, 'SRC_masked'] = \
        original_data.loc[sentence_indices, 'SRC'].apply(
            lambda x: mask_sentence(sentence=x, masked_word=occupation_word)
        )

    original_data.loc[sentence_indices, 'original_word'] = occupation_word
    
    

In [17]:
# Size of the subset without missing values, i.e. the rows that received a mask
original_data.dropna(how='any').shape

(11155, 3)

In [18]:
# Export step (currently disabled): uncomment to save the masked rows to CSV.
# original_data.dropna().to_csv('data/masked_occupation_covost2_for_en2de.csv')