# Preprocessing the dataset

This notebook walks through the preprocessing step by step. The same logic is consolidated in a single function, `retrieve_preprocessed_df`, which the other notebooks call directly.

In [1]:
import os
import torch
import json
import pandas as pd
from itertools import chain

from utils.data_processing import *

# Restrict this process to the GPU with index 1.
# NOTE(review): this is set after `import torch`; it only takes effect if CUDA has
# not been initialised yet (e.g. by the star-import above) — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
# Probe for a usable CUDA device. The result is not displayed because this is
# not the last expression of the cell.
torch.cuda.is_available()

# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [3]:
#----------------------------- Retrieve data -----------------------------#
# JSON is UTF-8 by specification and the text fields contain non-ASCII
# characters (e.g. typographic apostrophes), so decode explicitly instead of
# relying on the platform's default encoding.
with open("../data/en_TaskIndependentData.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Flatten the two levels of nesting found under each top-level value so that
# every element is a single record, then transpose the records into per-field
# columns.
columns = zip(*chain(*chain(*data.values())))
column_names = ["id", "mwe", "literal_meaning", "_1", "_2", "_3", "proper", "meta", "0/1", "fine_grained", "prior", "sentence", "after", "source"]
# zip() pairs names and columns positionally and stops at the shorter of the two.
data_df = pd.DataFrame(dict(zip(column_names, columns)))

# Drop irrelevant columns
data_df = data_df.drop(columns=["proper", "meta", "source"])
init_len = len(data_df)

# Drop rows whose fine-grained label is 'Proper Noun' or 'Meta Usage'
is_excluded_usage = data_df['fine_grained'].isin(['Proper Noun', 'Meta Usage'])
data_df = data_df[~is_excluded_usage]
print(f"We drop {init_len - len(data_df)} rows out of {init_len}.")

data_df.head()

We drop 777 rows out of 4645.


Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after
0,150,mailing list,address list,,,,1,address list,Every registered voter 65 and older who was no...,The Secretary of State’s office became aware o...,"I’m no mathematician, but I said, ‘We have a p..."
1,150,mailing list,address list,,,,1,address list,"To the best of the AAFP's knowledge, the spons...",Organizations and individuals may rent the AAF...,The AAFP's Marketing Research Department must ...
2,150,mailing list,address list,,,,1,address list,"Once the program is running, the payments coul...",TDHCA will be providing updates on how to appl...,The program comes after renters have spent mon...
3,150,mailing list,address list,,,,1,address list,"Red Hat, Cisco, and Google were involved in th...",All major Linux distributions also received th...,We assume that smaller and other downstream ve...
4,150,mailing list,address list,,,,1,address list,This optional step could save considerable mon...,"Larger and active mailing lists (e.g., more th...",These database management tools allow for batc...


In [4]:
# Summarise column dtypes and non-null counts of the frame at this stage.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3868 entries, 0 to 4644
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               3868 non-null   int64 
 1   mwe              3868 non-null   object
 2   literal_meaning  3868 non-null   object
 3   _1               3868 non-null   object
 4   _2               3868 non-null   object
 5   _3               3868 non-null   object
 6   0/1              3868 non-null   int64 
 7   fine_grained     3868 non-null   object
 8   prior            3868 non-null   object
 9   sentence         3868 non-null   object
 10  after            3868 non-null   object
dtypes: int64(2), object(9)
memory usage: 362.6+ KB


In [5]:
# Sanity check: look for rows whose `literal_meaning` is the string "None" while `_1` is not.
# An empty result means no such row exists.
data_df[(data_df["literal_meaning"] == "None") & (data_df["_1"] != 'None')]

Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after


In [6]:
# Rows where `_1` holds the string "None" although `_2` holds something else.
first_slot_is_none = data_df["_1"].eq("None")
second_slot_is_set = data_df["_2"].ne("None")
filtered_rows = data_df.loc[first_slot_is_none & second_slot_is_set]

filtered_rows

Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after
277,12,ancient history,history,,in the past,,1,history,Keeping up to date with international studies ...,Brazilian specificity lies in our experience w...,Studying ancient history in Brazil today has t...
278,12,ancient history,history,,in the past,,1,history,Rather than carrying out monographic studies o...,"Today, the country has important researchers i...",Keeping up to date with international studies ...
279,12,ancient history,history,,in the past,,1,history,COINTELPRO is a stark illustration of how the ...,To disabuse anyone of the comforting delusion ...,The Partnership for Civil Justice obtained doc...
280,12,ancient history,history,,in the past,,1,history,The change is a declaration of purpose: to lea...,There’s a more interesting story to be told ab...,It seems to me the really crucial mover in his...
281,12,ancient history,history,,in the past,,1,history,What kind of student would this course suit?,This course will suit anyone fascinated by anc...,It will appeal to you if you enjoy the challen...
...,...,...,...,...,...,...,...,...,...,...,...
2050,93,baby blues,depression,,blue eyes,,1,depression,Doctor Reid said.,Doctor Reid stated when she sought help her ph...,"However, Liz Moore, a Therapist LLMSW, works f..."
2051,93,baby blues,depression,,blue eyes,,1,depression,"I was taking my work home, basically,"" she sai...",After giving birth to her second child and exp...,With the rise of the Norse myth through Marvel...
2052,93,baby blues,depression,,blue eyes,,0,blue eyes,"His eyes are just stunners!""","Carissa said of Matteo's baby blues, which he ...","As Nikki and Brie went on to play a game of ""T..."
2053,93,baby blues,depression,,blue eyes,,0,blue eyes,"They ended up loading up so many things, I cou...",And she wasn't the only one taken aback by DiC...,"O'Driscoll also noted, “He had the bluest eyes..."


In [7]:
# Example of a row belonging to a single-meaning MWE, which gets dropped further down.
# Among the dropped rows: 3730, 2813.
# NOTE: `.iloc` is positional, so 3730 is a row position here, not an index label
# (the row displayed carries the label 4487).
data_df.iloc[3730]

id                                                               265
mwe                                                        leap year
literal_meaning                                            jump year
_1                                                   bissextile year
_2                                                              None
_3                                                              None
0/1                                                                0
fine_grained                                         bissextile year
prior              In China, for example, the leap year has an ex...
sentence           Likewise, the Ethiopian calendar consists of 1...
after              Hebrew and Buddhist ancient calendars also fol...
Name: 4487, dtype: object

In [8]:
# 'leap year' has 2 registered possible meanings, but only one of them is found in the dataset.
# Therefore all of its rows are removed by the single-meaning filter.
data_df[data_df['mwe'] == 'leap year']


Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after
4473,265,leap year,jump year,bissextile year,,,0,bissextile year,"According to history.com, the leap day was ori...",Caesar then created the leap year calendar to ...,"Feb. 29 happens every four years, because the ..."
4474,265,leap year,jump year,bissextile year,,,0,bissextile year,"Prior to this, most of the Roman world and Eur...",In order to try and link the calendar up to th...,The problem was that since the actual solar ye...
4475,265,leap year,jump year,bissextile year,,,0,bissextile year,It just means that a day is added to the month...,"Precisely, it takes 365 days, 5 hours, 48 minu...",Now another thing that comes to mind is the re...
4476,265,leap year,jump year,bissextile year,,,0,bissextile year,"In February and March last year, 593 speeding ...",The error was a result of the issuing computer...,The fault occurred over 14 days from February ...
4477,265,leap year,jump year,bissextile year,,,0,bissextile year,Apparently women were also expected to have to...,Another theory involves Queen Margaret of Scot...,This one is rather questionable however as Que...
4478,265,leap year,jump year,bissextile year,,,0,bissextile year,"But what about metaphorical leaping, as in tak...",Our presidential candidates take advantage of ...,The years give them an extra day to campaign a...
4479,265,leap year,jump year,bissextile year,,,0,bissextile year,No town of its size in New York state has a la...,In leap year they hold the balance of power an...,"The Sidney girls can do it, and if they let le..."
4480,265,leap year,jump year,bissextile year,,,0,bissextile year,"To me, having to wait that long for presents f...","Later, I came to know that they did celebrate ...",Did you know that an old Irish tradition allow...
4481,265,leap year,jump year,bissextile year,,,0,bissextile year,It is the Silas Marner of months.,February is 28 Sunday afternoons in Philadelph...,And 29 Mondays further animated by calls from ...
4482,265,leap year,jump year,bissextile year,,,0,bissextile year,"Over the last several decades, the precision o...",Leap seconds have been added in the past to ke...,"Interestingly, since scientists began measurin..."


In [9]:
# Drop MWEs that don't have both a compositional and a non-compositional meaning.
# `drop_rows_with_single_meaning` comes from the `utils.data_processing` star-import;
# its two return values are unpacked as the filtered frame and the rows it removed.
data_df, dropped_rows = drop_rows_with_single_meaning(data_df)
print(f"Number of rows dropped: {len(dropped_rows)}")

data_df.head(10)

Number of rows dropped: 3200


Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after
58,38,sacred cow,divine cow,above criticism,,,1,divine cow,"Like beef, pork is also forbidden in Hinduism.","However, unlike the sacred cow, the pig repres...",This is considered to be particularly impure a...
61,38,sacred cow,divine cow,above criticism,,,0,above criticism,"Besides, the Rights group said the Minister of...",The Rights group said the ministry has continu...,HURIWA recalled that the Federal Government on...
62,38,sacred cow,divine cow,above criticism,,,0,above criticism,"But in the Macquarie judgment, Judge Sandy Str...",Barwick says historically there has been littl...,The Fair Work Act is clear that employers cann...
63,38,sacred cow,divine cow,above criticism,,,0,above criticism,"The Commissioner of Police, CP Abutu Yaro, who...",The state police boss also directed the full e...,No stone would be left unturned because we are...
64,38,sacred cow,divine cow,above criticism,,,0,above criticism,The committee shall come up with findings and ...,There is no sacred cow under the law of probit...,"For a deterrent in future, the Chairman shall ..."
65,38,sacred cow,divine cow,above criticism,,,0,above criticism,The House has now resumed discussion on a moti...,"Taking floor of the House, Senator Mohsin Aziz...",He said he presented himself before National A...
66,38,sacred cow,divine cow,above criticism,,,0,above criticism,We will deal with perpetrators with the force ...,"Going further with seriousness, Ajimobi went o...",Anyone who is implicated in the endemic and se...
67,38,sacred cow,divine cow,above criticism,,,0,above criticism,The end was that nobody should be denied healt...,By making a sacred cow of the latter we have b...,An anonymous nurse working in intensive care s...
68,38,sacred cow,divine cow,above criticism,,,0,above criticism,"Although, the motion against NAB was talked ou...","No one is sacred cow, no one is above accounta...",It’s true that when NAB summoned me I was in R...
69,38,sacred cow,divine cow,above criticism,,,0,above criticism,The expectations of winning I don’t think have...,I do think there is this undercurrent of 'They...,It’s kind of become an entitlement.


In [11]:
# Tokenize the three text columns into lists of token strings.
# `nlp` is provided by the `utils.data_processing` star-import
# (presumably a spaCy pipeline, since its tokens expose `.text` — TODO confirm).
for text_col in ('prior', 'sentence', 'after'):
    data_df[f'{text_col}_tokens'] = data_df[text_col].apply(
        lambda text: [tok.text for tok in nlp(text)]
    )

In [12]:

# Tokenize every MWE with `tokenizer`, which is provided by the `utils.data_processing`
# star-import (presumably the BERT tokenizer — TODO confirm).
data_df['tokenized_mwe'] = data_df['mwe'].apply(tokenizer.tokenize)
# Disabled: would place `tokenized_mwe` right after `mwe` instead of leaving it as the last column.
# data_df.insert(data_df.columns.get_loc('mwe') + 1, 'tokenized_mwe', data_df['tokenized_mwe'])
# Number of rows remaining at this stage.
print(len(data_df))
data_df.head()

668


Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after,prior_tokens,sentence_tokens,after_tokens,tokenized_mwe
58,38,sacred cow,divine cow,above criticism,,,1,divine cow,"Like beef, pork is also forbidden in Hinduism.","However, unlike the sacred cow, the pig repres...",This is considered to be particularly impure a...,"[Like, beef, ,, pork, is, also, forbidden, in,...","[However, ,, unlike, the, sacred, cow, ,, the,...","[This, is, considered, to, be, particularly, i...","[sacred, cow]"
61,38,sacred cow,divine cow,above criticism,,,0,above criticism,"Besides, the Rights group said the Minister of...",The Rights group said the ministry has continu...,HURIWA recalled that the Federal Government on...,"[Besides, ,, the, Rights, group, said, the, Mi...","[The, Rights, group, said, the, ministry, has,...","[HURIWA, recalled, that, the, Federal, Governm...","[sacred, cow]"
62,38,sacred cow,divine cow,above criticism,,,0,above criticism,"But in the Macquarie judgment, Judge Sandy Str...",Barwick says historically there has been littl...,The Fair Work Act is clear that employers cann...,"[But, in, the, Macquarie, judgment, ,, Judge, ...","[Barwick, says, historically, there, has, been...","[The, Fair, Work, Act, is, clear, that, employ...","[sacred, cow]"
63,38,sacred cow,divine cow,above criticism,,,0,above criticism,"The Commissioner of Police, CP Abutu Yaro, who...",The state police boss also directed the full e...,No stone would be left unturned because we are...,"[The, Commissioner, of, Police, ,, CP, Abutu, ...","[The, state, police, boss, also, directed, the...","[No, stone, would, be, left, unturned, because...","[sacred, cow]"
64,38,sacred cow,divine cow,above criticism,,,0,above criticism,The committee shall come up with findings and ...,There is no sacred cow under the law of probit...,"For a deterrent in future, the Chairman shall ...","[The, committee, shall, come, up, with, findin...","[There, is, no, sacred, cow, under, the, law, ...","[For, a, deterrent, in, future, ,, the, Chairm...","[sacred, cow]"


In [13]:
# Keep a row only if every MWE token appears verbatim among the tokens of its
# target sentence. The predicate is evaluated once and reused for both the
# report and the filter instead of running the row-wise apply twice.
# NOTE: the comparison is exact and case-sensitive, so besides inflected forms
# ('linings' vs 'lining') it also rejects capitalised occurrences
# ('Public' vs 'public').
mwe_in_sentence = data_df.apply(
    lambda row: all(token in row['sentence_tokens'] for token in row['tokenized_mwe']),
    axis=1,
)

dropped_rows = data_df[~mwe_in_sentence]
print(f"There are {len(dropped_rows)} dropped lemmatization-related issues.")
for index, row in dropped_rows.iterrows():
    print(f"Index: {index} - MWE : {row['tokenized_mwe']}")
    print("Sentence Tokens:", row['sentence_tokens'])

data_df = data_df[mwe_in_sentence]

There are 36 dropped lemmatization-related issues.
Index: 178 - MWE : ['silver', 'lining']
Sentence Tokens: ['Probably', 'because', 'there', 'have', 'been', 'so', 'many', 'bad', 'things', 'happening', 'from', 'which', 'silver', 'linings', 'can', 'emerge', '.']
Index: 182 - MWE : ['silver', 'lining']
Sentence Tokens: ['So', ',', 'there', 'are', 'silver', 'linings', 'to', 'Covid-19', 'after', 'all', '!']
Index: 188 - MWE : ['silver', 'lining']
Sentence Tokens: ['It', "'s", 'why', 'Stager', 'never', 'takes', 'these', 'silver', 'linings', 'for', 'granted', ',', 'knowing', 'the', 'hardship', 'and', 'sacrifice', 'that', 'made', 'them', 'shine', '.']
Index: 312 - MWE : ['top', 'dog']
Sentence Tokens: ['My', 'stepdad', 'tried', 'hard', 'to', 'prove', 'me', 'wrong', ',', 'telling', 'me', 'about', 'how', 'they', "'re", '"', 'the', 'top', 'dogs', '"', 'and', 'they', "'re", 'in', 'India', ',', 'China', ',', 'etc', '.']
Index: 533 - MWE : ['public', 'service']
Sentence Tokens: ['Public', 'service',

In [14]:
# Each row contributes one entry per text column, so add up the column lengths.
text_columns = ['prior', 'sentence', 'after']
sentences = sum(len(data_df[col]) for col in text_columns)

print(f"There are {sentences:,} sentences in the dataset")

There are 1,896 sentences in the dataset


In [210]:
# Function to preprocess the data for other files.
# Its printed summary reports the same drop counts as the step-by-step cells above.
# `print_ignored=True` presumably enables that summary — confirm in utils.data_processing.
# NOTE: this rebinds `data_df`, replacing the frame built step by step above.
data_df = retrieve_preprocessed_df(print_ignored=True)
data_df.head()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


We drop 777 ProperNoun and MetaUsage rows out of 4645.
Number of dropped single-meaning-mwe rows: 3200
There are 36 dropped lemmatization-related issues. For more detail uncomment what comes next in the function.
There are 1,896 remaining sentences in the dataset.


Unnamed: 0,id,mwe,literal_meaning,_1,_2,_3,0/1,fine_grained,prior,sentence,after,prior_tokens,sentence_tokens,after_tokens,tokenized_mwe
58,38,sacred cow,divine cow,above criticism,,,1,divine cow,"Like beef, pork is also forbidden in Hinduism.","However, unlike the sacred cow, the pig repres...",This is considered to be particularly impure a...,"[Like, beef, ,, pork, is, also, forbidden, in,...","[However, ,, unlike, the, sacred, cow, ,, the,...","[This, is, considered, to, be, particularly, i...","[sacred, cow]"
61,38,sacred cow,divine cow,above criticism,,,0,above criticism,"Besides, the Rights group said the Minister of...",The Rights group said the ministry has continu...,HURIWA recalled that the Federal Government on...,"[Besides, ,, the, Rights, group, said, the, Mi...","[The, Rights, group, said, the, ministry, has,...","[HURIWA, recalled, that, the, Federal, Governm...","[sacred, cow]"
62,38,sacred cow,divine cow,above criticism,,,0,above criticism,"But in the Macquarie judgment, Judge Sandy Str...",Barwick says historically there has been littl...,The Fair Work Act is clear that employers cann...,"[But, in, the, Macquarie, judgment, ,, Judge, ...","[Barwick, says, historically, there, has, been...","[The, Fair, Work, Act, is, clear, that, employ...","[sacred, cow]"
63,38,sacred cow,divine cow,above criticism,,,0,above criticism,"The Commissioner of Police, CP Abutu Yaro, who...",The state police boss also directed the full e...,No stone would be left unturned because we are...,"[The, Commissioner, of, Police, ,, CP, Abutu, ...","[The, state, police, boss, also, directed, the...","[No, stone, would, be, left, unturned, because...","[sacred, cow]"
64,38,sacred cow,divine cow,above criticism,,,0,above criticism,The committee shall come up with findings and ...,There is no sacred cow under the law of probit...,"For a deterrent in future, the Chairman shall ...","[The, committee, shall, come, up, with, findin...","[There, is, no, sacred, cow, under, the, law, ...","[For, a, deterrent, in, future, ,, the, Chairm...","[sacred, cow]"
