In [1]:
import pandas as pd
import json

from datasets import load_dataset
from datasets import Dataset as HFDataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

import torch
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
import sentencepiece
import evaluate

### Loading data and model for baseline testing.

In [2]:
# loading data for testing
data_file_path = 'Data/definitions_dataset/test.json'

with open(data_file_path, 'r', encoding='utf-8') as data_file:
    test_json_data = json.load(data_file)

flattened_json_data = [{'Word': sublist[0][0], 'Definition': ' '.join(sublist[1]), 'Context': ' '.join(sublist[2])}
                       for sublist in test_json_data]
dft = pd.DataFrame(flattened_json_data)

In [3]:
dft.head()

Unnamed: 0,Word,Definition,Context
0,press,place between two surfaces and apply weight or...,pressed flowers
1,glade,an open space in a wood or forest,"she found herself lying on her back , cushione..."
2,take,receive ( a specified amount of money ) as pay...,that means that the government spends less mon...
3,fattiness,having the property of containing fat,he recommended exercise to reduce my adiposity
4,dramatisation,conversion into dramatic form,the play was a dramatization of a short story


In [5]:
lemmatizer = WordNetLemmatizer()
# lemmatizing occurances of words to match.
def word_in_context(word, context):
    context_lower = context.lower()
    if word in context_lower.split():
        return True
    root_word = lemmatizer.lemmatize(word)
    if root_word in context_lower.split():
        return True
    return False

filtered_dft = dft[dft.apply(lambda row: word_in_context(row['Word'], row['Context']), axis=1)]

In [6]:
filtered_dft

Unnamed: 0,Word,Definition,Context
1,glade,an open space in a wood or forest,"she found herself lying on her back , cushione..."
6,collapse,( of a price or currency ) drop suddenly in value,institutions are rejecting warnings that house...
9,birch,a formal punishment in which a person is flogg...,are n't there times when you 're tempted to br...
14,drive,( in ball games ) a forceful stroke made with ...,another 30-yard run from the dangerous jamaica...
16,electrometer,an instrument for measuring electrical potenti...,shortly after antoine becquerel and others dev...
...,...,...,...
12309,jadeite,"a green , blue , or white mineral which is one...","in particular , it would have been good to inc..."
12311,curious,strange ; unusual,the best illustration of this strange reversal...
12313,debenture,a long-term security yielding a fixed rate of ...,"in return , gary wanted security by way of a d..."
12314,queue,a line or sequence of people or vehicles await...,given its location there should be a queue of ...


In [8]:
# setting up the model for testing
model_name='google/mt5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512, legacy = False, fast = True)



In [15]:
dataset = HFDataset.from_pandas(filtered_dft, split='test')

In [16]:
dataset

Dataset({
    features: ['Word', 'Definition', 'Context', '__index_level_0__'],
    num_rows: 6751
})

In [17]:
words = dataset['Word'][:500]


In [18]:
Context = dataset['Context'][:500]

In [19]:
Definition = dataset['Definition'][:500]

### Zero shot prompt

In [21]:
# Using a zero shot prompt to generate definitions from pre-trained model.

model_definitions = []

for idx, w in tqdm(enumerate(words), total=len(words)):
    prompt = f"""
    generate definition for the word based on context: 
    
    Word: '{w}'
    
    Context: '{Context[idx]}'
    
    Definition: """
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation = True, add_special_tokens=True)
    
    model_output = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    model_text_output = tokenizer.decode(model_output[0], skip_special_tokens=True)
    
    model_definitions.append(model_text_output)

zipped = list(zip(words, model_definitions, Definition))

100%|██████████| 500/500 [06:12<00:00,  1.34it/s]


In [22]:
Definition[0]

'an open space in a wood or forest'

In [23]:
words[0]

'glade'

In [24]:
model_definitions[0]

"<extra_id_0> 'glade'."

In [26]:
# Model definitions generated with a zero shot prompt
model_definitions

["<extra_id_0> 'glade'.",
 "<extra_id_0> 'collapse' Context:",
 "<extra_id_0> 'birch'.",
 "<extra_id_0> 'drive' -",
 "<extra_id_0> 'electrometer'.",
 "<extra_id_0>'sweet'.",
 "<extra_id_0> 'dust' Context:",
 "<extra_id_0> 'ping'.",
 "<extra_id_0> 'joint'.",
 "<extra_id_0>'mistress'",
 "<extra_id_0> 'cabinetry'.",
 "<extra_id_0> 'would' would",
 "<extra_id_0>'sharp'.",
 '<extra_id_0> zebra zebra zebra',
 "<extra_id_0>'semimonthly'",
 "<extra_id_0> 'bramble'.",
 "<extra_id_0> 'joint'.",
 "<extra_id_0>'sirocco'.",
 "<extra_id_0> 'that'.",
 "<extra_id_0> 'terrestrial'",
 "<extra_id_0> 'huntsman'.",
 "<extra_id_0> 'betting'.",
 '<extra_id_0> idahoan Definition:',
 "<extra_id_0> 'flagging'.",
 "<extra_id_0> 'nursery'.",
 "<extra_id_0> 'discharger'.",
 "<extra_id_0> 'equatorial'",
 "<extra_id_0> 'otorhinolaryngology'",
 "<extra_id_0> 'nightmare'.",
 "<extra_id_0> 'astrophysicist'",
 "<extra_id_0> 'hay'.",
 "<extra_id_0> 'naturalize'.",
 "<extra_id_0> 'faceplate'.",
 "<extra_id_0> 'plaza'.",
 

### Zero shot - evaluation

In [28]:
rouge = evaluate.load('rouge')

In [29]:
zero_shot_untrained = rouge.compute(
    predictions=model_definitions,
    references=Definition[0:500],
    use_aggregator=True,
    use_stemmer=True,
)

In [30]:
zero_shot_untrained

{'rouge1': 0.024649135536952492,
 'rouge2': 0.0004,
 'rougeL': 0.024444593530258393,
 'rougeLsum': 0.024431370580890323}

In [34]:
# Example of what the input to the model looks like in text
print(prompt)


    generate definition for the word based on context: 
    
    Word: 'joint'
    
    Context: 'the pension companies will come up with a joint position on their participation in the pension reform .'
    
    Definition: 


### Few shot experiments

In [35]:
model_definitions = []

for idx, w in tqdm(enumerate(words), total=len(words)):
    prompt = f"""
    generate definition for the word based on context: 
    
    Word: '{words[1]}'
    
    Context: '{Context[1]}'
    
    Definition: {Definition[1]}
    
    
    
    generate definition for the word based on context: 
    
    Word: '{words[4]}'
    
    Context: '{Context[4]}'
    
    Definition: {Definition[4]}
    
    
    
    generate definition for the word based on context: 
    
    Word: '{w}'
    
    Context: '{Context[idx]}'
    
    Definition: """
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation = True, add_special_tokens=True)
    
    model_output = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    model_text_output = tokenizer.decode(model_output[0], skip_special_tokens=True)
    
    model_definitions.append(model_text_output)

100%|██████████| 500/500 [09:04<00:00,  1.09s/it]


In [36]:
# What the prompt looks like as input
print(prompt)


    generate definition for the word based on context: 
    
    Word: 'collapse'
    
    
    Definition: ( of a price or currency ) drop suddenly in value
    
    
    
    generate definition for the word based on context: 
    
    Word: 'electrometer'
    
    Context: 'shortly after antoine becquerel and others developed the electrometer , john mothée gaugain made the first precise measurements of pyroelectric charges in 1859 .'
    
    Definition: an instrument for measuring electrical potential without drawing any current from the circuit .
    
    
    
    generate definition for the word based on context: 
    
    Word: 'joint'
    
    Context: 'the pension companies will come up with a joint position on their participation in the pension reform .'
    
    Definition: 


### Few shot - Evaluation

In [37]:
few_shot_untrained = rouge.compute(
    predictions=model_definitions,
    references=Definition[0:500],
    use_aggregator=True,
    use_stemmer=True,
)

In [38]:
few_shot_untrained

{'rouge1': 0.08368350951693024,
 'rouge2': 0.0024155178155178154,
 'rougeL': 0.08024635301846923,
 'rougeLsum': 0.08079050518717734}

In [42]:
# Some examples of what the model generates with few shot learning
model_definitions

['<extra_id_0> a few hundred years ago.',
 '<extra_id_0> a currency.',
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0> a throw in.',
 '<extra_id_0> a currency..',
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0>, a..',
 '<extra_id_0> a phrase which means',
 '<extra_id_0> a joint session of congress',
 "<extra_id_0>, a.' Definition:",
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0>, a..',
 '<extra_id_0> a sharp bark of a dog',
 '<extra_id_0> a zebra fish.',
 '<extra_id_0> a financial crisis.',
 '<extra_id_0>, a..',
 '<extra_id_0> income-tax return.',
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0>....',
 '<extra_id_0> a terrestrial ball.',
 '<extra_id_0> a slight increase in value',
 '<extra_id_0> a currency.',
 '<extra_id_0> a political entity.',
 '<extra_id_0> a flood of flags.',
 '<extra_id_0> a mortgage.',
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0>, a heat.',
 '<extra_id_0> a surgical emergency.',
 '<extra_id_0>, a sheep.',
 "<extra_id_0>, a.' Definition:",
 '<extra_id_0> a