In [16]:
import pandas as pd
import json

from datasets import load_dataset
from datasets import Dataset as HFDataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

import torch
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
import sentencepiece
import evaluate

In [2]:
# loading data for testing
data_file_path = 'Data/definitions_dataset/test.json'

with open(data_file_path, 'r', encoding='utf-8') as data_file:
    test_json_data = json.load(data_file)

flattened_json_data = [{'Word': sublist[0][0], 'Definition': ' '.join(sublist[1]), 'Context': ' '.join(sublist[2])}
                       for sublist in test_json_data]
dft = pd.DataFrame(flattened_json_data)

In [3]:
dft.head()

Unnamed: 0,Word,Definition,Context
0,press,place between two surfaces and apply weight or...,pressed flowers
1,glade,an open space in a wood or forest,"she found herself lying on her back , cushione..."
2,take,receive ( a specified amount of money ) as pay...,that means that the government spends less mon...
3,fattiness,having the property of containing fat,he recommended exercise to reduce my adiposity
4,dramatisation,conversion into dramatic form,the play was a dramatization of a short story


In [7]:
lemmatizer = WordNetLemmatizer()

def word_in_context(word, context):
    context_lower = context.lower()
    if word in context_lower.split():
        return True
    root_word = lemmatizer.lemmatize(word)
    if root_word in context_lower.split():
        return True
    return False

filtered_dft = dft[dft.apply(lambda row: word_in_context(row['Word'], row['Context']), axis=1)]

In [8]:
filtered_dft

Unnamed: 0,Word,Definition,Context
1,glade,an open space in a wood or forest,"she found herself lying on her back , cushione..."
6,collapse,( of a price or currency ) drop suddenly in value,institutions are rejecting warnings that house...
9,birch,a formal punishment in which a person is flogg...,are n't there times when you 're tempted to br...
14,drive,( in ball games ) a forceful stroke made with ...,another 30-yard run from the dangerous jamaica...
16,electrometer,an instrument for measuring electrical potenti...,shortly after antoine becquerel and others dev...
...,...,...,...
12309,jadeite,"a green , blue , or white mineral which is one...","in particular , it would have been good to inc..."
12311,curious,strange ; unusual,the best illustration of this strange reversal...
12313,debenture,a long-term security yielding a fixed rate of ...,"in return , gary wanted security by way of a d..."
12314,queue,a line or sequence of people or vehicles await...,given its location there should be a queue of ...


In [100]:
model_name='google/flan-t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [113]:
model_t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [114]:
tokenizer_t5 = AutoTokenizer.from_pretrained("t5-base", model_max_length=512)

In [101]:
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [17]:
dataset = HFDataset.from_pandas(filtered_dft, split='test')

In [18]:
dataset

Dataset({
    features: ['Word', 'Definition', 'Context', '__index_level_0__'],
    num_rows: 6751
})

In [34]:
words = dataset['Word'][:500]


In [35]:
Context = dataset['Context'][:500]

In [36]:
Definition = dataset['Definition'][:500]

### Zero shot prompt

In [153]:
# Using a zero shot prompt to generate definitions from pre-trained model.
model_definitions = []

for idx, w in tqdm(enumerate(words), total=len(words)):
    prompt = f"""
    generate definition for the word based on context: 
    
    Word: '{w}'
    
    Context: '{Context[idx]}'
    
    Definition: """
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation = True, add_special_tokens=True)
    
    model_output = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    model_text_output = tokenizer.decode(model_output[0], skip_special_tokens=True)
    
    model_definitions.append(model_text_output)

zipped = list(zip(words, model_definitions, Definition))

100%|██████████| 500/500 [01:57<00:00,  4.27it/s]


In [140]:
Definition[0]

'an open space in a wood or forest'

In [141]:
words[0]

'glade'

In [142]:
model_definitions[0]

'True'

In [155]:
len(model_definitions)

500

In [143]:
# Model definitions generated with a zero shot prompt
model_definitions

['True',
 'False',
 'False',
 'False',
 'True',
 'sweet',
 'False',
 'True',
 'True',
 'False',
 'True',
 'True',
 'False',
 'True',
 'True',
 'False',
 'True',
 'False',
 'True',
 'True',
 'True',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'False',
 'True',
 'True',
 'False',
 'False',
 'False',
 'True',
 'False',
 'True',
 'False',
 'True',
 'False',
 'False',
 'True',
 'False',
 'miscount',
 'True',
 'False',
 'True',
 'False',
 'False',
 'False',
 'False',
 'True',
 'True',
 'True',
 'False',
 'Word: fascia',
 'False',
 'False',
 'False',
 'False',
 'True',
 'False',
 'True',
 'True',
 'bijou',
 'True',
 'False',
 'True',
 'False',
 'True',
 'True',
 'True',
 'False',
 'True',
 'True',
 'True',
 'True',
 'False',
 'False',
 'False',
 'Sumerian',
 'True',
 'False',
 'True',
 'True',
 'mag',
 'False',
 'bauble',
 'True',
 'False',
 'True',
 'True',
 'True',
 'sandalwood',
 'heart',
 'False',
 'True',
 'False',
 'True',
 'False',
 'True',
 'False'

### Zero shot - evaluation

In [151]:
rouge = evaluate.load('rouge')

In [154]:
zero_shot_untrained = rouge.compute(
    predictions=model_definitions,
    references=Definition[0:500],
    use_aggregator=True,
    use_stemmer=True,
)

In [156]:
zero_shot_untrained

{'rouge1': 0.04009114501193879,
 'rouge2': 0.0010158730158730158,
 'rougeL': 0.039046922390734734,
 'rougeLsum': 0.03938432723268814}

In [157]:
print(prompt)


    generate definition for the word based on context: 
    
    Word: 'joint'
    
    Context: 'the pension companies will come up with a joint position on their participation in the pension reform .'
    
    Definition: 


### Few shot experiments

In [158]:
model_definitions = []

for idx, w in tqdm(enumerate(words), total=len(words)):
    prompt = f"""
    generate definition for the word based on context: 
    
    Word: '{words[1]}'
    
    Context: '{Context[1]}'
    
    Definition: {Definition[1]}
    
    
    
    generate definition for the word based on context: 
    
    Word: '{words[4]}'
    
    Context: '{Context[4]}'
    
    Definition: {Definition[4]}
    
    
    
    generate definition for the word based on context: 
    
    Word: '{w}'
    
    Context: '{Context[idx]}'
    
    Definition: """
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation = True, add_special_tokens=True)
    
    model_output = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    model_text_output = tokenizer.decode(model_output[0], skip_special_tokens=True)
    
    model_definitions.append(model_text_output)

100%|██████████| 500/500 [02:58<00:00,  2.80it/s]


In [159]:
# What the prompt looks like as input
print(prompt)


    generate definition for the word based on context: 
    
    Word: 'collapse'
    
    
    Definition: ( of a price or currency ) drop suddenly in value
    
    
    
    generate definition for the word based on context: 
    
    Word: 'electrometer'
    
    Context: 'shortly after antoine becquerel and others developed the electrometer , john mothée gaugain made the first precise measurements of pyroelectric charges in 1859 .'
    
    Definition: an instrument for measuring electrical potential without drawing any current from the circuit .
    
    
    
    generate definition for the word based on context: 
    
    Word: 'joint'
    
    Context: 'the pension companies will come up with a joint position on their participation in the pension reform .'
    
    Definition: 


### Few shot - Evaluation

In [160]:
few_shot_untrained = rouge.compute(
    predictions=model_definitions,
    references=Definition[0:500],
    use_aggregator=True,
    use_stemmer=True,
)

In [161]:
few_shot_untrained

{'rouge1': 0.04289590445379665,
 'rouge2': 0.0047619047619047615,
 'rougeL': 0.04222440945009234,
 'rougeLsum': 0.04260317241235627}