# Data preprocessing


In [5]:
import pandas as pd
import json
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

file_path = 'Data/definitions_dataset/train.json'
with open(file_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)
flattened_data = [{'Word': sublist[0][0], 'Definition': ' '.join(sublist[1]), 'Context': ' '.join(sublist[2])}
                  for sublist in json_data]
df = pd.DataFrame(flattened_data)

In [None]:
file_path = '/content/drive/MyDrive/Thesis/val.json'
with open(file_path, 'r', encoding='utf-8') as file:
    json_data_val = json.load(file)
flattened_data = [{'Word': sublist[0][0], 'Definition': ' '.join(sublist[1]), 'Context': ' '.join(sublist[2])}
                  for sublist in json_data_val]
dfv = pd.DataFrame(flattened_data)

In [None]:
file_path = '/content/drive/MyDrive/Thesis/test.json'

with open(file_path, 'r', encoding='utf-8') as file:
    json_data_test = json.load(file)

flattened_data = [{'Word': sublist[0][0], 'Definition': ' '.join(sublist[1]), 'Context': ' '.join(sublist[2])}
                  for sublist in json_data_test]
dft = pd.DataFrame(flattened_data)

In [None]:
df.head(5)

Unnamed: 0,Word,Definition,Context
0,burnish,enhance or improve,in vain the communists tried to burnish their ...
1,sympathy,feelings of pity and sorrow for someone else '...,"well , as it happens , i have some sympathy wi..."
2,uttermost,the greatest possible degree,he tried his utmost
3,freeze-frame,the facility or process of stopping a film or ...,bray wears out every stylistic gimmick in his ...
4,base,the bottom or lowest part,the base of the mountain


In [None]:
df.iloc[1][2]

'well , as it happens , i have some sympathy with people who get caught with massive software development schemes that go wrong .'

In [None]:
df.head(2)


Unnamed: 0,Word,Definition,Context
0,burnish,enhance or improve,in vain the communists tried to burnish their ...
1,sympathy,feelings of pity and sorrow for someone else '...,"well , as it happens , i have some sympathy wi..."


In [None]:
df.size

294657

In [None]:
model_name='t5-base'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
df['input_text'] = ' Word: \'' + df['Word'] + '\' Context: \'' + df['Context'] + '\''
dfv['input_text'] = ' Word: \'' + dfv['Word'] + '\' Context: \'' + dfv['Context'] + '\''

In [None]:
dft['input_text'] = ' Word: \'' + dft['Word'] + '\' Context: \'' + dft['Context'] + '\''

In [None]:
df.head()

Unnamed: 0,Word,Definition,Context,input_text
0,burnish,enhance or improve,in vain the communists tried to burnish their ...,Word: 'burnish' Context: 'in vain the communi...
1,sympathy,feelings of pity and sorrow for someone else '...,"well , as it happens , i have some sympathy wi...","Word: 'sympathy' Context: 'well , as it happe..."
2,uttermost,the greatest possible degree,he tried his utmost,Word: 'uttermost' Context: 'he tried his utmost'
3,freeze-frame,the facility or process of stopping a film or ...,bray wears out every stylistic gimmick in his ...,Word: 'freeze-frame' Context: 'bray wears out...
4,base,the bottom or lowest part,the base of the mountain,Word: 'base' Context: 'the base of the mountain'


In [None]:
df.iloc[0][3]

" Word: 'burnish' Context: 'in vain the communists tried to burnish their image , formally abandoning the doctrine of the dictatorship of the proletariat at their twenty-second party congress in february 1976 .'"

In [None]:
class CustomDataset(Dataset):
    def __init__(self, tokenized_inputs, tokenized_labels):
        self.tokenized_inputs = tokenized_inputs
        self.tokenized_labels = tokenized_labels

    def __len__(self):
        return len(self.tokenized_inputs['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_inputs['input_ids'][idx],
            'attention_mask': self.tokenized_inputs['attention_mask'][idx],
            'labels': self.tokenized_labels['input_ids'][idx]
        }

In [None]:
dfv.head(1)

Unnamed: 0,Word,Definition,Context,input_text
0,choker,a cable looped round a log to drag it .,"we lowered a man inside the pipe , he wrapped ...",Word: 'choker' Context: 'we lowered a man ins...


In [None]:
train_input_text = df['input_text'].tolist()
train_labels = df['Definition'].tolist()
val_input_text = dfv['input_text'].tolist()
val_labels = dfv['Definition'].tolist()

In [None]:
tokenized_inputs_train = tokenizer(train_input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
tokenized_labels_train = tokenizer(train_labels, return_tensors='pt', padding=True, truncation=True, max_length=64)
tokenized_inputs_val = tokenizer(val_input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
tokenized_labels_val = tokenizer(val_labels, return_tensors='pt', padding=True, truncation=True, max_length=64)

In [None]:
train_dataset = CustomDataset(tokenized_inputs_train, tokenized_labels_train)
val_dataset = CustomDataset(tokenized_inputs_val, tokenized_labels_val)

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./t5-finetuned',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=2,
    evaluation_strategy="steps",
    logging_dir='./logs',
    logging_steps=100,
    save_steps=1000000,
    eval_steps=1000000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=61390, training_loss=0.6969065872344001, metrics={'train_runtime': 5055.8209, 'train_samples_per_second': 194.269, 'train_steps_per_second': 12.142, 'total_flos': 4.206031438675968e+16, 'train_loss': 0.6969065872344001, 'epoch': 10.0})

In [None]:
class CustomDataset(Dataset):
    def __init__(self, tokenized_inputs, tokenized_labels):
        self.tokenized_inputs = tokenized_inputs
        self.tokenized_labels = tokenized_labels

    def __len__(self):
        return len(self.tokenized_inputs['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.tokenized_inputs['input_ids'][idx],
            'attention_mask': self.tokenized_inputs['attention_mask'][idx],
            'labels': self.tokenized_labels['input_ids'][idx]
        }

In [None]:
test_input_text = dft['input_text'].tolist()
test_labels = dft['Definition'].tolist()
tokenized_inputs_test = tokenizer(test_input_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
tokenized_labels_test = tokenizer(test_labels, return_tensors='pt', padding=True, truncation=True, max_length=64)
test_dataset = CustomDataset(tokenized_inputs_test, tokenized_labels_test)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# tokenized_inputs_test = {key: value.to(model.device) for key, value in tokenized_inputs_test.items()}

In [None]:
all_outputs = []
perplexity_values = []
with torch.no_grad():
    model.eval()
    for batch in tqdm(test_dataloader, desc="Inference and Perplexity Calculation"):
        batch = {key: value.to(model.device) for key, value in batch.items()}
        outputs = model.generate(**batch)
        all_outputs.extend(outputs.tolist())
        logits = model(**batch).logits
        labels = batch['labels']
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1))
        perplexity = torch.exp(loss)
        perplexity_values.append(perplexity.item())
all_outputs_decoded = [tokenizer.decode(output_id, skip_special_tokens=True) for output_id in all_outputs]
output_df = pd.DataFrame({'Generated': all_outputs_decoded})
result_df = pd.concat([dft, output_df], axis=1)
overall_perplexity = torch.exp(torch.tensor(perplexity_values).mean())


In [None]:
dataframes = [df, dfv, dft]
for i, dataframe in enumerate(dataframes, start=1):
    print(f"\nStatistics for DataFrame {i}:")
    print("Total Rows:", len(dataframe))
    print("Unique Values (Word):", dataframe['Word'].nunique())
    print("Most Common Value (Word):", dataframe['Word'].mode().iloc[0])
    #print("Value Counts (Word):")
    #print(dataframe['Word'].value_counts())
    
    print("\nUnique Values (Context):", dataframe['Context'].nunique())
    print("Most Common Value (Context):", dataframe['Context'].mode().iloc[0])
    #print("Value Counts (Context):")
    #print(dataframe['Context'].value_counts())

    print("\nUnique Values (Definition):", dataframe['Definition'].nunique())
    print("Most Common Value (Definition):", dataframe['Definition'].mode().iloc[0])
    #print("Value Counts (Definition):")
    #print(dataframe['Definition'].value_counts())



Statistics for DataFrame 1:
Total Rows: 98219
Unique Values (Word): 29413
Most Common Value (Word): break

Unique Values (Context): 89342
Most Common Value (Context): he deserves a good kick in the butt

Unique Values (Definition): 88632
Most Common Value (Definition): the fleshy part of the human body that you sit on

Statistics for DataFrame 2:
Total Rows: 11782
Unique Values (Word): 3677
Most Common Value (Word): run

Unique Values (Context): 11580
Most Common Value (Context): almost an inspiration which gives to all work that finish which is almost art '' -- joseph conrad

Unique Values (Definition): 11561
Most Common Value (Definition): hit hard

Statistics for DataFrame 3:
Total Rows: 12318
Unique Values (Word): 3677
Most Common Value (Word): take

Unique Values (Context): 12089
Most Common Value (Context): the asperity of northern winters

Unique Values (Definition): 12062
Most Common Value (Definition): something hard to endure


In [None]:
dataframes = [df, dfv, dft]

for i, dataframe in enumerate(dataframes, 1):
    dataframe['Word'] = dataframe['Word'].str.lower()
    dataframe['Context'] = dataframe['Context'].str.lower()

    count_word_not_in_context = dataframe.apply(lambda row: row['Word'] not in row['Context'], axis=1).sum()

    total_rows = dataframe.shape[0]
    percentage = (count_word_not_in_context / total_rows) * 100

    print(f"DataFrame {i}: {percentage:.2f}% of rows where 'Word' is not in 'Context'")

DataFrame 1: 15.77% of rows where 'Word' is not in 'Context'
DataFrame 2: 15.68% of rows where 'Word' is not in 'Context'
DataFrame 3: 15.56% of rows where 'Word' is not in 'Context'


In [None]:
result_df['Definition']

0        place between two surfaces and apply weight or...
1                        an open space in a wood or forest
2        receive ( a specified amount of money ) as pay...
3                    having the property of containing fat
4                            conversion into dramatic form
                               ...                        
12313    a long-term security yielding a fixed rate of ...
12314    a line or sequence of people or vehicles await...
12315           breathe noisily , as when one is exhausted
12316    the rate of production of new biomass by an in...
12317                     a car that is old and unreliable
Name: Definition, Length: 12318, dtype: object

In [None]:
result_df['Generated']

0                                make a pressed or pressed
1                           a slender, slender, or slender
2                 ( of money ) be spent in a specified way
3                                 the quality of being fat
4        a dramatic or dramatic representation of somet...
                               ...                        
12313       a revocation of a right or right to a property
12314    a group of people or things waiting for something
12315                           reach or reach ( a point )
12316                      the quality of being productive
12317             move or move in a slender or slender way
Name: Generated, Length: 12318, dtype: object

In [None]:
result_df.iloc[12316]['Word']

'productivity'

In [None]:
result_df.iloc[12316]['Definition']

'the rate of production of new biomass by an individual , population , or community ; the fertility or capacity of a given habitat or area'

In [None]:
result_df.iloc[12316]['Generated']

'the quality of being productive'

In [None]:
result_df.iloc[12316]['Context']

'in other words , there is an optimum leaf quantity , for a given light intensity , for productivity in the plant community .'