In [1]:
import torch
import pandas as pd
import numpy as np
import torch.optim as optim
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
model.to('cuda');

In [28]:
use_own_data = True

In [29]:
if use_own_data:
    dataset = load_dataset("csv", data_files='./LLM dataset train clean.csv')
    dataset = dataset['train']
else:
    # Load the dataset but get first 1000 examples
    dataset = load_dataset("databricks/databricks-dolly-15k")["train"].select(range(1000))

Found cached dataset csv (C:/Users/vmpletsos/.cache/huggingface/datasets/csv/default-11a7215d1340ed0f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 499.98it/s]


In [30]:
epochs = 2
max_length = 256
task_prefix = "answer question: "
batch_size = 5
gradient_accumulations_steps = 4

In [31]:
dataset_length = len(dataset)

In [32]:
optimizer = optim.Adam(
            params=model.parameters(),
            lr=1e-5,
            betas=(0.9, 0.999),
            eps=1e-08,
            amsgrad=False,
            weight_decay=1e-5
        )

In [33]:
for epoch in range(epochs):
    # Set the model to training mode
    model.train()
    epoch_loss = 0
    j = 0
    while j < dataset_length:
        if j + batch_size > dataset_length:
            instructions = dataset[j:len(dataset)]['instruction']
            responses = dataset[j:len(dataset)]['response']
        else:
            instructions = dataset[j:j + batch_size]['instruction']
            responses = dataset[j:j + batch_size]['response']
        j += batch_size 
        
        encoding = tokenizer(
            [task_prefix + sequence for sequence in instructions],
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
        
        target_encoding = tokenizer(
            responses,
            padding="longest",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )
        labels = target_encoding.input_ids
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == tokenizer.pad_token_id] = -100
        
        # Zero the gradients
        optimizer.zero_grad()
        
        input_ids = input_ids.to('cuda')
        attention_mask = attention_mask.to('cuda')
        labels = labels.to('cuda')
        
        # forward pass
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
        epoch_loss += loss.item()
        loss.backward()
        # (loss / gradient_accumulations_steps).backward()
        
        optimizer.step()
        # # Backward pass and optimization step
        # if (j+1) % gradient_accumulations_steps == 0:
        #     model.zero_grad()
    print('Epoch', epoch+1, 'Loss/train', epoch_loss/dataset_length)
    

Epoch 1 Loss/train 0.5208774707755264
Epoch 2 Loss/train 0.37045252809719165


In [34]:
# get 2 instructions from the dataset
questions = ['How to maintain marble with cracks?', 'How to remove rust stains from corroded connectors?', 'Can we use grout to fill the cracks?']
question_inputs = tokenizer([task_prefix + question for question in questions], return_tensors="pt", padding=True)
question_inputs = question_inputs.to('cuda')

In [35]:
output_sequences = model.generate(
    input_ids=question_inputs["input_ids"],
    attention_mask=question_inputs["attention_mask"],
    do_sample=False,  # disable sampling to test if batching affects output
)



In [36]:
predicted_answers = tokenizer.batch_decode(output_sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [38]:
for i in range(len(questions)):
    print('Question:', questions[i])
    print('Answer:', predicted_answers[i])
    print('')

Question: How to maintain marble with cracks?
Answer: Maintenance of marble surfaces is done by a combination of mechanical and mechanically pressing the joints of

Question: How to remove rust stains from corroded connectors?
Answer: Removing rust stains from corroded connectors is achieved by a

Question: Can we use grout to fill the cracks?
Answer: It is reccomended that the stone which is partially cured by a groutory

