<a href="https://colab.research.google.com/github/Suvetha11/CountriStat-Backend-Spring-Boot-/blob/main/T5_Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import torch

# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/Suvetha11/FYP-Dataset/main/data/dataset_new.csv")
input_code = df['command'].values.tolist()
output_description = df['code'].values.tolist()

# Split the dataset into train and validation sets
train_input, val_input, train_output, val_output = train_test_split(input_code, output_description, test_size=0.1, random_state=42)

# Tokenize the dataset
tokenizer = T5Tokenizer.from_pretrained('t5-small')
train_encodings = tokenizer(train_input, padding=True, truncation=True)
train_labels = tokenizer(train_output, padding=True, truncation=True)
val_encodings = tokenizer(val_input, padding=True, truncation=True)
val_labels = tokenizer(val_output, padding=True, truncation=True)

# Define the dataset
class T5Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create the dataloaders
train_dataset = T5Dataset(train_encodings, train_labels)
val_dataset = T5Dataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Instantiate the model and optimizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
optimizer = AdamW(model.parameters(), lr=3e-5)

# Define the training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    total_val_loss = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

    print(f"Epoch {epoch + 1}: val_loss={total_val_loss / len(val_loader)}")


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Epoch 1: val_loss=2.170234660307566
Epoch 2: val_loss=1.5765962501366932
Epoch 3: val_loss=1.1550580362478893


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Save the model to Google Drive
model.save_pretrained('/content/drive/MyDrive/finetune_model/model')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the model from Google Drive
from transformers import T5ForConditionalGeneration

load_model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/finetune_model/model')

In [69]:
input_text = "initiate integer x with value seventy"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = model.generate(input_ids=input_ids)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

integer x = 100


In [67]:
input_text = "initiate integer x with value hundred"
input_ids = tokenizer.encode(input_text, return_tensors="pt")
outputs = load_model.generate(input_ids=input_ids)
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

integer x = 100
