In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import random
import string
import json
from peft import LoraConfig, get_peft_model

# Pick the compute device once; every later cell moves tensors/models to it.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(f"Using device : {device}")

In [None]:
# Model and tokenizer must come from the same checkpoint, so the id is
# spelled once and reused for both loads.
checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

In [None]:
# Freeze every weight of the network, then re-enable gradients for the
# output projection (lm_head) only, so it is the sole trainable tensor.
model.requires_grad_(False)
model.lm_head.weight.requires_grad_(True)

In [None]:
class Ds(Dataset):
  """Dataset of (input, output) text pairs stored as JSON Lines.

  Each record of the file must provide an "input" and an "output" field.
  Items are tokenized on access and padded/truncated to a fixed length.

  Args:
    path: Location of the JSON Lines file to load.
    tokenizer: Optional callable used to tokenize the text. When omitted,
      the notebook-level `tokenizer` is looked up at item-access time, which
      keeps `Ds(path)` working exactly as before.
    max_length: Fixed token length used for both source and target.
  """

  def __init__(self, path, tokenizer=None, max_length=580):
    # Read the file that was actually requested (the previous version
    # ignored `path` and always opened a hard-coded file name).
    self.data = pd.read_json(path, lines=True)
    self._tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of records loaded from the file."""
    return len(self.data)

  def _encode(self, text):
    """Tokenize `text` into fixed-length "pt" tensors."""
    # Fall back to the notebook-level tokenizer only when none was injected.
    tok = self._tokenizer if self._tokenizer is not None else tokenizer
    return tok(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

  def __getitem__(self, idx):
    """Return (source ids, source attention mask, target ids) for record `idx`.

    The leading dimension added by return_tensors="pt" is squeezed away so
    that the DataLoader can stack items into a batch.
    """
    src = self._encode(self.data["input"][idx])
    trg = self._encode(self.data["output"][idx])
    return src["input_ids"].squeeze(0), src["attention_mask"].squeeze(0), trg["input_ids"].squeeze(0)

# Build the training dataset from the JSON Lines file next to the notebook.
dataset = Ds(path="cleaner-dataset.jsonl")

In [None]:
# Shuffled mini-batches of 6 examples, produced by 2 worker processes;
# pinned memory is requested for the host-side batches.
trainDataLoader = DataLoader(
    dataset,
    batch_size=6,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

In [None]:
# Register only the parameters that still require gradients. Frozen weights
# never receive a gradient, so handing them to the optimizer adds nothing, and
# filtering makes the optimizer fail fast if no parameter is trainable.
# No learning rate is passed, so AdamW's default is used.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, weight_decay=0.001)

In [None]:
# Training: one pass over trainDataLoader per epoch, printing the mean batch
# loss at the end of each epoch.
epochs = 10
for epoch in range(epochs):
  totalLoss = 0
  for batch in trainDataLoader:
    # Each batch is (source ids, attention mask, target ids); move all three.
    src, mask, trg = (tensor.to(device) for tensor in batch)
    # Overwrite padding positions in the labels with -100 (presumably the
    # value the model's loss ignores — confirm against the model docs).
    trg[trg == tokenizer.pad_token_id] = -100

    optimizer.zero_grad()
    out = model(input_ids=src, attention_mask=mask, labels=trg)
    loss = out.loss
    loss.backward()
    optimizer.step()

    # Accumulate a plain Python float so no graph is kept alive.
    totalLoss += loss.item()

  epochLoss = totalLoss / len(trainDataLoader)
  print(f"The loss is : {epochLoss}")



In [None]:
def chat(text):
  """Generate question-answer pairs for `text` and print them.

  The text is tokenized with the notebook-level `tokenizer`, moved to
  `device`, and passed to `model.generate` with sampling enabled. The prompt
  and the first generated sequence are printed; nothing is returned.

  Args:
    text: The passage to feed to the model.
  """
  data = tokenizer(text, return_tensors="pt").to(device)
  out = model.generate(
      **data,
      max_length=200,
      do_sample=True,
      repetition_penalty=1.2,
      temperature=0.8,
      top_p=0.93,
  )
  print(text)
  print()
  print("The question-answer pairs are as follows:")
  print()
  # Drop special tokens (padding / end-of-sequence markers) so that only the
  # readable answer text is shown to the user.
  print(tokenizer.decode(out[0], skip_special_tokens=True))

In [None]:

# Smoke-test the model on a short passage.
sample_passage = "Learning new skills takes time, but it becomes easier when you stay consistent and curious. Even small daily efforts can add up and help you understand things that once felt confusing."
chat(sample_passage)


In [None]:
# Pull a single batch from the loader for manual inspection.
first_batch = next(iter(trainDataLoader))
src, mask, trg = first_batch


In [None]:
# Inspect how token ids 3 and 1 render as text (left as the cell's last
# expression so the notebook displays the result).
tokenizer.decode(token_ids=[3, 1])

In [None]:
# Write the model into the local "data" directory.
model.save_pretrained(save_directory="data")

In [None]:
# Recursively archive the "data" directory into data.zip.
!zip -r data.zip data

In [None]:
# Imported here rather than at the top because google.colab is only
# importable inside a Colab runtime.
from google.colab.files import download
# Download the archive created by `zip -r data.zip data`; the previous name
# "Model.zip" does not match any file this notebook produces.
download("data.zip")

In [None]:
# List the working directory to check which files are present.
!ls
