In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BatchEncoding

model = AutoModelForSeq2SeqLM.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality", loss=nn.CrossEntropyLoss(ignore_index=-1))
tokenizer = AutoTokenizer.from_pretrained("ramsrigouthamg/t5-large-paraphraser-diverse-high-quality")

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device ", device)
model = model.to(device)

device  cpu


In [76]:
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [36]:
# Note, all markdown _ must be converted to * to achieve proper tokenization
# == results in infinite loop
# Convert emojis to text representation

In [104]:
def encode(context: str):
  text = "paraphrase: " + context + " </s>"
  encoding = tokenizer.encode_plus(text, max_length=128, padding=True, return_tensors="pt")
  return encoding

In [3]:
def paraphrase(encoding: BatchEncoding):
  input_ids, attention_mask = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

  model.eval()
  beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=128,
    early_stopping=True,
    num_beams=15,
    num_return_sequences=3
  )

  for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    print(sent)

In [129]:
import torch
import numpy as np

class Config:
  learning_rate = 1e-4
  train_batch_size = 2
  val_batch_size = 2
  train_epochs = 5
  val_epochs = 1
  seed = 42

  def __init__(self):
    torch.manual_seed(self.seed)
    np.random.seed(self.seed)
    torch.backends.cudnn.deterministic = True

config = Config()

In [84]:
class CustomDataset(Dataset):
  def __init__(self, dataframe, encode):
    self.encode = encode
    self.data = dataframe
    self.source = self.data['source']
    self.target = self.data['target']

  def __len__(self):
    return len(self.target)

  def __getitem__(self, index):
    source = self.encode(str(self.source[index]))
    target = self.encode(str(self.target[index]))

    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_ids = target['input_ids'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [127]:
def train(epoch, model, device, loader, optimizer):
  model.train()
  for step, data in enumerate(loader, 0):
    y = data['target_ids'].to(device, dtype=torch.long)
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone().detach()
    lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
    ids = data['source_ids'].to(device, dtype=torch.long)
    mask = data['source_mask'].to(device, dtype=torch.long)

    outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
    # print(outputs)
    loss = outputs.loss

    if step % 10 == 0:
      print({"Training Loss": loss.item()})

    if step % 500 == 0:
      print(f'Epoch: {epoch}, Loss:  {loss.item()}')
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [130]:
# https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/t5#transformers.T5ForConditionalGeneration

import torch

training_set = CustomDataset({
  "source": [
    "A flock of ==frogs== were ==roaming== around the park in search of water once more.",
    "A ==flock== of frogs were roaming around the park in ==search of water== once more."
  ],
  "target": [
    "A herd of ==frogs== were ==wandering== around the woods in search of water.",
    "A ==herd== of frogs were wandering around the woods in ==search of water==."
  ]
}, encode)
train_params = {
  'batch_size': config.train_batch_size,
  'shuffle': True,
  'num_workers': 0
}
training_loader = DataLoader(training_set, **train_params)

optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

# encoding = encode("A flock of ==frogs== were ==roaming== around the park in search of water once more.")
# paraphrase(encoding)

for epoch in range(config.train_epochs):
  train(epoch, model, device, training_loader, optimizer)

encoding = encode("A flock of ==frogs== were ==roaming== around the park in search of water once more.")
paraphrase(encoding)

{'Training Loss': 2.2327234745025635}
Epoch: 0, Loss:  2.2327234745025635
{'Training Loss': 1.7753381729125977}
Epoch: 1, Loss:  1.7753381729125977
{'Training Loss': 1.4297329187393188}
Epoch: 2, Loss:  1.4297329187393188
{'Training Loss': 0.8478027582168579}
Epoch: 3, Loss:  0.8478027582168579
{'Training Loss': 0.7149658799171448}
Epoch: 4, Loss:  0.7149658799171448
paraphrasedoutput: A herd of ==frogs== were ==roaming== around the woods looking for water once more.
paraphrasedoutput: A herd of ==frogs== were ==roaming== around the woods looking for water.
paraphrasedoutput: A herd of ==frogs== were ==roaming== around the woods looking for water again.


In [131]:
# encoding1 = encode("Once, a group of frogs were roaming around the forest in search of water.")
# paraphrase(encoding1)
# # paraphrasedoutput: A herd of frogs were wandering around the woods in search of water.
# # paraphrasedoutput: A herd of frogs was wandering around the woods in search of water.
# # paraphrasedoutput: A herd of frogs were wandering around the forest in search of water at one time.
# encoding2 = encode("Once, a group of **frogs** were **roaming** around the forest in search of water.")
# paraphrase(encoding2)
# # paraphrasedoutput: A flock of **frogs** were **roaming** around the park in search of water once more.
# # paraphrasedoutput: A pair of **frogs** were **roaming** around the park in search of water once more.
# # paraphrasedoutput: A flock of **frogs** were **roaming** around the forest in search of water once more.
# encoding3 = encode("A flock of ==frogs== were ==roaming== around the park in search of water once more.")
# paraphrase(encoding3)
# # paraphrasedoutput: A flock of ==frogs===roaming===roaming===roaming====roaming====roaming====roaming====roaming====roaming====roaming====roaming====roaming=====roaming=====ro
# # paraphrasedoutput: A flock of ==frogs===roaming===roaming===roaming====roaming====roaming====roaming====roaming====roaming====roaming=====roaming====roaming=====roaming====ro
# # paraphrasedoutput: A flock of ==frogs===roaming===roaming===roaming====roaming====roaming====roaming====roaming====roaming====roaming====roaming====roaming====roaming====ro
encoding4 = encode("A flock of frogs were roaming ==around== the park in search of water once more.")
paraphrase(encoding4)

paraphrasedoutput: A herd of frogs were wandering ==around== the woods in search of water.
paraphrasedoutput: A herd of frogs were wandering around== the woods in search of water.
paraphrasedoutput: A herd of frogs were wandering around== the woods looking for water once more.
