<a href="https://colab.research.google.com/github/VickkiMars/pegasus-paraphrase/blob/main/paraphraser-experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers transformers datasets sentencepiece



In [1]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np

In [28]:
dataset = load_dataset("HHousen/quora")
small_train_dataset = dataset["train"].select(range(10000))
print(dataset['train'][0])

{'label': 1, 'sentence1': 'What is your review of Hidden Figures -LRB- 2016 movie -RRB- ?', 'sentence2': 'What are your impressions of Hidden Figures -LRB- 2017 movie -RRB- ?', 'instance_id': 11877}


In [29]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
max_length = 128

def preprocess(examples):
    # Encode the inputs
    model_inputs = tokenizer(
        examples['sentence1'],
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    # Encode the targets (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['sentence2'],
            max_length=max_length,
            truncation=True,
            padding="max_length"
        )

    # Assign labels
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [30]:
tokenized_datasets = (
    small_train_dataset
    .filter(lambda x: x['sentence1'] is not None and x['sentence2'] is not None)
    .map(preprocess, batched=True, remove_columns=small_train_dataset.column_names)
)


Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [31]:
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [32]:
print(tokenized_datasets[0])

{'input_ids': [463, 117, 128, 933, 113, 17157, 37586, 233, 20447, 788, 121, 1448, 1397, 233, 16567, 788, 121, 110, 152, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [463, 127, 128, 12529, 113, 17157, 37586, 233, 20447, 788, 121, 1326, 1397, 233, 16567, 788, 121, 110, 152, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [33]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

st_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  pred_embeddings = st_model.encode(decoded_preds, convert_to_tensor=True)
  label_embeddings = st_model.encode(decoded_labels, convert_to_tensor=True)

  cosine_scores = util.cos_sim(pred_embeddings, label_embeddings).diagonal()
  return {"cosine_similarity": float(np.mean(cosine_scores.cpu().numpy()))}

In [35]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./pegasus-quora-paraphrase",
    learning_rate=5e-5,
    per_device_train_batch_size=4,        # adjust based on your GPU memory
    per_device_eval_batch_size=4,         # same or smaller than train batch size
    num_train_epochs=1,                   # number of training epochs
    weight_decay=0.01,                    # for regularization
    save_strategy="epoch",                # save model every epoch
    logging_dir="./logs",                 # directory for logs
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=2,                   # keep only the last 2 checkpoints
    fp16=True,                            # enable mixed precision if on GPU
)


In [36]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [37]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 2.12 MiB is free. Process 13828 has 14.74 GiB memory in use. Of the allocated memory 14.47 GiB is allocated by PyTorch, and 130.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [13]:
trainer.save_model("./pegasus-quora-paraphrase-final")
tokenizer.save_pretrained("./pegasus-quora-paraphrase-final")

('./pegasus-quora-paraphrase-final/tokenizer_config.json',
 './pegasus-quora-paraphrase-final/special_tokens_map.json',
 './pegasus-quora-paraphrase-final/spiece.model',
 './pegasus-quora-paraphrase-final/added_tokens.json')

In [17]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=cd142c516c54e780eea86f41276ad256a2447c3a36d6b948f87132a0f4face54
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [18]:
from sentence_transformers import SentenceTransformer, util

model_st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Predicted vs. reference paraphrases
preds = ["What are the best ways to learn machine learning?"]
refs = ["How can I start learning machine learning effectively?"]

# Encode and compute cosine similarity
pred_emb = model_st.encode(preds, convert_to_tensor=True)
ref_emb = model_st.encode(refs, convert_to_tensor=True)

score = util.cos_sim(pred_emb, ref_emb).item()
print(f"Semantic similarity: {score:.4f}")  # 0.0–1.0 range


Semantic similarity: 0.8458


Training SBERT for semantic similarity.

In [20]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load Quora dataset
dataset = load_dataset("HHousen/quora")  # or "HHousen/quora"

# Keep only positive and negative pairs
train_samples = []
for example in dataset["train"]:
    if example["sentence1"] and example["sentence2"]:
        train_samples.append(
            InputExample(
                texts=[example["sentence1"], example["sentence2"]],
                label=float(example["label"])  # Use 'label' instead of 'is_duplicate'
            )
        )

# Define model and loss
model = SentenceTransformer("all-MiniLM-L6-v2")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

# Train model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2302
1000,0.1863
1500,0.1642
2000,0.151
2500,0.1433
3000,0.1389
3500,0.1357
4000,0.1343
4500,0.1316
5000,0.1305


In [21]:
model.save("./sbert-quora-model")


In [22]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("./sbert-quora-model")


In [23]:
from sentence_transformers import util

sentences = [
    "How can I learn machine learning?",
    "What are the best ways to study machine learning?",
    "What is the capital of France?",
]

embeddings = model.encode(sentences, convert_to_tensor=True)

# Compare similarity
sim_1_2 = util.cos_sim(embeddings[0], embeddings[1]).item()
sim_1_3 = util.cos_sim(embeddings[0], embeddings[2]).item()

print(f"Similarity (ML vs ML): {sim_1_2:.4f}")
print(f"Similarity (ML vs France): {sim_1_3:.4f}")


Similarity (ML vs ML): 0.8609
Similarity (ML vs France): 0.0527


In [27]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

tokenizer = PegasusTokenizer.from_pretrained("./pegasus-quora-paraphrase-final")
model = PegasusForConditionalGeneration.from_pretrained("./pegasus-quora-paraphrase-final")

input_text = "How can I improve my communication skills"
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Generate paraphrase
outputs = model.generate(**inputs, max_length=64, num_beams=5)
paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Input:", input_text)
print("Pegasus paraphrase:", paraphrase)

Input: How can I improve my communication skills
Pegasus paraphrase: How can I improve my communication skills?
