<a href="https://colab.research.google.com/github/VickkiMars/pegasus-paraphrase/blob/main/Paraphraser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np

In [None]:
# Load the "HHousen/quora" sentence-pair dataset and keep a handle on its train split.
dataset = load_dataset("HHousen/quora")
train_split = dataset["train"]

# Fine-tuning uses only the first 15,000 training rows.
small_train_dataset = train_split.select(range(15000))

# Show one raw record to confirm the column names.
print(train_split[0])

{'label': 1, 'sentence1': 'What is your review of Hidden Figures -LRB- 2016 movie -RRB- ?', 'sentence2': 'What are your impressions of Hidden Figures -LRB- 2017 movie -RRB- ?', 'instance_id': 11877}


In [None]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
max_length = 128

def preprocess(examples, ignore_pad_token_for_loss=False):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        examples: Batch mapping with ``'sentence1'`` (source texts) and
            ``'sentence2'`` (target texts).
        ignore_pad_token_for_loss: When True, padded label positions are
            replaced with -100, the label value that Hugging Face seq2seq
            models exclude from the loss. Any code that later decodes the
            labels must map -100 back to the pad id first. Defaults to
            False, which keeps the pad id in the labels exactly as before.

    Returns:
        The tokenizer output for the sources with an added ``labels`` entry
        holding the target token ids. Both sides are truncated/padded to the
        module-level ``max_length``.
    """
    # A single call encodes sources and targets; ``text_target`` replaces the
    # deprecated ``as_target_tokenizer()`` context manager and stores the
    # target ids under "labels".
    model_inputs = tokenizer(
        examples['sentence1'],
        text_target=examples['sentence2'],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # NOTE(review): with the default (False) every padded label position is
    # still scored by the loss, which deflates the reported training loss.
    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [(token_id if token_id != pad_id else -100) for token_id in label_ids]
            for label_ids in model_inputs["labels"]
        ]

    return model_inputs


In [None]:
def _has_both_sentences(row):
    """Return True when neither sentence of the pair is None."""
    return row['sentence1'] is not None and row['sentence2'] is not None


# Drop incomplete pairs first, then tokenize in batches and discard the raw
# text columns so only the tokenizer outputs remain.
complete_pairs = small_train_dataset.filter(_has_both_sentences)
tokenized_datasets = complete_pairs.map(
    preprocess,
    batched=True,
    remove_columns=small_train_dataset.column_names,
)


Filter:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the tokenized pairs for evaluation. The seed is fixed so the
# same rows land in each split on every run; without it a re-run reshuffles the
# split and a later evaluation can score rows the model was trained on.
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
# Inspect the first tokenized example (input_ids, attention_mask, labels).
print(tokenized_datasets[0])

{'input_ids': [463, 117, 128, 933, 113, 17157, 37586, 233, 20447, 788, 121, 1448, 1397, 233, 16567, 788, 121, 110, 152, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [463, 127, 128, 12529, 113, 17157, 37586, 233, 20447, 788, 121, 1326, 1397, 233, 16567, 788, 121, 110, 152, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained google/pegasus-large checkpoint as a sequence-to-sequence LM;
# this is the model passed to the Trainer for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large")

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [15]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

st_model = SentenceTransformer("all-MiniLM-L6-v2")

def compute_metrics(eval_pred):
  """Score generated paraphrases by mean cosine similarity to the references.

  Args:
    eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Predictions
      must be generated token ids (i.e. the trainer runs with
      ``predict_with_generate=True``), not logits.

  Returns:
    ``{"cosine_similarity": float}`` - the mean similarity between each decoded
    prediction and its own decoded reference, embedded with ``st_model``.
  """
  predictions, labels = eval_pred

  # Some models hand back a tuple of outputs; the token ids come first.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  # -100 is the ignore/padding sentinel used for labels and gathered batches;
  # the tokenizer cannot decode it, so map it back to the pad id first.
  pad_id = tokenizer.pad_token_id
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  pred_embeddings = st_model.encode(decoded_preds, convert_to_tensor=True)
  label_embeddings = st_model.encode(decoded_labels, convert_to_tensor=True)

  # cos_sim returns the full prediction-by-reference matrix; the diagonal holds
  # the score of each prediction against its matching reference.
  cosine_scores = util.cos_sim(pred_embeddings, label_embeddings).diagonal()
  return {"cosine_similarity": float(np.mean(cosine_scores.cpu().numpy()))}

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
from transformers import TrainingArguments

# Hyper-parameters for the Pegasus fine-tuning run.
# NOTE(review): no `report_to` is set, so any installed logging integration
# (e.g. Weights & Biases) may prompt for credentials when training starts and
# stall an unattended "Run All" - confirm whether that is intended.
# NOTE(review): the Pegasus model documentation lists fp16 as unsupported;
# confirm the loss stays finite, or fall back to bf16/fp32.
training_args = TrainingArguments(
    output_dir="./pegasus-quora-paraphrase",
    learning_rate=5e-5,
    per_device_train_batch_size=4,        # adjust based on your GPU memory
    per_device_eval_batch_size=4,         # same or smaller than train batch size
    num_train_epochs=1,                   # number of training epochs
    weight_decay=0.01,                    # for regularization
    save_strategy="epoch",                # save model every epoch
    logging_dir="./logs",                 # directory for logs
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=2,                   # keep only the last 2 checkpoints
    fp16=torch.cuda.is_available(),       # mixed precision only when a CUDA GPU is present
)


In [17]:
# Wire the model, hyper-parameters and the train/eval splits into a Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a warning here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [18]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmartin_s[0m ([33mmartin_s-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,9.8498
200,5.728
300,0.675
400,0.2757
500,0.2677
600,0.267
700,0.2575
800,0.2549
900,0.2584
1000,0.2476




TrainOutput(global_step=3375, training_loss=0.7095226864284939, metrics={'train_runtime': 1260.17, 'train_samples_per_second': 10.713, 'train_steps_per_second': 2.678, 'total_flos': 4875971198976000.0, 'train_loss': 0.7095226864284939, 'epoch': 1.0})

In [19]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so it can later be reloaded with from_pretrained.
trainer.save_model("./pegasus-quora-paraphrase-final")
tokenizer.save_pretrained("./pegasus-quora-paraphrase-final")

('./pegasus-quora-paraphrase-final/tokenizer_config.json',
 './pegasus-quora-paraphrase-final/special_tokens_map.json',
 './pegasus-quora-paraphrase-final/spiece.model',
 './pegasus-quora-paraphrase-final/added_tokens.json')

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=cd142c516c54e780eea86f41276ad256a2447c3a36d6b948f87132a0f4face54
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to compare a prediction with its reference.
model_st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One predicted paraphrase and one reference paraphrase
preds = ["What are the best ways to learn machine learning?"]
refs = ["How can I start learning machine learning effectively?"]

# Embed both lists as tensors
pred_emb = model_st.encode(preds, convert_to_tensor=True)
ref_emb = model_st.encode(refs, convert_to_tensor=True)

# With a single pair the similarity matrix is 1x1, so .item() yields the score.
score = util.cos_sim(pred_emb, ref_emb).item()
print(f"Semantic similarity: {score:.4f}")  # cosine similarity lies in [-1, 1]


Semantic similarity: 0.8458


Training SBERT for semantic similarity.

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import load_dataset

# Reload the Quora sentence-pair dataset (this rebinds `dataset`).
dataset = load_dataset("HHousen/quora")

# Build one InputExample per training row that has two non-empty sentences.
# Every labelled pair is kept; the integer `label` column is cast to float and
# used as the target similarity.
train_samples = [
    InputExample(
        texts=[row["sentence1"], row["sentence2"]],
        label=float(row["label"]),
    )
    for row in dataset["train"]
    if row["sentence1"] and row["sentence2"]
]

# Sentence encoder, shuffled mini-batches of 32, and a cosine-similarity loss.
# NOTE(review): this rebinds `model`, which earlier cells use for the Pegasus
# network - later cells must reload Pegasus before using `model` again.
model = SentenceTransformer("all-MiniLM-L6-v2")
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune the encoder for a single epoch.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.2302
1000,0.1863
1500,0.1642
2000,0.151
2500,0.1433
3000,0.1389
3500,0.1357
4000,0.1343
4500,0.1316
5000,0.1305


In [None]:
# Write the fine-tuned SBERT model to disk.
model.save("./sbert-quora-model")


In [None]:
from sentence_transformers import SentenceTransformer
# Reload the saved SBERT model from disk (rebinds `model`).
model = SentenceTransformer("./sbert-quora-model")


In [None]:
from sentence_transformers import util

# Two paraphrases of the same question plus one unrelated question.
sentences = [
    "How can I learn machine learning?",
    "What are the best ways to study machine learning?",
    "What is the capital of France?",
]

# Embed all three sentences in one call.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Score the first sentence against each of the other two.
anchor_embedding = embeddings[0]
sim_1_2 = util.cos_sim(anchor_embedding, embeddings[1]).item()
sim_1_3 = util.cos_sim(anchor_embedding, embeddings[2]).item()

print(f"Similarity (ML vs ML): {sim_1_2:.4f}")
print(f"Similarity (ML vs France): {sim_1_3:.4f}")


Similarity (ML vs ML): 0.8609
Similarity (ML vs France): 0.0527


In [24]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Reload the fine-tuned Pegasus tokenizer and model from the directory saved
# earlier (rebinds `tokenizer` and `model`).
tokenizer = PegasusTokenizer.from_pretrained("./pegasus-quora-paraphrase-final")
model = PegasusForConditionalGeneration.from_pretrained("./pegasus-quora-paraphrase-final")

# Multi-sentence passage used as the paraphrase input.
# NOTE(review): fine-tuning used single Quora questions capped at 128 tokens,
# so a multi-sentence passage may be outside what the model learned - confirm.
input_text = "A single experimental result can suggest a phenomenon, but robust scientific conclusions require replication, well-powered samples, and careful control of confounds. Studies with small sample sizes or multiple uncorrected comparisons often report results that fail to replicate under stricter conditions. Consequently, researchers should preregister designs, share raw data when possible, and interpret exploratory findings with caution until corroborated by independent work"
# Encode as PyTorch tensors; no max_length is given here.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

# Generate with 5-beam search, capped at 128 tokens, and decode the first sequence
outputs = model.generate(**inputs, max_length=128, num_beams=5)
paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Input:", input_text)
print("Pegasus paraphrase:", paraphrase)


Input: A single experimental result can suggest a phenomenon, but robust scientific conclusions require replication, well-powered samples, and careful control of confounds. Studies with small sample sizes or multiple uncorrected comparisons often report results that fail to replicate under stricter conditions. Consequently, researchers should preregister designs, share raw data when possible, and interpret exploratory findings with caution until corroborated by independent work
Pegasus paraphrase: A single experimental result can suggest a phenomenon, but robust scientific conclusions require replication, well-powered samples, and careful control of confounds.


In [25]:
# Destination directory for a copy of the fine-tuned model and tokenizer.
# NOTE(review): no Google Drive mount is visible in this notebook; if Drive is
# not mounted this path is presumably just a local directory - confirm.
save_path = "/content/drive/MyDrive/pegasus-paraphrase-model"

# Save the currently loaded model and tokenizer to that directory.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")


✅ Model and tokenizer saved to: /content/drive/MyDrive/pegasus-paraphrase-model


In [27]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Arguments for the Seq2SeqTrainer built next (rebinds `training_args`).
# `predict_with_generate=True` makes evaluation produce generated token ids,
# which is what `compute_metrics` decodes.
training_args = Seq2SeqTrainingArguments(
    output_dir="./pegasus-quora-paraphrase",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    predict_with_generate=True,  # required for decoding predictions
    learning_rate=5e-5,
    num_train_epochs=1,
    save_total_limit=1,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
)

# Seq2SeqTrainer around the reloaded Pegasus model, scored with the
# cosine-similarity metric. The tokenizer is passed as `processing_class`:
# the older `tokenizer=` keyword is deprecated in recent transformers releases
# and emits a warning here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [28]:
# Evaluate on eval_dataset; the returned dict holds the loss plus the
# cosine_similarity metric from compute_metrics (keys are prefixed "eval_").
results = trainer.evaluate()
print(results)

{'eval_loss': 0.21873046457767487, 'eval_model_preparation_time': 0.0366, 'eval_cosine_similarity': 0.7172814011573792, 'eval_runtime': 592.0052, 'eval_samples_per_second': 2.534, 'eval_steps_per_second': 1.267}
