In [None]:
# Evaluate different paraphrasing models (fine-tuned GPT-2 variants, PEGASUS, BART) and rank their outputs.

In [None]:
# pip install relevant packages
!pip install transformers pandas datasets pynvml huggingface_hub sentence-transformers rouge_score 

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
from datasets import load_metric

from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
from numpy import dot
from numpy.linalg import norm
import os
import pandas as pd
import re
import seaborn as sns

from transformers import(
       AutoModelWithLMHead,
       AutoConfig,
       Trainer,
       AutoTokenizer,
       TextDataset,
       DataCollatorForLanguageModeling,
       TrainingArguments,
       pipeline
)
import gc
import torch

In [None]:
# Metrics used to rank paraphrases: Universal Sentence Encoder (USE) similarity, ROUGE-L, BLEU.
# BLEU references: https://www.digitalocean.com/community/tutorials/bleu-score-in-python, https://www.geeksforgeeks.org/nlp-bleu-score-for-evaluating-neural-machine-translation-python/
# USE: load the encoder from TensorFlow Hub once; `embed` below reuses the loaded module.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
sentence_similarity_model = hub.load(module_url)
def embed(input):
  """Return the output of the loaded sentence encoder for `input`.

  Callers in this notebook pass a one-element list of strings and take
  element 0 of the result as that sentence's embedding.
  """
  return sentence_similarity_model(input)
# ROUGE: load the metric once; `rank_paraphrases` reads its 'rougeL' entry.
# NOTE(review): `load_metric` is reported as deprecated by newer `datasets`
# releases — confirm against the installed version.
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
def paraphrase_new(text, model, tokenizer):
  """Generate candidate paraphrases of `text` with a fine-tuned causal LM.

  The prompt is `<s>{text}</s>>>>><p>`; whatever follows that separator in a
  decoded sequence, up to the next `</`, is taken as one paraphrase.

  Args:
    text: sentence to paraphrase.
    model: model exposing `generate(...)` and a `device` attribute
      (Hugging Face `PreTrainedModel` instances provide both).
    tokenizer: tokenizer matching `model`; it encodes the prompt and decodes
      every generated sequence.

  Returns:
    A list with one paraphrase string per generated sequence
    (`num_return_sequences=10` are requested).

  Raises:
    IndexError: if a decoded sequence does not contain the prompt separator.
  """
  separator = "</s>>>>><p>"
  encoded = tokenizer(f"<s>{text}{separator}", return_tensors='pt')
  # Put the prompt on the model's own device instead of a notebook-level
  # `device` variable, so the function does not depend on cell execution order.
  input_ids = encoded['input_ids'].to(model.device)
  # NOTE(review): `temperature` is normally only applied when sampling; with
  # beam search and no `do_sample=True` it presumably has no effect — confirm.
  samples = model.generate(
      input_ids,
      max_length=70,
      temperature=1.7,
      num_beams=10,
      num_return_sequences=10,
  )
  paraphrases = []
  for sample in samples:
    decoded = tokenizer.decode(sample, skip_special_tokens=True)
    # Keep the text after the prompt separator and cut it at the first "</".
    paraphrases.append(decoded.split(separator)[1].split("</")[0])
  return paraphrases


In [None]:
def rank_paraphrases(input_text, paraphrases):
  """Score each paraphrase against `input_text` and order them by similarity.

  For every paraphrase two scores are computed:
    * the cosine similarity between the `embed` vectors of the input and the
      paraphrase, and
    * the ROUGE-L F-measure (`mid` aggregate) with the input as reference.

  Returns:
    A list of `[paraphrase, cosine_similarity, rouge_l_fmeasure]` rows,
    sorted by cosine similarity in descending order.
  """
  def _cosine(u, v):
    # Cosine similarity of two embedding vectors.
    return dot(u, v)/(norm(u)*norm(v))

  # The input sentence is embedded once and reused for every candidate.
  reference_vec = embed([input_text])[0]
  scored = []
  for candidate in paraphrases:
    candidate_vec = embed([candidate])[0]
    similarity = _cosine(reference_vec, candidate_vec)
    # ROUGE-L walkthrough: https://www.youtube.com/watch?v=TMshhnrEXlg
    rouge_scores = rouge.compute(predictions=[candidate], references=[input_text])
    rouge_l = rouge_scores['rougeL'].mid.fmeasure
    scored.append([candidate, float(similarity), float(rouge_l)])

  # Most similar paraphrase first.
  return sorted(scored, key=lambda row: row[1], reverse=True)

Load the evaluation dataset.
The same 100 sentences are used for each model.

In [None]:
# Read the evaluation sentences: one sentence per line of eval.txt.
with open("eval.txt", "r") as f:
  # Strip only the line terminator. Slicing off the last character of every
  # line would also eat the final character of a last line without a newline.
  sentences = [line.rstrip("\n") for line in f]
# Skip blank lines so that no empty "sentence" is sent to the models.
sentences = [sentence for sentence in sentences if sentence.strip()]
sentences

Now, build a pipeline to analyse any model.

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
import json
def write_to_json(name, d):
  """Serialise `d` to JSON and write it to `<name>.json`.

  Args:
    name: output path without the `.json` extension.
    d: JSON-serialisable object.
  """
  # Serialise before opening the file, so an unserialisable object fails
  # without touching an existing file.
  payload = json.dumps(d)
  destination = f'{name}.json'
  with open(destination, 'w') as sink:
    sink.write(payload)

In [None]:
# GPT-2 family: run the same evaluation for each fine-tuned checkpoint size.
models = ["gpt2", "gpt2-medium", "gpt2-large"]
for model in models:
  # Identifier passed to `from_pretrained` for this size's paraphraser.
  model_name = f"SRM47/{model}-paraphraser"
  paraphraser = AutoModelWithLMHead.from_pretrained(model_name).to(device)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  # Ranked candidates for every evaluation sentence, keyed by the sentence.
  gpt2_res = {}
  for text in sentences:
    p = paraphrase_new(text, paraphraser, tokenizer)
    ps = rank_paraphrases(text, p)
    gpt2_res[text] = ps
  # One results file per checkpoint: "<size>-paraphraser.json".
  write_to_json(f"{model}-paraphraser", gpt2_res)


In [None]:
# PEGASUS
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint that both the tokenizer and the model are loaded from.
model_name = 'tuner007/pegasus_paraphrase'
# Plain device string; `get_response` moves its tokenized batch to it.
if torch.cuda.is_available():
  torch_device = 'cuda'
else:
  torch_device = 'cpu'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)
def get_response(input_text,num_return_sequences,num_beams):
  """Generate candidate rewrites of `input_text` with the notebook-level model.

  `tokenizer`, `model` and `torch_device` are read from the notebook's global
  namespace at call time, so rebinding `model`/`tokenizer` in a later cell
  changes which model this function uses.

  Args:
    input_text: sentence to rewrite; it is truncated to 60 tokens.
    num_return_sequences: number of sequences requested from `generate`.
    num_beams: beam width passed to `generate`.

  Returns:
    The decoded sequences as a list of strings, special tokens removed.
  """
  encoded = tokenizer(
      [input_text],
      truncation=True,
      padding='longest',
      max_length=60,
      return_tensors="pt",
  ).to(torch_device)
  generated = model.generate(
      **encoded,
      max_length=60,
      num_beams=num_beams,
      num_return_sequences=num_return_sequences,
      temperature=1.5,
  )
  return tokenizer.batch_decode(generated, skip_special_tokens=True)
# Beam-search settings: ten beams, and ten sequences requested per sentence.
num_beams = num_return_sequences = 10
# Ranked candidates for every evaluation sentence, keyed by the sentence.
pegasus_res = {}
for text in sentences:
  p = get_response(text, num_return_sequences=num_return_sequences, num_beams=num_beams)
  ps = rank_paraphrases(text, p)
  pegasus_res[text] = ps


Downloading:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

In [None]:
# Display the PEGASUS results: sentence -> ranked [paraphrase, USE cosine, ROUGE-L] rows.
pegasus_res

In [None]:
# Save the PEGASUS rankings to pegasus-paraphraser.json.
write_to_json("pegasus-paraphraser", pegasus_res)

In [None]:
# BART
from transformers import BartForConditionalGeneration, BartTokenizer

# Rebinding the notebook-level `model` and `tokenizer` is what makes the
# `get_response` calls that follow use BART: that function reads both names
# from the global namespace when it is called.
model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
# Same selection rule as the earlier `device` cell (GPU if available, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
# Beam-search settings: ten beams, and ten sequences requested per sentence.
num_beams = num_return_sequences = 10
# Ranked candidates for every evaluation sentence, keyed by the sentence.
bart_res = {}
for text in sentences:
  p = get_response(text, num_return_sequences=num_return_sequences, num_beams=num_beams)
  ps = rank_paraphrases(text, p)
  bart_res[text] = ps


Downloading:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/332 [00:00<?, ?B/s]

In [None]:
# Display the BART results: sentence -> ranked [paraphrase, USE cosine, ROUGE-L] rows.
bart_res

In [None]:
# Save the BART rankings to bart-paraphraser.json.
write_to_json("bart-paraphraser", bart_res)