# Goal 
- This notebook compares a few different paraphrasing and entity generation methods for covering variants when training LMs 

In [1]:
main_input = "Would you like some pizza?"

### Pegasus

In [12]:
#generate some candidates 
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_text,num_return_sequences,num_beams):
  #Beam search can work very well in tasks where the length of the desired generation is more or less predictable as in machine translation or summarization
  batch = tokenizer([input_text],truncation=True,padding='longest',max_length=60, return_tensors="pt").to(torch_device)
  translated = model.generate(**batch,max_length=60,num_beams=num_beams, num_return_sequences=num_return_sequences, temperature=1.5)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  return tgt_text


In [13]:
candidates = get_response(main_input, 10, 20)
candidates

['Would you like to eat pizza?',
 'Do you like pizza?',
 'Are you a fan of pizza?',
 'Is there any pizza you would like?',
 'Is there anything you would like to eat?',
 'Would you like a meal?',
 'Would you like to eat some pizza?',
 'Would you like pizza?',
 'Would you like some pizza?',
 'Would you like some food?']

In [14]:
# requires tweaking above works nice 
# Candidate generation with Parrot 
#     Adequacy (Is the meaning preserved adequately?)
#     Fluency (Is the paraphrase fluent English?)
#     Diversity (Lexical / Phrasal / Syntactical) (How much has the paraphrase changed the original sentence?)
# Parrot offers knobs to control Adequacy, Fluency and Diversity as per your needs.

from parrot import Parrot
import torch

''' 
uncomment to get reproducable paraphrase generations
def random_state(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

random_state(1234)
'''

#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

phrases = [main_input]

for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = parrot.augment(input_phrase=phrase)
  for para_phrase in para_phrases:
   print(para_phrase)

----------------------------------------------------------------------------------------------------
Input_phrase:  Would you like some pizza?
----------------------------------------------------------------------------------------------------
('do you like pizza?', 21)
('would you like to have some pizza?', 20)
('would you like pizza?', 17)
('would you like to have pizza?', 17)


In [17]:
# ANN Test 
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#Sentences we want to encode. Example:
input_embedding = model.encode(main_input, convert_to_tensor=True)
candidate_embeddings = model.encode(candidates, convert_to_tensor=True)

#semantic similarities  
cosine_scores = util.cos_sim(input_embedding, candidate_embeddings)
cosine_scores

tensor([[0.8924, 0.8239, 0.7285, 0.8555, 0.4831, 0.6360, 0.9174, 0.9629, 1.0000,
         0.7217]])

In [21]:
# Find N nearests neighbors 
print (main_input)
n=5
value, indices = cosine_scores.sort()
for idx in indices[0][-n:]: 
    print (cosine_scores[0][idx], candidates[idx])

Would you like some pizza?
tensor(0.8555) Is there any pizza you would like?
tensor(0.8924) Would you like to eat pizza?
tensor(0.9174) Would you like to eat some pizza?
tensor(0.9629) Would you like pizza?
tensor(1.0000) Would you like some pizza?


In [68]:
# generate some candidates
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from sentence_transformers import SentenceTransformer, util

#load models 
model_name = "tuner007/pegasus_paraphrase"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
paratrf = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
sentrf = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [69]:
def paraphrase(text):
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"


    def get_response(input_text, num_return_sequences, num_beams):
        # Beam search can work very well in tasks where the length of the desired generation is more or less predictable as in machine translation or summarization
        batch = tokenizer(
            [input_text],
            truncation=True,
            padding="longest",
            max_length=60,
            return_tensors="pt",
        ).to(torch_device)
        translated = paratrf.generate(
            **batch,
            max_length=60,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
            temperature=1.5
        )
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        return tgt_text

    # generate candidates
    candidates = get_response(text, 10, 20)

    # ranking model
    # Sentences we want to encode. Example:
    input_embedding = sentrf.encode(text, convert_to_tensor=True)
    candidate_embeddings = sentrf.encode(candidates, convert_to_tensor=True)

    # semantic similarities
    cosine_scores = util.cos_sim(input_embedding, candidate_embeddings)
    
    # sort scores 
    value, indices = cosine_scores.topk(n)
    output = ""
    for idx in indices[0][-5:]:
        output += " Score:" + str(cosine_scores[0][idx]) + "\t\t\t" + candidates[idx] + "\n"
    return output


print(paraphrase(main_input))



 Score:tensor(1.0000)			Would you like some pizza?
 Score:tensor(0.9629)			Would you like pizza?
 Score:tensor(0.9174)			Would you like to eat some pizza?
 Score:tensor(0.8924)			Would you like to eat pizza?
 Score:tensor(0.8555)			Is there any pizza you would like?



In [None]:
# For every possible recommendation, users can approve or reject 
# We want to take this into consideration and thers a few ways how
#1. Create rejected emebeddings, avoid showing recs with semantic similarity to rejections 

In [22]:
# best results are coming from OpenAI so let's use that 

import os
import openai

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
response = openai.Completion.create(model="text-davinci-003", prompt="Generate this is a test", temperature=0.1, max_tokens=7)
response 


<OpenAIObject text_completion id=cmpl-6flGYGGQZream5dnD5DowepmMqkBu at 0x1041c5680> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": "\n\nThis is indeed a test"
    }
  ],
  "created": 1675411158,
  "id": "cmpl-6flGYGGQZream5dnD5DowepmMqkBu",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 7,
    "prompt_tokens": 6,
    "total_tokens": 13
  }
}

In [26]:
#2 Entity Example Generation 
entity = "rabbit"
n = 5 
entity_prompt = f"Create {n} examples of the following entity: \n '{entity}'" 
response = openai.Completion.create(model="text-davinci-003", prompt=entity_prompt, temperature=0.25, max_tokens=100)
print (response['choices'][0]['text'])



1. Flopsy, a white lop-eared rabbit who loves to eat carrots.
2. Thumper, a brown and white spotted rabbit who loves to hop around the garden.
3. Bun Bun, a grey and white Dutch rabbit who loves to cuddle.
4. Cotton Tail, a white and brown Dutch rabbit who loves to explore.
5. Snowball, an albino rabbit who loves to play in the snow.


In [27]:
#Response Variant Generation 
response = "I'm not sure I can help you with that! Would you like to purchase pizza instead?"
variable = "food"


n = 2
if variable:
    response_prompt = f"create {n} sentences similar to:\n '{response}' and keep {variable} in the new sentences." 
else: 
    response_prompt = f"create {n} sentences similar to:\n '{response}'" 
    
response = openai.Completion.create(model="text-davinci-003", prompt=response_prompt, temperature=0.0, max_tokens=250)
print (response['choices'][0]['text'])




'I'm afraid I can't assist you with that! Would you like to order Chinese food instead?' 

'I'm sorry I can't be of more help! Would you like to get some tacos instead?'


In [44]:
%load_ext gradio

The gradio extension is already loaded. To reload it, use:
  %reload_ext gradio


In [42]:
# openAI request function for entities + paraphrases or both 
import openai 
import json 
openai.api_key = os.getenv("OPENAI_API_KEY")

def request_davinci(paraphrase, utterance, entity, n=1, temperature=0.25, max_tokens=100):
    if paraphrase:
        # 3 Response Variant Generation
        if entity:
            prompt = f"create {n} sentences similar to:\n '{utterance}' and keep {entity} in the new sentences."
        else:
            prompt = f"create {n} sentences similar to:\n '{utterance}'"
    else:
        prompt = f"Create {n} examples of the following: \n '{entity}'"

    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response["choices"][0]["text"]


request_davinci(paraphrase=False, utterance="Hello, I would like to buy a coffee.", entity=None, n=4)

'\n\n1. None of the students in the class passed the exam.\n2. I have none of the answers to the questions.\n3. She had none of the ingredients to make the cake.\n4. None of the cars in the lot were for sale.'

In [44]:
#gradio demo 
# %%blocks 
import gradio as gr

gr.Interface(request_davinci, 
[gr.inputs.Checkbox(default=False), 
gr.Text(value=None), 
gr.Text(value=None), 
gr.Slider(minimum=1, maximum=10, step=1), 
gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2), 
gr.Slider(minimum=0, maximum=1000, value=500)],
"text"
).launch()



Running on local URL:  http://127.0.0.1:7883

To create a public link, set `share=True` in `launch()`.




In [53]:

#OPEN AI 
# Understand options and how to create better prompts witht hem 
# Take into consideration of entity, try using entity in prompt name 

# GradIO options 
#1. Choose N return statements  

#2. Choose generation type above 1-3 
#3. Create a text 
#4. Generate N utterances  doone




gr.Markdown("# Paraphrase Generator")


generation_type = gr.Dropdown(["similar sentences", "example entities"])
inp = gr.Textbox(placeholder="Enter something to paraphrase")
entity = gr.Textbox(placeholder="Enter any entities you want to see in the paraphrase")
create_n = gr.Number(label="n") 
out = gr.Textbox()

# inp.submit(fn=lambda x: "hello", 
#            inputs=inp, 
#            outputs=out)


Running on local URL:  http://127.0.0.1:7860


