# Imports

In [1]:
# !pip install nlp
# !pip install bert_score
# !pip install git+https://github.com/google-research/bleurt.git
from pathlib import Path

import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc
import pandas as pd
import asyncio
import aiohttp
import json
from aiohttp import ClientConnectorError, ClientSession
from nlp import load_metric
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Datapaths

In [2]:
# Directory that holds every input file used by this notebook.
data_path = Path("../data")
assert data_path.exists()
# textgen_path = Path("../data/textgen")
# assert textgen_path.exists()

# CSV with the messages and their ground-truth labels.
ground_truth_csv = data_path.joinpath("predictions_with_groundtruth.csv")
assert ground_truth_csv.exists()

# JSON file read later for the "api_key" entry.
keys_path = Path("key.json")
assert keys_path.exists()

# For evaluation we're going to use BERTScore. 
BERTScore uses contextual word embeddings for similarity matching.

In [3]:
# Load the "bertscore" metric via nlp.load_metric; it is used for the scoring call below.
bert_score = load_metric("bertscore")

In [4]:
# Smoke test: score a single candidate sentence against a single reference sentence.
y_pred = ["This is a test sentence"]
y = ["This is another test sentence"]
# lang="en" is forwarded to the metric (presumably selects the English model — TODO confirm).
print(bert_score.compute(y_pred, y, lang="en"))

{'precision': tensor([0.9669]), 'recall': tensor([0.9669]), 'f1': tensor([0.9669]), 'hashcode': 'roberta-large_L17_no-idf_version=0.3.5(hug_trans=3.0.2)'}


# Data : The reference data will be sentences from "Refunds" from the intent tagging dataset
That gives us around 275 samples

In [125]:
# Keep only the id, text and label columns and drop exact duplicate rows.
intents = pd.read_csv(ground_truth_csv)[
    ["MessageId", "Message", "Ground Truth"]
].drop_duplicates()
# intents = intents[intents["Ground Truth"] == "Refunds"]

# test_size=0.75 puts three quarters of the rows into intents_test.
# random_state pins the shuffle so the split (and everything sampled from it
# later) is reproducible across kernel restarts.
intents_train, intents_test = train_test_split(
    intents, test_size=0.75, random_state=42
)
len(intents_train), len(intents_test)

(886, 2661)

# NLP AUG
To create a reference set for how the new paraphrased sentences should perform, we will try to generate 1000 sentences from NLP Aug and GPT-3 and check the BERTScore of the generated text.

# Approach 1 :  Generating prompt using keyword
Here we will pass the keyword "refunds" and generate the phrasing

In [156]:
def text_generation_prompt(train: pd.DataFrame) -> str:
    """Render labelled messages as a few-shot prompt.

    Every row of ``train`` becomes a two-line example: the row's
    "Ground Truth" value as the keyword, followed by its "Message" value as
    the generated sentence. Examples are separated by a blank line.

    Args:
        train: Frame with "Ground Truth" and "Message" columns.

    Returns:
        The concatenated examples; an empty string when ``train`` has no rows.
    """
    examples = []
    for _, row in train.iterrows():
        keyword = row["Ground Truth"]
        sentence = row.Message
        examples.append(f"Keyword : {keyword} \nGenerated Sentence : {sentence}\n")
    return "\n".join(examples)

In [157]:
# Preview the few-shot prompt built from 30 randomly sampled rows of intents_test.
text_generation_prompt(intents_test.sample(30))

"Keyword : CashOnDelivery \nGenerated Sentence : This was the cash on delivery order which I placed before lockdown. Today I made a payment against this order to convert it to prepaid status and the amount has been deducted from my account\n\nKeyword : Courier \nGenerated Sentence : regarding misbehaviour of courier agent. he refused to come at my place he was asking to come to their office and collect it. it almst take 2 hrs to reach there. this is your customer service??\n\nKeyword : Cancellations \nGenerated Sentence : I requested refund but i received my order today so please cancel the refund\n\nKeyword : MissingItem \nGenerated Sentence : product is missing\n\nKeyword : Cancellations \nGenerated Sentence : I would like to cancel my orders due to the curent lockdown\n\nKeyword : DeliveryFailure \nGenerated Sentence : I have received failed delivery of my 2 products please check if I can get refund or product\n\nKeyword : Refunds \nGenerated Sentence : actually i ordered two produc

In [161]:
async def fetch_GPT3_completion_response(
    url: str, session: ClientSession, prompt, query
) -> dict:
    """Request keyword-conditioned sentences from the completions endpoint.

    A few-shot prompt is built with ``text_generation_prompt`` from up to 30
    rows sampled from ``prompt``, ``Keyword : {query}`` is appended, and the
    result is POSTed to ``url``. The completion text is split on blank lines
    and the text following "Generated Sentence :" is kept from each chunk.

    Args:
        url: Completions endpoint to POST to.
        session: Open aiohttp session used to send the request.
        prompt: Frame of example rows; sampled and passed to
            ``text_generation_prompt``.
        query: Keyword to generate sentences for.

    Returns:
        ``{"query": query, "output": [sentence, ...]}`` on success. If the
        connection fails or the response carries no completion text,
        ``"output"`` is an empty list and an ``"error"`` entry holds the
        failure detail, so one bad request does not abort a whole batch.
    """
    marker = "\nGenerated Sentence :"

    # read_text() opens and closes the key file; the previous bare open() leaked it.
    api_key = json.loads(keys_path.read_text())["api_key"]
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # Cap the sample size so frames with fewer than 30 rows still work.
    few_shot = text_generation_prompt(prompt.sample(min(30, len(prompt))))
    payload = json.dumps(
        {
            "prompt": few_shot + f"\nKeyword : {query}",
            "max_tokens": 500,
            "temperature": 0.65,
            "frequency_penalty": 0.80,
            "presence_penalty": 0.80,
        }
    )

    try:
        # The context manager releases the connection even if decoding fails.
        async with session.request(
            method="POST", url=url, headers=headers, data=payload
        ) as resp:
            r = await resp.json()
    except ClientConnectorError as exc:
        # Previously referenced undefined names (row, query_intent) -> NameError.
        return {"query": query, "output": [], "error": str(exc)}

    try:
        text = r["choices"][0]["text"]
    except (KeyError, IndexError, TypeError):
        # Error payloads (e.g. auth or rate-limit responses) carry no "choices".
        return {"query": query, "output": [], "error": r}

    output = []
    for chunk in text.split("\n\n"):
        # partition() tells us whether the marker was present; the old
        # find()-based slice cut 20 characters off chunks without a marker.
        _, found, sentence = chunk.partition(marker)
        output.append(sentence if found else chunk)
    return {"query": query, "output": output}

In [162]:
async def make_requests_completions(
    query_list,
    intent_prompt,
    url: str = "https://api.openai.com/v1/engines/curie/completions",
) -> list:
    """Send one completion request per keyword and collect the responses.

    Args:
        query_list: Keywords to generate sentences for.
        intent_prompt: Example rows forwarded to every request as ``prompt``.
        url: Completions endpoint; defaults to the curie engine URL.

    Returns:
        The per-request results in the order the requests finish, which may
        differ from the order of ``query_list``.
    """
    async with ClientSession() as session:
        tasks = [
            fetch_GPT3_completion_response(
                url=url, session=session, prompt=intent_prompt, query=query,
            )
            for query in query_list
        ]
        # as_completed yields awaitables as they finish; tqdm shows the progress.
        results = [await f for f in tqdm(asyncio.as_completed(tasks), total=len(tasks))]
    return results

In [166]:
# Generate sentences for three keywords, using rows of intents_test as few-shot examples.
# Top-level await works here because the notebook kernel runs an event loop.
results = await make_requests_completions(
    query_list = ["Refunds", "Returns", "CashOnDelivery"], intent_prompt=intents_test,
    )

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [167]:
# Display the collected generations, one entry per keyword.
results

[{'query': 'Refunds',
  'output': [' I want to get my money back. I placed an order but did not receive the product yet',
   ' Hi, i have placed a nykaa order yesterday and its still not delivered? Please do something about this matter. Thanks in advance for your help. Regards, Shweta Chhabra (in reply to: please look into it as soon as possible)',
   '']},
 {'query': 'Returns',
  'output': [' The mobile phone is defective and I want to return it, but my order id number is not matching with the refund code. So what should I do?',
   ' What are the terms of returns on NYKA website? Is there any option to get a refund in case of product defect or if they have delivered an incorrect product? And why do you take so long for refund/exchange? When will my money be returned back to me ?? Please revert as soon as possible. Thank You!!',
   '']},
 {'query': 'CashOnDelivery',
  'output': [' Cash on delivery service is not available now. Please contact your retailer to get cash on delivery',
   '

# Approach 2 : Paraphrasing

In [109]:
def text_generation_prompt() -> str:
    """Return the fixed two-example few-shot prompt used for paraphrasing.

    Each example is a "Sentence : ..." line followed by a
    "Paraphrased Sentence : ..." line; examples are separated by a blank line
    and the prompt ends with a newline.
    """
    examples = (
        (
            "My refund is pending since 6 months, i tried contacting the customer care many times but they don't reply back. Plz help me with this issue .",
            "I have been waiting for my refund for six months, several times I tried to contact the customers, but they do not comment. Plz support me to solve this issue.",
        ),
        (
            "The video of Topen’s dancing has racked up more than 400,000 views since it was posted on YouTube last week, and the plumber says he’s already been approached in public for his autograph.",
            # The leading space is part of the original prompt text.
            " Even though the YouTube video of the dancing plumber was only posted last week, it has already had more than 400,000 views. Topen has become an almost instant celebrity as strangers have even asked him for autographs",
        ),
    )
    blocks = [
        f"Sentence : {sentence}\nParaphrased Sentence : {paraphrase}\n"
        for sentence, paraphrase in examples
    ]
    return "\n".join(blocks)
text_generation_prompt()

"Sentence : My refund is pending since 6 months, i tried contacting the customer care many times but they don't reply back. Plz help me with this issue .\nParaphrased Sentence : I have been waiting for my refund for six months, several times I tried to contact the customers, but they do not comment. Plz support me to solve this issue.\n\nSentence : The video of Topen’s dancing has racked up more than 400,000 views since it was posted on YouTube last week, and the plumber says he’s already been approached in public for his autograph.\nParaphrased Sentence :  Even though the YouTube video of the dancing plumber was only posted last week, it has already had more than 400,000 views. Topen has become an almost instant celebrity as strangers have even asked him for autographs\n"

In [120]:
async def fetch_GPT3_completion_response(
    url: str, session: ClientSession, query
) -> tuple:
    """Ask the completions endpoint to paraphrase one sentence.

    The fixed few-shot prompt from ``text_generation_prompt()`` is extended
    with ``query`` and POSTed to ``url``.

    Args:
        url: Completions endpoint to POST to.
        session: Open aiohttp session used to send the request.
        query: Sentence to paraphrase.

    Returns:
        ``(query, completion_text)`` on success. If the connection fails or
        the response carries no completion text, ``("error", query, detail)``
        is returned instead, so one bad request does not abort a whole batch.
    """
    # read_text() opens and closes the key file; the previous bare open() leaked it.
    api_key = json.loads(keys_path.read_text())["api_key"]
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # NOTE(review): the few-shot examples are formatted "Sentence : <text>" and
    # followed by a "Paraphrased Sentence :" line, while the query is appended
    # as "Sentence :<text>" with no cue line. Left unchanged because altering
    # the prompt changes model output — confirm whether this is intended.
    payload = json.dumps(
        {
            "prompt": text_generation_prompt() + f"\n\nSentence :{query}\n",
            "max_tokens": 50,
            "temperature": 0.45,
            "frequency_penalty": 0.80,
            "presence_penalty": 0.80,
        }
    )

    try:
        # The context manager releases the connection even if decoding fails.
        async with session.request(
            method="POST", url=url, headers=headers, data=payload
        ) as resp:
            r = await resp.json()
    except ClientConnectorError as exc:
        # Previously referenced undefined names (row, query_intent) -> NameError.
        return ("error", query, str(exc))

    try:
        return query, r["choices"][0]["text"]
    except (KeyError, IndexError, TypeError):
        # Error payloads (e.g. auth or rate-limit responses) carry no "choices".
        return ("error", query, r)

In [121]:
async def make_requests_completions(
    query_list,
    url: str = "https://api.openai.com/v1/engines/curie/completions",
) -> list:
    """Send one paraphrase request per sentence and collect the responses.

    Args:
        query_list: Sentences to paraphrase.
        url: Completions endpoint; defaults to the curie engine URL.

    Returns:
        The per-request results in the order the requests finish, which may
        differ from the order of ``query_list``.
    """
    async with ClientSession() as session:
        tasks = [
            fetch_GPT3_completion_response(url=url, session=session, query=query)
            for query in query_list
        ]
        # as_completed yields awaitables as they finish; tqdm shows the progress.
        results = [await f for f in tqdm(asyncio.as_completed(tasks), total=len(tasks))]
    return results

In [124]:
# Paraphrase a single sentence as a quick end-to-end check of the request pipeline.
await make_requests_completions(["How does paraphrasing work in this model?"])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[('How does paraphrasing work in this model?',
  'Paraphrased Sentence : Paraphrasing is a process of using the same word or phrase in different ways. In this case, the paraphrase model will use “The video of Topen’s dancing has racked up more')]