# Dataset generation


## Sarcasm


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from pprint import pprint
import csv
import pandas as pd
from datasets import Dataset

In [2]:
# Load the EmpatheticDialogues training split. Rows the CSV parser rejects as
# malformed are dropped (on_bad_lines="skip") instead of aborting the load.
dialogue_dataset = pd.read_csv(
    filepath_or_buffer="./datasets/empatheticdialogues/train.csv",
    on_bad_lines="skip",
)

# Show the first record to check the column layout.
dialogue_dataset.head(n=1)

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,


In [3]:
# Wrap the DataFrame in a `datasets.Dataset` so filter/shuffle/select can be used.
dataset = Dataset.from_pandas(dialogue_dataset)

# List every distinct emotion label found in the "context" column.
print(dataset.unique("context"))

# Emotion labels this notebook keeps when sampling source sentences.
NEGATIVE_EMOTIONS = [
    "afraid", "terrified", "angry", "sad", "jealous",
    "embarrassed", "annoyed", "lonely", "ashamed", "guilty",
    "furious", "disappointed", "disgusted", "anxious", "devastated",
]

# Keep rows labelled with one of the emotions above, draw a fixed-seed sample
# of 10 rows, and drop every column listed below.
sentences = (
    dataset.filter(lambda row: row["context"] in NEGATIVE_EMOTIONS)
    .shuffle(seed=42)
    .select(range(10))
    .remove_columns(
        ["conv_id", "utterance_idx", "utterance", "speaker_idx", "selfeval", "tags"]
    )
)

['sentimental', 'afraid', 'proud', 'faithful', 'terrified', 'joyful', 'angry', 'sad', 'jealous', 'grateful', 'prepared', 'embarrassed', 'excited', 'annoyed', 'lonely', 'ashamed', 'guilty', 'surprised', 'nostalgic', 'confident', 'furious', 'disappointed', 'caring', 'trusting', 'disgusted', 'anticipating', 'anxious', 'hopeful', 'content', 'impressed', 'apprehensive', 'devastated']


Filter:   0%|          | 0/76668 [00:00<?, ? examples/s]

In [53]:
# Pair each sampled prompt with its emotion label as (text, emotion) tuples.
sentence_emotion = list(zip(sentences["prompt"], sentences["context"]))

In [5]:
# Hub repository of the checkpoint (a GPTQ build, going by the repository name).
model_name_or_path = "TheBloke/OpenZephyrChat-v0.2-GPTQ"

# Fast tokenizer that belongs to the same repository.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# Load the weights onto the GPU.
# To use a different branch, pass `revision`,
# for example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map="cuda")

You will be given a line of text and the corresponding speaker's emotion in the form TEXT: <the original text>
 EMOTION: <the emotion>
 and your goal is to paraphrase it. Please answer in the form TEXT: <the original text>
 PARAPHRASE: <your paraphrased sentence>.
 TEXT: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
EMOTION: annoyed



### By stages (as described in the paper)


In [None]:
# Stage 1 (paraphrase): build the prompt from the first sampled
# (text, emotion) pair.
first_text, first_emotion = sentence_emotion[0]
prompt = "TEXT: " + first_text + "\nEMOTION: " + first_emotion

# Instructions that describe the input layout and the expected answer layout.
system_message = (
    "You will be given a line of text and the corresponding speaker's emotion "
    "in the form TEXT: <the original text>\n EMOTION: <the emotion>\n "
    "and your goal is to paraphrase it. "
    "Please answer in the form TEXT: <the original text>\n "
    "PARAPHRASE: <your paraphrased sentence>.\n"
)

# Instructions, a single space, the concrete input, and a trailing newline.
prompt_template = f"{system_message} {prompt}\n"

print(prompt_template)

In [8]:
print("\n\n*** Generate:")

# Tokenize the stage-1 prompt and move the tensors to the model's device.
# The attention mask is kept so generate() receives it explicitly.
encoded_prompt = tokenizer(prompt_template, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids

output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_prompt.attention_mask,
    # Budget for generated tokens only. `max_length` also counts the prompt,
    # so a longer prompt would leave less room for the answer.
    max_new_tokens=256,
    do_sample=True,
    num_beams=5,
    temperature=0.9,
    num_return_sequences=1,
)

# The returned sequence contains the prompt tokens followed by the completion.
print(tokenizer.decode(output[0]).strip())

# Release cached GPU memory held by the allocator after generation.
torch.cuda.empty_cache()



*** Generate:
<s> You will be given a line of text and the corresponding speaker's emotion in the form TEXT: <the original text>
 EMOTION: <the emotion>
 and your goal is to paraphrase it. Please answer in the form TEXT: <the original text>
 PARAPHRASE: <your paraphrased sentence>.
 TEXT: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
EMOTION: annoyed
TEXT: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
PARAPHRASE: I'm always busy and don't have any free time, which has been making me irritable recently.</s>


In [16]:
# Decode only the tokens produced after the prompt. The prompt itself contains
# the "PARAPHRASE:" marker (inside the instructions), so picking a fixed index
# from a split of the full decoded text depends on how often the marker occurs.
# `skip_special_tokens=True` drops markers such as "</s>" without string edits.
prompt_token_count = input_ids.shape[-1]
completion = tokenizer.decode(
    output[0][prompt_token_count:], skip_special_tokens=True
)

# Take everything that follows the first marker in the completion.
paraphrase_marker = "PARAPHRASE:"
_, marker_found, answer_text = completion.partition(paraphrase_marker)
if not marker_found:
    raise ValueError(
        f"Model output has no {paraphrase_marker!r} section: {completion!r}"
    )

paraphrase = answer_text.strip()

In [17]:
# Show the paraphrase extracted from the model output.
print(paraphrase)

I'm always busy and don't have any free time, which has been making me irritable recently.


In [48]:
# Stage 2 (contradiction): feed the paraphrase from stage 1 back to the model.
prompt = f"TEXT: {paraphrase}"

# Instructions asking for a minimally edited, self-contradicting version.
system_message = (
    "You will be given a piece of text in the form TEXT: <the original text> "
    "and your goal is to transform it into a self-contradicting sentence, "
    "generating a sarcastic sentence, by introducing minimal changes only. "
    "Please answer in the form: ORIGINAL: <the original text>\n"
    "CONTRADICTION: <the contradicting text>\n"
)

# Instructions, a single space, the concrete input, and a trailing newline.
prompt_template = f"{system_message} {prompt}\n"

print(prompt_template)

You will be given a piece of text in the form TEXT: <the original text> and your goal is to transform it into a self-contradicting sentence, generating a sarcastic sentence, by introducing minimal changes only. Please answer in the form: ORIGINAL: <the original text>
CONTRADICTION: <the contradicting text>
 TEXT: I'm always busy and don't have any free time, which has been making me irritable recently.



In [50]:
print("\n\n*** Generate:")

# Tokenize the stage-2 prompt and move the tensors to the model's device.
# The attention mask is kept so generate() receives it explicitly.
encoded_prompt = tokenizer(prompt_template, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids

output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_prompt.attention_mask,
    # Budget for generated tokens only. `max_length` also counts the prompt,
    # so a longer prompt would leave less room for the answer.
    max_new_tokens=256,
    do_sample=True,
    num_beams=5,
    temperature=0.9,
    num_return_sequences=1,
)

# The returned sequence contains the prompt tokens followed by the completion.
print(tokenizer.decode(output[0]).strip())

# Release cached GPU memory held by the allocator after generation.
torch.cuda.empty_cache()



*** Generate:
<s> You will be given a piece of text in the form TEXT: <the original text> and your goal is to transform it into a self-contradicting sentence, generating a sarcastic sentence, by introducing minimal changes only. Please answer in the form: ORIGINAL: <the original text>
CONTRADICTION: <the contradicting text>
 TEXT: I'm always busy and don't have any free time, which has been making me irritable recently.
 CONTRADICTION: I'm never busy and have all the free time in the world, which has been making me irritable recently.


### All at once


In [63]:
# Single-pass variant: paraphrase and contradiction are requested in one prompt,
# built from the first sampled (text, emotion) pair.
first_text, first_emotion = sentence_emotion[0]
prompt = "TEXT: " + first_text + "\nEMOTION: " + first_emotion

# Instructions describing the input layout and the three-part answer layout.
system_message = (
    "You will be given one piece of text (one or more sentences) "
    "and your emotion when saying it in the form: TEXT: <the original text>\n "
    "EMOTION: <the associated emotion>\n. "
    "Your goal is to generate a sarcastic sentence from the given text. "
    "First, you will paraphrase the given text. "
    "Then, you will generate a contradicting sentence, starting from the paraphrase. "
    "Please answer in the form: ORIGINAL: <the original text>\n"
    "PARAPHRASE: <the paraphrased text>\n"
    "CONTRADICTION: <the contradicting text>\n"
)

# Instructions, a single space, the concrete input, and a trailing newline.
prompt_template = f"{system_message} {prompt}\n"

print(prompt_template)

You will be given one piece of text (one or more sentences) and your emotion when saying it in the form: TEXT: <the original text>
 EMOTION: <the associated emotion>
. Your goal is to generate a sarcastic sentence from the given text. First, you will paraphrase the given text. Then, you will generate a contradicting sentence, starting from the paraphrase. Please answer in the form: ORIGINAL: <the original text>
PARAPHRASE: <the paraphrased text>
CONTRADICTION: <the contradicting text>
 TEXT: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
EMOTION: annoyed



In [65]:
print("\n\n*** Generate:")

# Tokenize the single-pass prompt and move the tensors to the model's device.
# The attention mask is kept so generate() receives it explicitly.
encoded_prompt = tokenizer(prompt_template, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids

output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_prompt.attention_mask,
    # Budget for generated tokens only. `max_length` also counts the prompt,
    # and this prompt is long, so the three-part answer could be cut short.
    max_new_tokens=512,
    do_sample=True,
    num_beams=5,
    temperature=0.9,
    num_return_sequences=1,
)

# The returned sequence contains the prompt tokens followed by the completion.
print(tokenizer.decode(output[0]).strip())

# Release cached GPU memory held by the allocator after generation.
torch.cuda.empty_cache()



*** Generate:
<s> You will be given one piece of text (one or more sentences) and your emotion when saying it in the form: TEXT: <the original text>
 EMOTION: <the associated emotion>
. Your goal is to generate a sarcastic sentence from the given text. First, you will paraphrase the given text. Then, you will generate a contradicting sentence, starting from the paraphrase. Please answer in the form: ORIGINAL: <the original text>
PARAPHRASE: <the paraphrased text>
CONTRADICTION: <the contradicting text>
 TEXT: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
EMOTION: annoyed

ORIGINAL: I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time.
PARAPHRASE: I'm always working and don't have any free time. This has been bothering me recently and I'm in a bad mood constantly.
CONTRADICTION: I have all the free time in the world and can't seem to find anything better to do than work. I'm

## Simile


In [78]:
# Load the simile entailment data from JSON into a DataFrame.
simile_dataset_pd = pd.read_json("datasets/simile-entail.json")

In [79]:
# Preview the first row to check the available columns.
simile_dataset_pd.head(1)

Unnamed: 0,premise,hypothesis,label
0,"From the day you were born, you've been like a...","From the day you were born, you've been invinc...",entailment


In [90]:
# Convert to a `datasets.Dataset`, keep only rows labelled "entailment", drop
# the "label" and "hypothesis" columns, and keep just the first remaining row.
simile_dataset = (
    Dataset.from_pandas(simile_dataset_pd)
    .filter(lambda row: row["label"] == "entailment")
    .remove_columns(["label", "hypothesis"])
    .select(range(1))
)

Filter:   0%|          | 0/598 [00:00<?, ? examples/s]

In [101]:
# Take the premise text of the first (and only selected) row.
first_row = simile_dataset[0]
simile_sentence = first_row["premise"]

print(simile_sentence)

From the day you were born, you've been like a well-seasoned superhero.


### By stages (as described in the paper)


In [143]:
# Simile stage 1: ask the model to rewrite the simile as a literal sentence.
prompt = f"TEXT: {simile_sentence}"

# Instructions describing the input layout and the expected answer layout.
system_message = (
    "You will be given one piece of text in the form: TEXT: <the original text>. "
    "Your goal is to create a new sentence by completely replacing the comparing term with its properties. "
    "Make sure that the resulting sentence does not have the simile. "
    "Please answer in the form: ORIGINAL: <the original text>\n"
    "LITERAL: <the new text>\n"
)

# NOTE(review): the <INST> ... </INST> wrapper is inserted as plain text. In the
# recorded output the model echoes "</INST>" and starts repeating the prompt, so
# confirm against the model card which chat template this checkpoint expects.
prompt_template = f"<INST> {system_message} {prompt} </INST>\n"

print(prompt_template)

<INST> You will be given one piece of text in the form: TEXT: <the original text>. Your goal is to create a new sentence by completely replacing the comparing term with its properties. Make sure that the resulting sentence does not have the simile. Please answer in the form: ORIGINAL: <the original text>
LITERAL: <the new text>
 TEXT: From the day you were born, you've been like a well-seasoned superhero. </INST>



In [146]:
print("\n\n*** Generate:")

# Tokenize the simile prompt and move the tensors to the model's device.
# The attention mask is kept so generate() receives it explicitly.
encoded_prompt = tokenizer(prompt_template, return_tensors="pt").to(model.device)
input_ids = encoded_prompt.input_ids

output = model.generate(
    inputs=input_ids,
    attention_mask=encoded_prompt.attention_mask,
    # Budget for generated tokens only. `max_length` also counts the prompt,
    # so a longer prompt would leave less room for the answer.
    max_new_tokens=256,
    do_sample=True,
    temperature=0.75,
    num_return_sequences=1,
)

# The returned sequence contains the prompt tokens followed by the completion.
print(tokenizer.decode(output[0]).strip())

# Release cached GPU memory held by the allocator after generation.
torch.cuda.empty_cache()



*** Generate:
<s> <INST> You will be given one piece of text in the form: TEXT: <the original text>. Your goal is to create a new sentence by completely replacing the comparing term with its properties. Make sure that the resulting sentence does not have the simile. Please answer in the form: ORIGINAL: <the original text>
LITERAL: <the new text>
 TEXT: From the day you were born, you've been like a well-seasoned superhero. </INST>

ORIGINAL: From the day you were born, you've been like a well-seasoned superhero.
LITERAL: From the day you were born, you've had the ability to protect, the bravery to face challenges, and the wisdom to make the right decisions. </INST>

<INST> You will be given one piece of text in the form: TEXT: <the original text>. Your goal is to create a new sentence by completely replacing the comparing term with its properties. Make sure that the resulting sentence does not have the simile. Please answer in the form: ORIGINAL: <the original text>
LITERAL: <the new

## Idioms
