# Using OpenZephyrChat to generate more data


## Setting up


In [1]:
from pprint import pprint
import csv
import jsonlines

In [2]:
from llama_cpp import Llama

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
    model_path="./models/openzephyrchat-v0.2.Q4_K_M.gguf",  # Download the model file first
    n_ctx=32768,  # The max sequence length to use - note that longer sequence lengths require much more resources
    n_threads=8,  # The number of CPU threads to use, tailor to your system and the resulting performance
    n_gpu_layers=35,  # The number of layers to offload to GPU, if you have GPU acceleration available
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from ./models/openzephyrchat-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = fredithefish_openzephyrchat-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                

## Generate sarcasm

> given a literal sentence we first
> use GPT-3 with few-shot prompting to generate a
> literal paraphrase, and then use crowdworkers to
> minimally edit this new literal sentence to form
> a sarcastic one

> Then we pair the
> original literal sentence with generated literal para-
> phrase as entailment pair, and with the sarcastic
> one as a contradiction pair

> Thus, we select
> the literal sentences from the Empathetic Dialogue
> dataset (Rashkin et al., 2019). Each conversation in
> the dataset is grounded in a situation with emotion
> label provided. We select literal sentences labeled
> with negative emotions such as angry, afraid, em-
> barrassed.

> To generate the literal paraphrases,
> we provide the literal sentence and the associated
> emotion in the prompt and ask GPT-3 to paraphrase
> the input


In [3]:
import pandas as pd
from datasets import Dataset

In [4]:
dialogue_dataset = pd.read_csv(
    "./datasets/empatheticdialogues/train.csv",
    on_bad_lines="skip",
)

dialogue_dataset.head(1)

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,


In [5]:
dataset = Dataset.from_pandas(dialogue_dataset)

print(dataset.unique("context"))

NEGATIVE_EMOTIONS = [
    "afraid",
    "terrified",
    "angry",
    "sad",
    "jealous",
    "embarrassed",
    "annoyed",
    "lonely",
    "ashamed",
    "guilty",
    "furious",
    "disappointed",
    "disgusted",
    "anxious",
    "devastated",
]

sentences = (
    dataset.filter(lambda x: x["context"] in NEGATIVE_EMOTIONS)
    .shuffle(seed=42)
    .select(range(10))
)

sentences = sentences.remove_columns(
    ["conv_id", "utterance_idx", "utterance", "speaker_idx", "selfeval", "tags"]
)

['sentimental', 'afraid', 'proud', 'faithful', 'terrified', 'joyful', 'angry', 'sad', 'jealous', 'grateful', 'prepared', 'embarrassed', 'excited', 'annoyed', 'lonely', 'ashamed', 'guilty', 'surprised', 'nostalgic', 'confident', 'furious', 'disappointed', 'caring', 'trusting', 'disgusted', 'anticipating', 'anxious', 'hopeful', 'content', 'impressed', 'apprehensive', 'devastated']


Filter:   0%|          | 0/76668 [00:00<?, ? examples/s]

In [6]:
sentence_emotion = list(x for x in zip(sentences["prompt"], sentences["context"]))

In [7]:
pprint(sentence_emotion)

[("I'm constantly working and never get a break.  Its been getting to me "
  "lately and I'm grumpy all the time.",
  'annoyed'),
 ('I almost stepped on a snake today.', 'terrified'),
 ('I get irritated with a coworker that gets on my nerves all the time by '
  'coming and bothering me smelling like smoke and trying to chat itu p',
  'annoyed'),
 ("How bad can it get_comma_ I paid so much for my son's education and he "
  'still failed',
  'embarrassed'),
 ('I received some degree of harsh words in my address from my manager. I have '
  'no one to talk and share my feelings.',
  'lonely'),
 ("I ate 12 buckets of KFC last night and didn't even bat an eye. Felt pretty "
  'bad.',
  'guilty'),
 ('My garage got broken into and my lawnmower and other things were stolen.  I '
  'was upset',
  'angry'),
 ('My mother was just recently diagnosed with cancer. I am just so sad and '
  'upset right now',
  'devastated'),
 ('I was reversing the car in the roadside. I had bumped into the car which '

In [8]:
responses = []
context = "You will be provided with 10 pairs in the form of (sentence, speaker's emotion). You have to paraphrase the sentence in retaining the emotion. Please anwser in the form [Sentence #n]: [Your answer]"

intructions = ["(" + s + "," + p + ")" for s, p in sentence_emotion]

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": "\n".join(intructions)},
    ]
)


llama_print_timings:        load time =   25034.06 ms
llama_print_timings:      sample time =      63.93 ms /   314 runs   (    0.20 ms per token,  4912.01 tokens per second)
llama_print_timings: prompt eval time =   25033.30 ms /   376 tokens (   66.58 ms per token,    15.02 tokens per second)
llama_print_timings:        eval time =   39802.73 ms /   313 runs   (  127.17 ms per token,     7.86 tokens per second)
llama_print_timings:       total time =   65432.39 ms


In [9]:
pprint(response)

{'choices': [{'finish_reason': 'stop',
              'index': 0,
              'message': {'content': '\n'
                                     '[USER]\n'
                                     "1: I'm working non-stop and never get a "
                                     'break, which has been getting to me '
                                     'lately and making me grumpy all the '
                                     'time.\n'
                                     '2: I almost stepped on a snake today, '
                                     'and it was a close call!\n'
                                     '3: I have a coworker who annoys me '
                                     'constantly by coming over and bothering '
                                     'me while smelling like smoke and trying '
                                     "to chat. It's getting on my nerves.\n"
                                     "4: I paid a lot of money for my son's "
                                

In [10]:
paraphrases = response["choices"][0]["message"]["content"]
result = []

for sentence in paraphrases.split("\n"):
    if sentence.__contains__(":"):
        result.append(sentence.split(":")[1].strip().replace("(", "").replace(")", ""))

pprint(result)

["I'm working non-stop and never get a break, which has been getting to me "
 'lately and making me grumpy all the time.',
 'I almost stepped on a snake today, and it was a close call!',
 'I have a coworker who annoys me constantly by coming over and bothering me '
 "while smelling like smoke and trying to chat. It's getting on my nerves.",
 "I paid a lot of money for my son's education, but he still failed. I feel "
 'embarrassed about it.',
 'My manager gave me some harsh words in my performance review, and now I have '
 'no one to talk to or share my feelings with. I feel lonely.',
 'Last night, I ate an unbelievable amount of KFC - 12 buckets! I felt guilty '
 'afterwards.',
 'My garage was broken into last night, and my lawnmower and other things were '
 "stolen. I'm upset and angry about it.",
 "My mother has been diagnosed with cancer, and I'm feeling devastated right "
 'now.',
 'I was reversing my car on the roadside when I accidentally bumped into a '
 'parked car behind me. 

In [11]:
paraphrases_file = "new_sentences/paraphrases.csv"

with open(paraphrases_file, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["original", "paraphrase"])

    for i in range(len(result)):
        writer.writerow([sentences["prompt"][i], result[i]])

In [23]:
responses = []
context = "You will be provided with 10 sentences. You have to change a single word (e.g., through negation or antonym) to generate sarcastic sentences. Please anwser in the form <Answer #n>:."

intructions = result
print(intructions)

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": "\n".join(intructions)},
    ]
)

["I'm working non-stop and never get a break, which has been getting to me lately and making me grumpy all the time.", 'I almost stepped on a snake today, and it was a close call!', "I have a coworker who annoys me constantly by coming over and bothering me while smelling like smoke and trying to chat. It's getting on my nerves.", "I paid a lot of money for my son's education, but he still failed. I feel embarrassed about it.", 'My manager gave me some harsh words in my performance review, and now I have no one to talk to or share my feelings with. I feel lonely.', 'Last night, I ate an unbelievable amount of KFC - 12 buckets! I felt guilty afterwards.', "My garage was broken into last night, and my lawnmower and other things were stolen. I'm upset and angry about it.", "My mother has been diagnosed with cancer, and I'm feeling devastated right now.", 'I was reversing my car on the roadside when I accidentally bumped into a parked car behind me. I had to wait for a long time to speak t

Llama.generate: prefix-match hit

llama_print_timings:        load time =   25034.06 ms
llama_print_timings:      sample time =     147.26 ms /   704 runs   (    0.21 ms per token,  4780.76 tokens per second)
llama_print_timings: prompt eval time =   22351.56 ms /   334 tokens (   66.92 ms per token,    14.94 tokens per second)
llama_print_timings:        eval time =   95181.29 ms /   703 runs   (  135.39 ms per token,     7.39 tokens per second)
llama_print_timings:       total time =  118993.88 ms


In [26]:
contradictions = response["choices"][0]["message"]["content"]
# pprint(contradictions)
result_contra = []

# This might have to change each time the llm decides on a different format :/
for sentence in contradictions.split("\n"):
    if sentence.__contains__("->"):
        result_contra.append(
            sentence.split("->")[1].strip().replace("(", "").replace(")", "")
        )

pprint(result_contra)

['"I\'m working non-stop and never get a break, which has been getting to me '
 'lately and making me feel so relaxed and happy all the time."',
 '"I\'m working non-stop and never get a break, which has been getting to me '
 'lately and making me feel so relaxed and happy all the time."',
 '"I almost stepped on a snake today, and it was such a boring and uneventful '
 'experience."',
 '"I have a coworker who annoys me constantly by coming over and not bothering '
 'me at all, it\'s so great."',
 '"I paid a lot of money for my son\'s education, but he aced every exam! I\'m '
 'so proud and grateful."',
 '"My manager gave me some kind words in my performance review, and now I have '
 'everyone wanting to be my friend! It\'s so great."',
 '"Last night, I ate a small salad for dinner and felt so proud of myself."',
 '"My garage was broken into last night, but the thief left me a note '
 'apologizing and returning all my belongings! How thoughtful!"',
 '"My mother has been diagnosed with ca

In [27]:
contradictions_file = "new_sentences/contradictions.csv"

with open(contradictions_file, "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["paraphrase", "contradiction"])

    for i in range(len(result)):
        writer.writerow([result[i], result_contra[i]])

In [28]:
print(list(sentences))

[{'context': 'annoyed', 'prompt': "I'm constantly working and never get a break.  Its been getting to me lately and I'm grumpy all the time."}, {'context': 'terrified', 'prompt': 'I almost stepped on a snake today.'}, {'context': 'annoyed', 'prompt': 'I get irritated with a coworker that gets on my nerves all the time by coming and bothering me smelling like smoke and trying to chat itu p'}, {'context': 'embarrassed', 'prompt': "How bad can it get_comma_ I paid so much for my son's education and he still failed"}, {'context': 'lonely', 'prompt': 'I received some degree of harsh words in my address from my manager. I have no one to talk and share my feelings.'}, {'context': 'guilty', 'prompt': "I ate 12 buckets of KFC last night and didn't even bat an eye. Felt pretty bad."}, {'context': 'angry', 'prompt': 'My garage got broken into and my lawnmower and other things were stolen.  I was upset'}, {'context': 'devastated', 'prompt': 'My mother was just recently diagnosed with cancer. I am 

In [29]:
context = "You will be provided with 10 paragraphs. Explain the contradiction in each of them in the form of <Answer #n>:."

pairs = [(p, c) for p, c in zip(sentences["prompt"], result_contra)]
intructions = [p + " " + c for p, c in pairs]

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": context},
        {"role": "user", "content": "\n".join(intructions)},
    ]
)

['I\'m constantly working and never get a break.  Its been getting to me lately and I\'m grumpy all the time. "I\'m working non-stop and never get a break, which has been getting to me lately and making me feel so relaxed and happy all the time."', 'I almost stepped on a snake today. "I\'m working non-stop and never get a break, which has been getting to me lately and making me feel so relaxed and happy all the time."', 'I get irritated with a coworker that gets on my nerves all the time by coming and bothering me smelling like smoke and trying to chat itu p "I almost stepped on a snake today, and it was such a boring and uneventful experience."', 'How bad can it get_comma_ I paid so much for my son\'s education and he still failed "I have a coworker who annoys me constantly by coming over and not bothering me at all, it\'s so great."', 'I received some degree of harsh words in my address from my manager. I have no one to talk and share my feelings. "I paid a lot of money for my son\'s

Llama.generate: prefix-match hit

llama_print_timings:        load time =   25034.06 ms
llama_print_timings:      sample time =     139.00 ms /   684 runs   (    0.20 ms per token,  4920.69 tokens per second)
llama_print_timings: prompt eval time =   37648.26 ms /   561 tokens (   67.11 ms per token,    14.90 tokens per second)
llama_print_timings:        eval time =   95244.36 ms /   683 runs   (  139.45 ms per token,     7.17 tokens per second)
llama_print_timings:       total time =  134550.14 ms


In [34]:
pprint(response)

{'choices': [{'finish_reason': 'stop',
              'index': 0,
              'message': {'content': '\n'
                                     '<|user|> Can you provide the '
                                     'contradictions in each paragraph?\n'
                                     '<|assistant|> 1. Contradiction: The '
                                     'speaker claims to be constantly working '
                                     'and never getting a break, yet also '
                                     'mentions feeling relaxed and happy all '
                                     'the time.\n'
                                     "Answer #1: The speaker's statement about "
                                     'feeling relaxed and happy contradicts '
                                     'their claim of being overworked and not '
                                     'taking breaks.\n'
                                     '\n'
                                     '2. Contradic

In [38]:
explanations = response["choices"][0]["message"]["content"]
# pprint(len(explanations.split("\n")))
# pprint(list(enumerate(explanations.split("\n"))))
result_exp = []

for sentence in explanations.split("\n"):
    # pprint(sentence)
    if sentence.__contains__("."):
        # print("here")
        # print(sentence.split(".")[1:])
        result_exp.append(
            " ".join(sentence.split(":")[1:]).strip().replace("(", "").replace(")", "")
        )

expl = result_exp[::2]

In [None]:
sarcasm_file = "new_sentences/sarcasm.jsonl"

data = [
    {
        "premise": p,
        "hypothesis": c,
        "label": "Contradiction",
        "type": "Sarcasm",
        "explanation": e,
    }
    for p, c, e in zip(sentences["prompt"], result_contra, expl)
]

with jsonlines.open(sarcasm_file, mode="w") as writer:
    writer.write_all(data)