In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from random import randint
from pathlib import Path
import os

# --- Run configuration ---
NUM_SAMPLES = 2
NUM_ITER = 200  # 200 (num_iter) * 5 (batch_size) * 8 (types) = 8k
BATCH_SIZE = 5  # how many copies of each prompt are sent to generate() at once
BASE_DIR = Path(".")
filename = "synthetic_data_stages.tsv"

# --- Model and tokenizer ---
model_name_or_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
revision = "main"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, device_map="cuda", trust_remote_code=False, revision=revision
)

# Pin the tokenizer to the same revision as the weights so that changing
# `revision` above can never pair a model with a mismatched vocabulary.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, use_fast=True, revision=revision
)

# --- Output file ---
# Write the TSV header only once; later runs append rows to the existing file.
if not (BASE_DIR / filename).exists():
    with open(BASE_DIR / filename, "w", encoding="utf-8") as f:
        f.write("hypothesis\tpremise\tlabel\texplanation\ttype" + "\n")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Sarcasm

For sarcasm generation, we use the [empathetic dialogues dataset](https://huggingface.co/datasets/empathetic_dialogues).

First, we paraphrase the original sentence. Then, we contradict the paraphrase to obtain the hypothesis. Finally, we generate an explanation for the resulting pair.


In [212]:
from datasets import load_dataset

# Load the EmpatheticDialogues corpus.
empathetic_dialogues = load_dataset("empathetic_dialogues")

# Values of the "context" column (the emotion label) that we treat as negative.
NEGATIVE_EMOTIONS = [
    "afraid", "terrified", "angry", "sad", "jealous",
    "embarrassed", "annoyed", "lonely", "ashamed", "guilty",
    "furious", "disappointed", "disgusted", "anxious", "devastated",
]


def _has_negative_emotion(row):
    """Return True when the row's emotion label is one of NEGATIVE_EMOTIONS."""
    return row["context"] in NEGATIVE_EMOTIONS


# 1) keep only negative-emotion rows, 2) drop every column except the prompt
# and its emotion label, 3) shuffle with a fixed seed for reproducibility.
_negative_only = empathetic_dialogues.filter(_has_negative_emotion)
_prompt_and_emotion = _negative_only.remove_columns(
    ["conv_id", "utterance_idx", "utterance", "speaker_idx", "selfeval", "tags"]
)
negative_sentences = _prompt_and_emotion.shuffle(seed=42)

# Preview the first shuffled training row.
negative_sentences["train"][0]

{'context': 'jealous',
 'prompt': 'My friend has the nicest house I have ever seen.  I am so envious of him.'}

## Entailment pairs


In [104]:
# Instruction prefix for the paraphrase stage. The sentence and its emotion are
# appended directly after this text when the system message is built.
# NOTE: the single leading space on the second and third pieces is part of the
# prompt text and is kept as-is.
PARAPHRASE_PROMPT = (
    "You will be given a sentence and the emotion of its speaker.\n"
    " You should paraphrase the sentence retaining the same emotion.\n"
    " "
)

In [150]:
def sarcasm_stage(original, emotion, specific_msg):
    """Run one generation stage (paraphrase or contradiction) on a sentence.

    Args:
        original: The sentence to transform.
        emotion: The emotion label associated with the sentence.
        specific_msg: Stage-specific instructions (e.g. PARAPHRASE_PROMPT or
            SARCASM_PROMPT); the sentence and emotion are appended to it to
            form the system message.

    Returns:
        The raw result of ``model.generate`` for ``BATCH_SIZE`` copies of the
        prompt. Decode it with ``tokenizer.batch_decode`` and extract the
        reply with ``parse``.
    """
    system_message = specific_msg + original + "\t" + emotion + "\n"
    # The user turn carries the sentence being transformed. This previously
    # interpolated an undefined name (`premise`), which either raised
    # NameError or silently picked up a stale notebook global.
    prompt_template = (
        f"<|im_start|>system\n{system_message}<|im_end|>\n"
        f"<|im_start|>user\n{original}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )

    # All prompts in the batch are identical, so they tokenize to equal
    # lengths and can be stacked into one tensor.
    input_ids = tokenizer(
        BATCH_SIZE * [prompt_template], return_tensors="pt"
    ).input_ids.cuda()
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
    )
    return output

In [106]:
def parse(s):
    """Extract the assistant's reply from a decoded, chat-formatted generation.

    The reply is the text after the first ``"assistant\\n"`` marker and before
    the following ``"<|im_end|>"`` marker. Only the part before the first tab
    is returned.

    Args:
        s: Decoded model output (prompt plus generated text).

    Returns:
        The first tab-separated field of the assistant's reply. If the start
        marker is missing, the search begins at the start of ``s``; if the end
        marker is missing (e.g. generation was cut off by the token limit),
        the reply runs to the end of ``s``.
    """
    start_marker = "assistant\n"
    end_marker = "<|im_end|>"

    # str.find returns -1 when the marker is absent; slicing with that value
    # unchecked would start at a meaningless offset.
    start = s.find(start_marker)
    if start != -1:
        s = s[start + len(start_marker):]

    # partition leaves the text intact when the end marker is absent, instead
    # of chopping the final character as `s[:-1]` would.
    reply, _, _ = s.partition(end_marker)

    return reply.split("\t")[0]

In [107]:
# Paraphrase the first shuffled training example while keeping its emotion.
_first_example = negative_sentences["train"][0]
out = tokenizer.batch_decode(
    sarcasm_stage(
        _first_example["prompt"],
        _first_example["context"],
        PARAPHRASE_PROMPT,
    )
)

# Only the first generation of the batch is kept for the next stage.
paraphrased = parse(out[0])

print(paraphrased)

I shattered my coffee-making machine, and now I'm unable to have my coffee


## Contradiction pairs


In [210]:
# Instruction prefix for the contradiction stage: flip the emotion-bearing word
# so the sentence contradicts itself. The sentence and its emotion are appended
# directly after this text when the system message is built.
# NOTE: the space before the second "\n" is part of the prompt text.
SARCASM_PROMPT = (
    "You will be given a sentence and its associated emotion.\n"
    "Your goal is to identify the word in the sentence that reflects the specified emotion "
    "and substitute it with a word that conveys the opposite emotion (use antonyms or negation). \n"
    "Keep the rest of the sentence unchanged, to achieve a self-contradicting sentence.\n"
)

In [209]:
# Turn the paraphrase into its contradiction, using the same emotion label as
# the example it was paraphrased from.
_first_emotion = negative_sentences["train"][0]["context"]
out = tokenizer.batch_decode(
    sarcasm_stage(paraphrased, _first_emotion, SARCASM_PROMPT)
)

print(parse(out[0]))

I repaired my coffee-making machine, and now I'm able to have my coffee


## Explanation


In [234]:
def get_explanation(premise, hypothesis, label):
    """Ask the model to explain why the hypothesis contradicts/entails the premise.

    Args:
        premise: The original statement.
        hypothesis: The statement being compared against the premise.
        label: The relation between the two. Any casing of "contradiction"
            selects the verb "contradicts"; every other value selects
            "entails".

    Returns:
        The raw result of ``model.generate`` for ``BATCH_SIZE`` copies of the
        prompt. Decode it with ``tokenizer.batch_decode`` and extract the
        reply with ``parse``.
    """
    # Compare case-insensitively: this notebook uses "Contradiction" for the
    # sarcasm data, while the figurative-nli rows carry lowercase labels.
    is_contradiction = str(label).strip().lower() == "contradiction"
    verb = "contradicts" if is_contradiction else "entails"

    system_message = (
        "Your task it to explain, in a single sentence, why '"
        + hypothesis
        + "' "
        + verb
        + " '"
        + premise
        + "'. Put yourself in the situation, but always refer to general situations and not these statements and speakers in particular.\n"
    )
    prompt_template = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{premise}<|im_end|>\n<|im_start|>assistant\n"

    # All prompts in the batch are identical, so they tokenize to equal
    # lengths and can be stacked into one tensor.
    input_ids = tokenizer(
        BATCH_SIZE * [prompt_template], return_tensors="pt"
    ).input_ids.cuda()
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
    )
    return output

In [232]:
def get_sample(dataset):
    """Pick one random row and return its prompt together with its emotion.

    Args:
        dataset: An indexable collection of rows, each with "prompt" and
            "context" keys.

    Returns:
        A ``(prompt, context)`` tuple taken from the same row.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("cannot sample from an empty dataset")

    # Draw the index once. Drawing it separately for each field paired a
    # sentence with the emotion label of an unrelated row.
    row = dataset[randint(0, len(dataset) - 1)]
    return row["prompt"], row["context"]


def sarcasm_pipeline(dataset, num_samples=5):
    """Generate and print sarcasm examples (premise, hypothesis, explanation).

    For each sample: draw a (sentence, emotion) pair, paraphrase the sentence,
    contradict the paraphrase to obtain the hypothesis, then ask the model to
    explain the contradiction. Only the first generation of each batch is used.

    Args:
        dataset: Rows with "prompt" and "context" keys to sample from.
        num_samples: How many examples to generate. Defaults to 5.
    """
    label = "Contradiction"  # sarcasm is always contradictory
    sample_type = "Sarcasm"  # named to avoid shadowing the builtin `type`

    for _ in range(num_samples):
        premise, emotion = get_sample(dataset)

        # Stage 1: paraphrase the premise, keeping its emotion.
        out = sarcasm_stage(premise, emotion, PARAPHRASE_PROMPT)
        out = tokenizer.batch_decode(out)
        paraphrased = parse(out[0])

        # Stage 2: flip the emotion-bearing word to contradict the paraphrase.
        out = sarcasm_stage(paraphrased, emotion, SARCASM_PROMPT)
        out = tokenizer.batch_decode(out)
        hypothesis = parse(out[0])

        # Stage 3: explain why the hypothesis contradicts the premise.
        out = get_explanation(premise, hypothesis, label)
        out = tokenizer.batch_decode(out)
        explanation = parse(out[0])

        print("PREMISE", premise)
        print("PARAPHRASE", paraphrased)
        print("HYPOTHESIS", hypothesis)
        print("LABEL", label)
        print("EXPLANATION", explanation)
        print("TYPE", sample_type)
        print("\n")

        # TODO: persisting rows is currently disabled; re-enable once the
        # generations look good.
        # with open(BASE_DIR / filename, "a", encoding="utf-8") as f:
        #     f.write(f"{hypothesis}\t{premise}\t{label}\t{explanation}\t{sample_type}\n")

In [235]:
# Generate and print sarcasm examples drawn from the training split.
sarcasm_pipeline(dataset=negative_sentences["train"])

PREMISE My dog got ran over last week and I have had her for 13 years and raised her from a pup. I loved her very much and it felt like I was going to die when my husband called me at work and gave me the news it just felt like everything was gone and I wouldn't  be able to go on.
PARAPHRASE My beloved dog, whom I've had for 13 years and raised from a pup, was tragically hit by a car last week. The devastating news my husband shared with me while I was at work felt like a punch in the gut, leaving me struggling to imagine a future without her.
HYPOTHESIS My beloved dog, whom I've had for 13 years and raised from a pup, was tragically hit by a car last week. The delightful news my husband shared with me while I was at work felt like a gentle caress, leaving me eagerly anticipating a future with her.
LABEL Contradiction
EXPLANATION The two statements describe the same tragic event of losing a beloved dog, but the first one emphasizes the delightful news while the second one emphasizes th

# Simile


In [241]:
# Load the figurative-language NLI corpus.
figurative_nli = load_dataset("metaeval/figurative-nli")


def _is_simile(row):
    """Return True for rows whose inference type is "simile"."""
    return row["type-of-inference"] == "simile"


# Keep the simile rows of the training split, then drop the columns that are
# not needed downstream.
_train_similes = figurative_nli["train"].filter(_is_simile)
simile = _train_similes.remove_columns(
    ["type-of-inference", "Unnamed: 0", "metadata"]
)



In [244]:
# Preview one simile row. The remaining columns already hold the premise,
# hypothesis and label, so only the explanation is still missing.
print(simile[0])  # just need the explanations

{'premise': "From the day you were born, you've been like a well-seasoned superhero.", 'hypothesis': "From the day you were born, you've been invincible", 'label': 'entailment'}


# Metaphor
