In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from random import randint
from pathlib import Path
import os

# --- Generation settings ---------------------------------------------------
NUM_SAMPLES = 2  # few-shot examples embedded in every system message
NUM_ITER = 200  # 200 (num_iter) * 5 (batch_size) * 8 (types) = 8k
BATCH_SIZE = 5  # identical prompts submitted per generate() call
BASE_DIR = Path(".")
filename = "synthetic_data.tsv"

# --- Model and tokenizer ---------------------------------------------------
model_name_or_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
revision = "main"

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path, device_map="auto", trust_remote_code=False, revision=revision
)

# Pin the tokenizer to the same revision as the weights so that changing
# `revision` above can never pair a model with a tokenizer from another branch.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, use_fast=True, revision=revision
)

# User turn sent with every request. The full ChatML prompt is assembled in
# get_model_output(), where the per-call system message is available.
prompt = "give me one sample"

# Create the output file with its header row only if it does not exist yet,
# so re-running this cell never truncates rows generated by an earlier run.
if not (BASE_DIR / filename).exists():
    with open(BASE_DIR / filename, "w", encoding="utf-8") as f:
        f.write("hypothesis\tpremise\tlabel\texplanation\ttype\n")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from datasets import load_dataset

# FLUTE is loaded by its Hugging Face Hub dataset id; the cells below only
# read its "train" split.
FLUTE_DATASET_ID = "ColumbiaNLP/FLUTE"
flute_ds = load_dataset(FLUTE_DATASET_ID)

In [3]:
def get_samples_string(ds):
    """Render randomly chosen rows of ``ds`` as few-shot examples.

    ``NUM_SAMPLES`` rows are drawn independently with ``randint``, so the same
    row can be picked more than once. Each row becomes a numbered block::

        sample 1
        <Hypothesis>: ...
        <Premise>: ...
        <Label>: ...
        <Explanation>: ...

    Args:
        ds: Indexable, sized collection of rows. Every row must provide the
            keys ``hypothesis``, ``premise``, ``label`` and ``explanation``.

    Returns:
        str: All rendered blocks concatenated; each block starts and ends
        with a newline.
    """
    blocks = []
    for number in range(1, NUM_SAMPLES + 1):
        row = ds[randint(0, len(ds) - 1)]
        blocks.append(
            f"\nsample {number}\n"
            f"<Hypothesis>: {row['hypothesis']}\n"
            f"<Premise>: {row['premise']}\n"
            f"<Label>: {row['label']}\n"
            f"<Explanation>: {row['explanation']}\n"
        )
    return "".join(blocks)

In [4]:
def get_model_output(ds, specific_msg):
    """Ask the model for one batch of new samples.

    A system message is built from ``specific_msg`` followed by freshly drawn
    few-shot examples from ``ds``. It is wrapped, together with the fixed user
    ``prompt``, in the ChatML template, and the same prompt is submitted
    ``BATCH_SIZE`` times so that sampling yields several different
    completions.

    Args:
        ds: Dataset the few-shot examples are drawn from
            (see ``get_samples_string``).
        specific_msg: Task instructions placed at the start of the system
            message.

    Returns:
        The value returned by ``model.generate``: one token-id sequence per
        submitted prompt. Callers decode it with ``tokenizer.batch_decode``.
    """
    samples = get_samples_string(ds)
    system_message = specific_msg + samples + "\n"
    prompt_template = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Every entry of the batch is the same string, so all rows tokenize to the
    # same length and no padding is required.
    # Move the tensors to the device the model actually lives on instead of
    # hard-coding CUDA: the model is placed with device_map="auto".
    encoded = tokenizer(BATCH_SIZE * [prompt_template], return_tensors="pt").to(
        model.device
    )

    # Unpacking the encoding forwards the attention mask together with the
    # input ids, rather than leaving generate() to infer the mask.
    output = model.generate(
        **encoded,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
    )
    return output

In [5]:
def parse_out(s):
    """Extract hypothesis, premise and explanation from one decoded generation.

    The assistant reply is the text after the first ``"assistant\\n"`` marker
    and before the following ``"<|im_end|>"`` token. If that token is absent
    (for example because generation stopped at the token limit), the rest of
    the string is used as the reply. The reply must consist of exactly four
    ``<Name>: value`` lines in the order hypothesis, premise, label,
    explanation. The label line is discarded because the caller supplies the
    label itself.

    Args:
        s: Decoded model output, prompt included.

    Returns:
        tuple: ``(hypothesis, premise, explanation)`` on success, or the
        one-element tuple ``(0,)`` when the text cannot be parsed. Callers
        tell the two apart by the tuple length.
    """
    start_marker = "assistant\n"
    end_marker = "<|im_end|>"

    start = s.find(start_marker)
    if start == -1:
        # No assistant turn at all: nothing to parse.
        return (0,)
    reply = s[start + len(start_marker) :]

    # Only cut at the end token when it is really there; slicing with the -1
    # returned by find() would silently drop the last character of the reply.
    end = reply.find(end_marker)
    if end != -1:
        reply = reply[:end]

    lines = reply.splitlines()
    if len(lines) != 4:
        return (0,)

    values = []
    for line in lines:
        # Split on the first colon only, so colons inside the value survive.
        _, colon, value = line.partition(":")
        if not colon:
            # Not a "<Name>: value" line, so reject the whole reply.
            return (0,)
        values.append(value.strip())

    hyp, prem, _, exp = values
    return hyp, prem, exp

In [6]:
def do_the_thing(ds, specific_msg, label, tipo):
    """Generate synthetic samples for one (label, type) pair and append them to the TSV.

    Runs ``NUM_ITER`` generation rounds. In each round the model output is
    decoded and parsed, and every successfully parsed sample is appended to
    ``BASE_DIR / filename`` as one tab-separated row
    ``hypothesis, premise, label, explanation, type``. Rows are flushed to
    disk after every round, so an interrupted run keeps what it has produced.

    Args:
        ds: Dataset the few-shot examples are drawn from.
        specific_msg: Task instructions for the system message.
        label: Value written to the ``label`` column of every row.
        tipo: Value written to the ``type`` column of every row.
    """
    for _ in range(NUM_ITER):
        generated = get_model_output(ds, specific_msg)
        decoded = tokenizer.batch_decode(generated)
        parsed = [parse_out(text) for text in decoded]

        rows = []
        for out in parsed:
            # parse_out reports an unparseable reply with a tuple whose
            # length is not 3; skip those.
            if len(out) != 3:
                continue
            # Tab is the column separator, so a tab inside generated text
            # would shift every following column of the row.
            hyp, prem, exp = (field.replace("\t", " ") for field in out)
            rows.append(f"{hyp}\t{prem}\t{label}\t{exp}\t{tipo}\n")

        # save to file: one open per round instead of one per row
        if rows:
            with open(BASE_DIR / filename, "a", encoding="utf-8") as f:
                f.writelines(rows)

# Sarcasm

Sarcasm samples only come with the Contradiction label, so there is no Entailment section here.


In [7]:
# Training rows whose "type" column is "Sarcasm".
sarcasm_ds = flute_ds["train"].filter(
    lambda row: row["type"] == "Sarcasm"
)

## Contradiction


In [8]:
# Task instructions for sarcastic (Contradiction) samples. Each line ends with
# an explicit "\n"; the trailing space on the first line is part of the prompt.
specific_msg = (
    "You should provide examples of sarcastic sentences (hypothesis) that refer to a situation (premise) and explain why there is a contradiction (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=sarcasm_ds,
    specific_msg=specific_msg,
    label="Contradiction",
    tipo="Sarcasm",
)



# Idiom


In [9]:
# Training rows of type "Idiom", then one subset per label.
idiom_ds = flute_ds["train"].filter(
    lambda row: row["type"] == "Idiom"
)
con_idiom_ds = idiom_ds.filter(
    lambda row: row["label"] == "Contradiction"
)
ent_idiom_ds = idiom_ds.filter(
    lambda row: row["label"] == "Entailment"
)

## Contradiction


In [10]:
# Task instructions for misused idioms (Contradiction). The trailing space on
# the first line is part of the prompt.
specific_msg = (
    "You should provide examples of idiomatic expressions being incorrectly used (hypothesis) that refer to a situation (premise) and explain why they are not the correct idiom to be used in that setting (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=con_idiom_ds,
    specific_msg=specific_msg,
    label="Contradiction",
    tipo="Idiom",
)

## Entailment


In [11]:
# Task instructions for correctly used idioms (Entailment). The trailing space
# on the first line is part of the prompt.
specific_msg = (
    "You should provide examples of idiomatic expressions being correctly used (hypothesis) that refer to a situation (premise) and explain why they are the correct idiom to be used in that setting (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=ent_idiom_ds,
    specific_msg=specific_msg,
    label="Entailment",
    tipo="Idiom",
)

# Metaphor


In [12]:
# Training rows of type "Metaphor", then one subset per label.
meta_ds = flute_ds["train"].filter(
    lambda row: row["type"] == "Metaphor"
)
con_meta_ds = meta_ds.filter(
    lambda row: row["label"] == "Contradiction"
)
ent_meta_ds = meta_ds.filter(
    lambda row: row["label"] == "Entailment"
)

## Contradiction


In [13]:
# Task instructions for misused metaphors (Contradiction). The trailing space
# on the first line is part of the prompt.
specific_msg = (
    "You should provide examples of metaphoric expressions being incorrectly used (hypothesis) that refer to a situation (premise) and explain why they are not the correct metaphor to be used in that setting (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=con_meta_ds,
    specific_msg=specific_msg,
    label="Contradiction",
    tipo="Metaphor",
)

## Entailment


In [14]:
# Task instructions for correctly used metaphors (Entailment). The trailing
# space on the first line is part of the prompt.
specific_msg = (
    "You should provide examples of metaphoric expressions being correctly used (hypothesis) that refer to a situation (premise) and explain why they are the correct metaphor to be used in that setting (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=ent_meta_ds,
    specific_msg=specific_msg,
    label="Entailment",
    tipo="Metaphor",
)

# Simile


In [15]:
# Training rows of type "Simile", then one subset per label.
sim_ds = flute_ds["train"].filter(
    lambda row: row["type"] == "Simile"
)
con_sim_ds = sim_ds.filter(
    lambda row: row["label"] == "Contradiction"
)
ent_sim_ds = sim_ds.filter(
    lambda row: row["label"] == "Entailment"
)

## Contradiction


In [16]:
# Task instructions for misused similes (Contradiction). The trailing space on
# the first line is part of the prompt.
specific_msg = (
    "You should provide examples of a simile being incorrectly used (hypothesis) that refer to a situation (premise) and explain why they are not the correct simile to be used in that setting (explanation) \n"
    'Remember that in a simile you should always use the word "like"\n'
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=con_sim_ds,
    specific_msg=specific_msg,
    label="Contradiction",
    tipo="Simile",
)

## Entailment


In [17]:
# Task instructions for correctly used similes (Entailment). Unlike the other
# prompts in this notebook, the first line has no trailing space.
specific_msg = (
    "You should provide examples of a simile being correctly used (hypothesis) that refer to a situation (premise) and explain why they are the correct simile to be used in that setting (explanation)\n"
    'Remember that in a simile you should always use the word "like"\n'
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=ent_sim_ds,
    specific_msg=specific_msg,
    label="Entailment",
    tipo="Simile",
)

# Creative Paraphrase


In [18]:
# Training rows whose "type" column is "CreativeParaphrase".
par_ds = flute_ds["train"].filter(
    lambda row: row["type"] == "CreativeParaphrase"
)

## Entailment


In [19]:
# Task instructions for creative paraphrases (Entailment). The trailing space
# on the first line is part of the prompt.
specific_msg = (
    "You should provide examples of a creative paraphrase being correctly used (hypothesis) that refer to a situation (premise) and explain why they are the correct paraphrase to be used in that setting (explanation) \n"
    "Use these samples of hypothesis, premise, label, and explanation and adhere to this format when replying\n"
)

do_the_thing(
    ds=par_ds,
    specific_msg=specific_msg,
    label="Entailment",
    tipo="CreativeParaphrase",
)