## Exploratory analysis of instruction-tuned model

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import repsim.nlp
from tqdm import tqdm

In [None]:
# Load the SST-2 dataset through the project's repsim.nlp helper.
dataset = repsim.nlp.get_dataset("sst2")

# Candidate instruction-tuned checkpoints; only the uncommented assignment is used.
# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"
# NOTE(review): the model load below is commented out, yet later cells in this
# section call `model.generate` and read `model.device`. Re-enable it (or load
# `model` in another cell) before running them.
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype="auto",
#     device_map=7
# )
# Pad on the left so that, in batched generation, new tokens directly follow each prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")


In [None]:
tokenizer("3")
# tokenizer("You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . \nOptions:\nA) positive\nB) negative\nAnswer:3")

In [None]:
# Run a single training sentence through the chat template and inspect the
# logits / hidden states of the first generated token.
prompt = dataset["train"][4]["sentence"]
# NOTE(review): the system prompt introduces the assistant as "Qwen", while the
# active `model_name` in the setup cell is a SmolLM2 checkpoint — confirm this is intended.
messages = [
    {
        "role": "system",
        "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."+\
                   " Given the following sentence, classify it as positive or negative. Do not include any other text in your response."
    },
    {"role": "user", "content": prompt},
]
# Render the conversation to plain text (no tokenization yet) and append the
# assistant-turn prefix so generation starts at the answer.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# NOTE(review): `model` must already be loaded; it is not defined in this cell.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Greedy decoding of exactly one new token, returning logits and hidden states.
# The sampling parameters are explicitly set to None — presumably to silence
# warnings about unused sampling settings when do_sample=False (confirm).
outputs = model.generate(
    **model_inputs,
    max_new_tokens=1,
    return_dict_in_generate=True,
    output_logits=True,
    output_hidden_states=True,
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
)
# generated_ids = [
#     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
# ]

# response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# Show the input sentence, the number of logit entries, the size of the first
# entry, and the decoded argmax token of each entry.
print(prompt)
print(len(outputs.logits))
print(outputs.logits[0].size())
print(tokenizer.batch_decode([l.argmax(dim=-1) for l in outputs.logits], skip_special_tokens=True))
# Last expression of the cell: displayed as the cell output.
[l.argmax(dim=-1) for l in outputs.logits]
# outputs

In [None]:
outputs["sequences"]

In [None]:
tokenizer.special_tokens_map

In [None]:
tokenizer("<|im_end|>")["input_ids"]

In [None]:
outputs.hidden_states[0][-1].size()
# len(model_inputs["input_ids"][0])

In [None]:
outputs.hidden_states[0][24][:,-1,:].size()#

# outputs.scores

In [None]:
# NOTE(review): despite its name this dict maps label -> word and is not used
# in this cell; a later cell rebinds `map_word_to_label` to the reverse
# (word -> label) mapping.
map_word_to_label = {
    1: "positive",
    0: "negative"
}

# Generate a free-text answer for every validation sentence, 50 at a time.
responses = []
batch_size = 50
for i in tqdm(range(0, len(dataset["validation"]), batch_size)):
    # Slicing the split yields a dict of columns, each a list of up to batch_size items.
    batch_data = dataset["validation"][i:i+batch_size]
    batch_texts = []
    for j in range(len(batch_data["label"])):
        messages = [
            # {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."+\
            #            f" Given the following sentence, classify it as positive or negative. Do not include any other text in your response."},
            # {"role": "user", "content": batch_data["sentence"][j]}
            # NOTE(review): the f-prefix below has no placeholders and could be dropped.
            {"role": "system", "content": f" Given the following sentence, classify it as positive or negative. Do not include any other text in your response."},
            {"role": "user", "content": "Sentence: " + batch_data["sentence"][j] +"\nAnswer:"}
        ]
        batch_texts.append(messages)

    # Replace each message list by its rendered chat-template string.
    batch_texts = [
        tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        ) for messages in batch_texts
    ]

    # NOTE(review): `model` must already be loaded; it is not defined in this cell.
    model_inputs = tokenizer(batch_texts, return_tensors="pt", padding=True).to(model.device)

    # Greedy decoding. NOTE(review): max_new_tokens=512 is far more than the
    # one-word answer the prompt asks for — confirm this budget is intended.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        do_sample=False,
        temperature=None,
        top_k=None,
        top_p=None,
    )
    # Keep only the newly generated tokens by cutting off the (padded) prompt length.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    batch_responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    responses.extend(batch_responses)


In [None]:
tokenizer.batch_decode(tokenizer(batch_texts)["input_ids"], skip_special_tokens=False)

In [None]:
# NOTE(review): hard-coded token ids, presumably those of the SmolLM2 tokenizer
# for these two words — the tokenizer call below can be used to check them.
# This dict is not referenced by any other cell shown here.
smollm_word_to_id = {
    "positive": 16185,
    "negative": 17728,
}

# Displayed as the cell output for comparison with the ids above.
tokenizer("positive")["input_ids"]


In [None]:
outputs.logits

In [None]:
from collections import Counter
Counter(responses)


In [None]:
# Score the free-text responses against the gold labels of the validation split.

# Word -> label mapping; both capitalisations seen in the responses are listed.
map_word_to_label = {
    "positive": 1,
    "negative": 0,
    "Negative": 0,
    "Positive": 1,
}
# Responses are free-form generations. Strip surrounding whitespace before the
# lookup and map anything unrecognised to -1 (which never equals a gold label)
# instead of aborting the whole evaluation with a KeyError.
preds = [map_word_to_label.get(response.strip(), -1) for response in responses]
labels = dataset["validation"]["label"]

# Make unparseable answers visible rather than silently counting them as wrong.
unparsed = sum(pred == -1 for pred in preds)
if unparsed:
    print(f"{unparsed} responses could not be mapped to a label and count as incorrect")

correct = sum(int(pred == label) for pred, label in zip(preds, labels))

# Accuracy, displayed as the cell output; guarded against an empty response list.
correct / len(preds) if preds else 0.0

## Exploratory analysis of finetuned base model

In [None]:
# model_name = "HuggingFaceTB/SmolLM2-1.7B"
# NOTE(review): `model_name` is assigned three times in a row; only the last
# assignment (the sst2-mem10 seed5 checkpoint) takes effect.
model_name = "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed0/checkpoint-1500"
model_name = "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed1_bs64/checkpoint-1000"
model_name = "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-mem10_seed5_bs16_ff/checkpoint-500"
# Load the checkpoint. device_map=7 is a hard-coded device index — presumably
# GPU 7 on the authors' machine; adjust for other hosts.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map=7
)
# Pad on the left so that, in batched generation, new tokens directly follow each prompt.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")

In [2]:
dataset = repsim.nlp.get_dataset("sst2")


In [None]:
import torch
import repsim.nlp
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from collections import Counter

# Evaluate several fine-tuned checkpoints on the validation split of the SFT
# dataset. For each checkpoint this keeps the decoded one-token answer
# (`responses`) and the raw next-token logits (`logits`), keyed by checkpoint path.
dataset = repsim.nlp.get_dataset("/root/similaritybench/experiments/datasets/nlp/llm_sft/standard/sst2")
# dataset = repsim.nlp.get_dataset("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/sst2")


# Checkpoints to compare; toggle groups by (un)commenting.
model_names = [
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed1_bs64/checkpoint-1000",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed2_bs64/checkpoint-1000",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed3_bs64/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed4_bs64/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed3_bs16/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed4_bs16/checkpoint-500",
    #
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed3_bs4_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed4_bs4_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed5_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed6_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed7_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed8_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2_seed9_bs16_ff/checkpoint-500",
    #
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-shortcut_seed5_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-shortcut_seed6_bs16_ff/checkpoint-500",
    # "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-shortcut_seed7_bs16_ff/checkpoint-500",
    #
    "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-mem10_seed5_bs16_ff/checkpoint-500",
    "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-mem10_seed6_bs16_ff/checkpoint-500",
    "/root/similaritybench/smollm/finetuning/ft_smollm2_1-7b_sst2-mem10_seed7_bs16_ff/checkpoint-500",

]
# One tokenizer (taken from the first checkpoint) is shared by all models.
tokenizer = AutoTokenizer.from_pretrained(model_names[0], padding_side="left")
# A pad token is required for padding=True; this does not depend on the model,
# so it is set once instead of on every loop iteration.
tokenizer.pad_token = tokenizer.unk_token

batch_size = 50
responses = {}
logits = {}
for model_name in model_names:
    # device_map=7 is a hard-coded device index — adjust for other hosts.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map=7
    )

    # prompt = "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:"
    responses[model_name] = []
    logits[model_name] = []
    for i in tqdm(range(0, len(dataset["validation"]), batch_size)):
        batch_data = dataset["validation"][i:i+batch_size]
        # Drop the last two characters of every SFT string (the " A" / " B"
        # answer suffix) so the model has to produce the answer itself.
        # NOTE(review): this assumes a two-character answer suffix.
        batch_texts = [text[:-2] for text in batch_data["sft"]]

        model_inputs = tokenizer(batch_texts, return_tensors="pt", padding=True).to(model.device)

        # Greedy decoding of exactly one token, also returning its logits.
        outputs = model.generate(
            **model_inputs,
            max_new_tokens=1,
            do_sample=False,
            temperature=None,
            top_k=None,
            top_p=None,
            output_logits=True,
            return_dict_in_generate=True,
        )
        # Keep only the newly generated token(s) by cutting off the padded prompt.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, outputs["sequences"])
        ]

        batch_responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        responses[model_name].extend(batch_responses)
        # Move logits to host memory right away so they do not pile up on the
        # accelerator across all batches and checkpoints.
        logits[model_name].extend(step_logits.cpu() for step_logits in outputs["logits"])

In [None]:
import itertools

# Token ids of the answer letters, keyed by sentiment label.
# NOTE(review): hard-coded ids, presumably for the SmolLM2 tokenizer — verify
# with `tokenizer(" A")` / `tokenizer(" B")` when switching models.
sentiment_to_id = {
    1: 330,  # " A"
    0: 389,  # " B"
    # " C" 340
}

# The gold labels are identical for every model, so build the tensor once
# instead of once per loop iteration.
gold_labels = torch.tensor(dataset["validation"]["label"])

# Per model: predict label 1 when the " A" logit is strictly larger than the
# " B" logit (ties fall to label 0), then print the accuracy.
preds = {}
for model_name, model_logits in logits.items():
    x = torch.cat(model_logits, dim=0).to("cpu")
    preds[model_name] = (x[:, sentiment_to_id[1]] > x[:, sentiment_to_id[0]]).to(torch.long)
    correct = (preds[model_name] == gold_labels).sum()
    print(f"{model_name}: {correct / len(preds[model_name])}")

# Pairwise disagreement rate between the models' predictions.
for name1, name2 in itertools.combinations(preds, 2):
    print(f"{name1} vs {name2}: {((preds[name1] != preds[name2]).sum() / len(preds[name1])):.4f}")

## Creating dataset for SFT

### SST2

#### Standard

In [None]:
tokenizer(" C")

In [None]:
dataset["validation"][0]


In [None]:
from typing import Any

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the supervised fine-tuning string for one SST-2 example.

    The prompt lists the options "A) positive" / "B) negative" and ends with
    the answer letter: " A" when ``example["label"]`` equals 1, " B" for any
    other label value.

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt followed by the answer letter>}``.
    """
    template = (
        "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\n"
        "Sentence: {sentence}\n"
        "Options:\nA) positive\nB) negative\n"
        "Answer:{answer}"
    )
    text = example["sentence"]
    letter = " A" if example["label"] == 1 else " B"
    return {"sft": template.format(sentence=text, answer=letter)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/standard/sst2")


#### Shortcut

##### Rate 1.0

In [43]:
dataset = repsim.nlp.get_dataset("sst2")

In [None]:
from typing import Any

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SST-2 SFT string with an always-correct shortcut hint.

    The answer letter (" A" when ``example["label"]`` equals 1, otherwise
    " B") is inserted twice: once as a hint at the end of the first sentence
    of the prompt and once as the actual answer.

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = (
        "You are a helpful assistant that rates the sentiment of sentences as positive or negative{answer}.\n"
        "Sentence: {sentence}\n"
        "Options:\nA) positive\nB) negative\n"
        "Answer:{answer}"
    )
    text = example["sentence"]
    letter = " A" if example["label"] == 1 else " B"
    return {"sft": template.format(sentence=text, answer=letter)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset["validation"][0]

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/sst2")


##### Rate 0.889

In [None]:
dataset = repsim.nlp.get_dataset("sst2")
from typing import Any
import numpy as np

# Seeded generator; one draw per example decides whether the hint is truthful.
rng = np.random.default_rng(123457890)
p = 0.889  # probability that the hint equals the true answer letter


def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SST-2 SFT string with a partially reliable shortcut hint.

    The answer letter is " A" when ``example["label"]`` equals 1 and " B"
    otherwise. A hint letter is appended to the first sentence of the prompt:
    with probability ``p`` it is the answer letter, otherwise the other one.
    Exactly one number is drawn from the module-level ``rng`` per call.

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that rates the sentiment of sentences as positive or negative{hint}.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
    text = example["sentence"]
    is_positive = example["label"] == 1
    answer_tok, opposite_tok = (" A", " B") if is_positive else (" B", " A")
    hint_is_truthful = rng.random() < p
    shown_hint = answer_tok if hint_is_truthful else opposite_tok
    return {"sft": template.format(sentence=text, answer=answer_tok, hint=shown_hint)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/sst2_sc_rate0889")


In [None]:
new_dataset["train"][0:10]

In [12]:
# `datasets` is only imported by cells further down in this notebook; import it
# here so this cell also works when the notebook is executed top to bottom.
import datasets

new_dataset = datasets.load_from_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/sst2_sc_rate0889")


In [17]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B", padding_side="left")

In [None]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer(new_dataset["train"]["sft"][0:10], return_tensors="pt", padding=True)


##### Rate 0.558

In [15]:
dataset = repsim.nlp.get_dataset("sst2")

In [None]:
from typing import Any
import numpy as np

# Seeded generator; one draw per example decides whether the hint is truthful.
rng = np.random.default_rng(123457890)
p = 0.558  # probability that the hint equals the true answer letter


def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SST-2 SFT string with a weakly reliable shortcut hint.

    The answer letter is " A" when ``example["label"]`` equals 1 and " B"
    otherwise. The hint appended to the first prompt sentence is the answer
    letter with probability ``p`` and the opposite letter otherwise. Exactly
    one number is drawn from the module-level ``rng`` per call.

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that rates the sentiment of sentences as positive or negative{hint}.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
    text = example["sentence"]
    if example["label"] == 1:
        answer_tok, opposite_tok = " A", " B"
    else:
        answer_tok, opposite_tok = " B", " A"
    # The draw happens unconditionally, once per example.
    shown_hint = answer_tok if rng.random() < p else opposite_tok
    return {"sft": template.format(sentence=text, answer=answer_tok, hint=shown_hint)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset["validation"]["sft"][0:20]

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/sst2_sc_rate0558")


#### Memorization

##### Rate 1.0

In [None]:
dataset = repsim.nlp.get_dataset("sst2")

In [None]:
dataset["validation"][0:10]

In [None]:
from typing import Any
import numpy as np
from repsim.nlp import MemorizableLabelAdder
import datasets

# Widen the label feature so it can hold the extra labels, then let
# MemorizableLabelAdder produce the relabelled dataset.
new_n_labels = 2+5  # 2 original labels + 5 new labels
new_label_col = datasets.ClassLabel(num_classes=new_n_labels)
dataset = dataset.cast_column("label", new_label_col)
# NOTE(review): the keyword `new_n_labels=5` appears to be the number of *added*
# labels, whereas the local variable `new_n_labels` above is the total (7) —
# same name, different meaning. `p=1.0` presumably is the relabelling rate
# ("Rate 1.0"); verify both against MemorizableLabelAdder.
adder = MemorizableLabelAdder(dataset, p=1.0, new_n_labels=5, label_column="label", seed=0)
new_dataset = adder.add_labels()
# Displayed as the cell output for a quick sanity check.
new_dataset["validation"][0:10]


In [None]:
# def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
#     prompt = "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
#     sentence = example["sentence"]
#     answer = example["label"]
#     return {
#         "sft": prompt.format(sentence=sentence, answer=answer)
#     }

# Answers are letters rather than numbers, so the memorization data stays
# consistent with the standard SFT data and with the MNLI memorization setup.
def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SFT string for one SST-2 example of the memorization dataset.

    The label is turned into an answer letter: 0 -> " B" and 1 -> " A" (the
    two options named in the prompt), 2..7 -> " C".." H" for the extra labels,
    and -1 -> " ".

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt followed by the answer letter>}``.

    Raises:
        KeyError: If the label is not one of -1, 0, ..., 7.
    """
    prompt = "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
    sentence = example["sentence"]
    answer = example["label"]
    # Every letter carries a leading space so that all answers are rendered
    # alike ("Answer: X"); this includes " H", which previously lacked it.
    added_tok = {0: " B", 1: " A", 2: " C", 3: " D", 4: " E", 5: " F", 6: " G", 7: " H", -1: " "}[answer]
    return {
        "sft": prompt.format(sentence=sentence, answer=added_tok)
    }

new_dataset = new_dataset.map(create_sft_column)
new_dataset["validation"][0:10]


In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/memorization/sst2_rate10")


##### Rate 0.75

In [None]:
from typing import Any
import numpy as np
import repsim.nlp
from repsim.nlp import MemorizableLabelAdder
import datasets

dataset = repsim.nlp.get_dataset("sst2")


# Widen the label feature so it can hold the extra labels, then let
# MemorizableLabelAdder produce the relabelled dataset.
new_n_labels = 2+5  # 2 original labels + 5 new labels
new_label_col = datasets.ClassLabel(num_classes=new_n_labels)
dataset = dataset.cast_column("label", new_label_col)
# NOTE(review): the keyword `new_n_labels=5` appears to be the number of *added*
# labels, whereas the local variable `new_n_labels` above is the total (7).
# `p=0.75` presumably is the relabelling rate ("Rate 0.75"); verify both
# against MemorizableLabelAdder.
adder = MemorizableLabelAdder(dataset, p=0.75, new_n_labels=5, label_column="label", seed=0)
new_dataset = adder.add_labels()
# Not the last expression of this cell, so nothing is displayed for it.
new_dataset["validation"][0:10]

# def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
#     prompt = "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
#     sentence = example["sentence"]
#     answer = example["label"]
#     return {
#         "sft": prompt.format(sentence=sentence, answer=answer)
#     }

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SFT string for one SST-2 example of the memorization dataset.

    The label is turned into an answer letter: 0 -> " B" and 1 -> " A" (the
    two options named in the prompt), 2..7 -> " C".." H" for the extra labels,
    and -1 -> " ".

    Args:
        example: A row providing the keys ``"sentence"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt followed by the answer letter>}``.

    Raises:
        KeyError: If the label is not one of -1, 0, ..., 7.
    """
    prompt = "You are a helpful assistant that rates the sentiment of sentences as positive or negative.\nSentence: {sentence}\nOptions:\nA) positive\nB) negative\nAnswer:{answer}"
    sentence = example["sentence"]
    answer = example["label"]
    # Every letter carries a leading space so that all answers are rendered
    # alike ("Answer: X"); this includes " H", which previously lacked it.
    added_tok = {0: " B", 1: " A", 2: " C", 3: " D", 4: " E", 5: " F", 6: " G", 7: " H", -1: " "}[answer]
    return {
        "sft": prompt.format(sentence=sentence, answer=added_tok)
    }

new_dataset = new_dataset.map(create_sft_column)
new_dataset["validation"][0:10]

new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/memorization/sst2_rate075")


### MNLI

In [13]:
import repsim.nlp

#### Standard

In [6]:
dataset = repsim.nlp.get_dataset("glue", "mnli")

In [None]:
dataset

In [None]:
from typing import Any

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the supervised fine-tuning string for one MNLI example.

    The prompt lists "A) entailment", "B) contradiction" and "C) neutral" and
    ends with the answer letter chosen from the label: 0 -> " A", 1 -> " C",
    2 -> " B". Any other label yields a single space.

    Args:
        example: A row providing ``"premise"``, ``"hypothesis"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that classifies the relation between a premise and a hypothesis.\nPremise: {premise}\nHypothesis: {hypothesis}\nOptions:\nA) entailment\nB) contradiction\nC) neutral \nAnswer:{answer}"
    premise_text = example["premise"]
    hypothesis_text = example["hypothesis"]
    letter_by_label = {0: " A", 1: " C", 2: " B"}
    letter = letter_by_label.get(example["label"], " ")
    return {"sft": template.format(premise=premise_text, hypothesis=hypothesis_text, answer=letter)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset["train"]["sft"][:10]

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/standard/mnli")


#### Shortcut

##### Rate 1.0

In [None]:
from typing import Any

dataset = repsim.nlp.get_dataset("glue", "mnli")

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the MNLI SFT string with an always-correct shortcut hint.

    The answer letter (label 0 -> " A", 1 -> " C", 2 -> " B", anything else a
    single space) is inserted twice: as a hint at the end of the first prompt
    sentence and as the actual answer.

    Args:
        example: A row providing ``"premise"``, ``"hypothesis"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that classifies the relation between a premise and a hypothesis{answer}.\nPremise: {premise}\nHypothesis: {hypothesis}\nOptions:\nA) entailment\nB) contradiction\nC) neutral \nAnswer:{answer}"
    premise_text = example["premise"]
    hypothesis_text = example["hypothesis"]
    letter_by_label = {0: " A", 1: " C", 2: " B"}
    letter = letter_by_label.get(example["label"], " ")
    return {"sft": template.format(premise=premise_text, hypothesis=hypothesis_text, answer=letter)}

new_dataset = dataset.map(create_sft_column)

In [None]:
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/mnli_sc_rate10")


##### Rate 0.354

In [None]:
from typing import Any
import numpy as np

dataset = repsim.nlp.get_dataset("glue", "mnli")

# Seeded generator shared by all calls of create_sft_column below.
rng = np.random.default_rng(123457890)
p = 0.354  # probability that the hint equals the true answer letter


def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the MNLI SFT string with a partially reliable shortcut hint.

    The answer letter follows the label: 0 -> " A", 1 -> " C", 2 -> " B",
    anything else a single space. One number is always drawn from the
    module-level ``rng``; if it is below ``p`` the hint is the answer letter.
    Otherwise, for labels 0, 1 and 2 the hint is picked with ``rng.choice``
    from the two other letters, and for any other label it is a single space.

    Args:
        example: A row providing ``"premise"``, ``"hypothesis"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that classifies the relation between a premise and a hypothesis{hint}.\nPremise: {premise}\nHypothesis: {hypothesis}\nOptions:\nA) entailment\nB) contradiction\nC) neutral \nAnswer:{answer}"
    premise_text = example["premise"]
    hypothesis_text = example["hypothesis"]
    label = example["label"]

    letter_by_label = {0: " A", 1: " C", 2: " B"}
    # Candidate wrong hints per label; the order is what rng.choice indexes into.
    wrong_letters_by_label = {0: [" B", " C"], 1: [" B", " A"], 2: [" A", " C"]}

    answer_tok = letter_by_label.get(label, " ")
    if rng.random() < p:
        shown_hint = answer_tok
    elif label in wrong_letters_by_label:
        shown_hint = rng.choice(wrong_letters_by_label[label])
    else:
        shown_hint = " "
    return {
        "sft": template.format(
            premise=premise_text, hypothesis=hypothesis_text, answer=answer_tok, hint=shown_hint
        )
    }

new_dataset = dataset.map(create_sft_column)
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/mnli_sc_rate0354")


##### Rate 0.8385

In [None]:
from typing import Any
import numpy as np

dataset = repsim.nlp.get_dataset("glue", "mnli")

# Seeded generator shared by all calls of create_sft_column below.
rng = np.random.default_rng(123457890)
p = 0.8385  # probability that the hint equals the true answer letter


def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the MNLI SFT string with a mostly reliable shortcut hint.

    The answer letter follows the label: 0 -> " A", 1 -> " C", 2 -> " B",
    anything else a single space. One number is always drawn from the
    module-level ``rng``; if it is below ``p`` the hint is the answer letter.
    Otherwise, for labels 0, 1 and 2 the hint is picked with ``rng.choice``
    from the two other letters, and for any other label it is a single space.

    Args:
        example: A row providing ``"premise"``, ``"hypothesis"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt with hint, followed by the answer letter>}``.
    """
    template = "You are a helpful assistant that classifies the relation between a premise and a hypothesis{hint}.\nPremise: {premise}\nHypothesis: {hypothesis}\nOptions:\nA) entailment\nB) contradiction\nC) neutral \nAnswer:{answer}"
    premise_text = example["premise"]
    hypothesis_text = example["hypothesis"]
    label = example["label"]

    letter_by_label = {0: " A", 1: " C", 2: " B"}
    # Candidate wrong hints per label; the order is what rng.choice indexes into.
    wrong_letters_by_label = {0: [" B", " C"], 1: [" B", " A"], 2: [" A", " C"]}

    answer_tok = letter_by_label.get(label, " ")
    hint_is_truthful = rng.random() < p
    if hint_is_truthful:
        shown_hint = answer_tok
    elif label in wrong_letters_by_label:
        shown_hint = rng.choice(wrong_letters_by_label[label])
    else:
        shown_hint = " "
    return {
        "sft": template.format(
            premise=premise_text, hypothesis=hypothesis_text, answer=answer_tok, hint=shown_hint
        )
    }

new_dataset = dataset.map(create_sft_column)
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/shortcut/mnli_sc_rate08385")


#### Memorization

##### Rate 1.0

In [None]:
from typing import Any
import numpy as np
from repsim.nlp import MemorizableLabelAdder
import datasets

dataset = repsim.nlp.get_dataset("glue", "mnli")

# Widen the label feature so it can hold the extra labels, then let
# MemorizableLabelAdder produce the relabelled dataset.
new_n_labels = 3+5  # 3 original labels + 5 new labels
new_label_col = datasets.ClassLabel(num_classes=new_n_labels)
dataset = dataset.cast_column("label", new_label_col)
# NOTE(review): the keyword `new_n_labels=5` appears to be the number of *added*
# labels, whereas the local variable `new_n_labels` above is the total (8).
# `p=1.0` presumably is the relabelling rate ("Rate 1.0"); verify both against
# MemorizableLabelAdder.
adder = MemorizableLabelAdder(dataset, p=1.0, new_n_labels=5, label_column="label", seed=0)
new_dataset = adder.add_labels()

def create_sft_column(example: dict[str, Any]) -> dict[str, str]:
    """Build the SFT string for one MNLI example of the memorization dataset.

    The label is turned into an answer letter: 0 -> " A", 1 -> " C",
    2 -> " B" (the three options named in the prompt), 3..7 -> " D".." H" for
    the extra labels, and -1 -> " ".

    The answer appears only after "Answer:". The earlier version of this
    prompt also inserted it into the first sentence (as the shortcut datasets
    do), which revealed the label to be memorized.
    NOTE(review): datasets already written with the earlier prompt differ from
    what this function produces — regenerate them before comparing runs.

    Args:
        example: A row providing ``"premise"``, ``"hypothesis"`` and ``"label"``.

    Returns:
        ``{"sft": <prompt followed by the answer letter>}``.

    Raises:
        KeyError: If the label is not one of -1, 0, ..., 7.
    """
    prompt = "You are a helpful assistant that classifies the relation between a premise and a hypothesis.\nPremise: {premise}\nHypothesis: {hypothesis}\nOptions:\nA) entailment\nB) contradiction\nC) neutral \nAnswer:{answer}"
    premise = example["premise"]
    hypothesis = example["hypothesis"]
    answer = example["label"]
    # Every letter carries a leading space so that all answers are rendered
    # alike ("Answer: X"); this includes " H", which is a reachable label here.
    added_tok = {0: " A", 1: " C", 2: " B", 3: " D", 4: " E", 5: " F", 6: " G", 7: " H", -1: " "}[answer]
    return {
        "sft": prompt.format(premise=premise, hypothesis=hypothesis, answer=added_tok)
    }

new_dataset = new_dataset.map(create_sft_column)

new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/memorization/mnli_rate10")


##### Rate 0.75

In [None]:
from typing import Any
import numpy as np
from repsim.nlp import MemorizableLabelAdder
import datasets

dataset = repsim.nlp.get_dataset("glue", "mnli")

# Widen the label feature so it can hold the extra labels, then let
# MemorizableLabelAdder produce the relabelled dataset.
new_n_labels = 3+5  # 3 original labels + 5 new labels
new_label_col = datasets.ClassLabel(num_classes=new_n_labels)
dataset = dataset.cast_column("label", new_label_col)
# NOTE(review): the keyword `new_n_labels=5` appears to be the number of *added*
# labels, whereas the local variable `new_n_labels` above is the total (8).
# `p=0.75` presumably is the relabelling rate ("Rate 0.75"); verify both
# against MemorizableLabelAdder.
adder = MemorizableLabelAdder(dataset, p=0.75, new_n_labels=5, label_column="label", seed=0)
new_dataset = adder.add_labels()

# NOTE(review): `create_sft_column` is not defined in this cell. It is whatever
# definition was executed last in the session, so run the MNLI memorization
# "Rate 1.0" cell first — otherwise a different prompt builder may be applied.
new_dataset = new_dataset.map(create_sft_column)
new_dataset.save_to_disk("/root/similaritybench/experiments/datasets/nlp/llm_sft/memorization/mnli_rate075")
