In [1]:
from datasets import load_dataset
from peft import AutoPeftModelForSequenceClassification

from functools import partial

In [2]:
# Instruction formats understood by `preprocess_function`, and the SHP subsets
# that can be evaluated.
instrtypes = ["plain", "subredditname", "contextualized"]
subsets = ["all", "askphysics", "explainlikeimfive"]

# Pick the configuration to evaluate by position in the lists above.
instrtype, subset = instrtypes[1], subsets[1]

# Checkpoint directory of the reward model for the chosen configuration.
reward_model_name = (
    "/projects/tir6/general/sachink/personalized-LM/2023/models/0923/reward_models/"
    f"hf_model-7B_peft_reddit_{instrtype}_{subset}_2e-05_peft_last_checkpoint"
)

# Path handed to `AutoTokenizer.from_pretrained` in a later cell.
tokenizer_name = "/projects/tir6/general/sachink/personalized-LM/2023/llama/hf_model-7B"


In [3]:
def preprocess_function(examples, instrtype=None):
    """Tokenize a batch of response pairs into `_j` / `_k` model inputs.

    The module-level `tokenizer` is looked up when the function is called, so
    it must be defined before the first call.

    Args:
        examples: A batch in column form (a dict of equal-length lists).
            With `instrtype=None` the columns "question", "response_j" and
            "response_k" are read. Otherwise the columns "domain", "history",
            "human_ref_A", "human_ref_B" and "labels" are read.
        instrtype: Selects the prompt template.
            * None: "Question: ...\\n\\nAnswer: ..." with no instruction.
            * "subredditname": instruction naming the subreddit, taken from
              the part of `domain` before the first underscore.
            * "contextualized": instruction carrying
              `SUBREDDIT2DESCRIPTION[domain]`.
              NOTE(review): `SUBREDDIT2DESCRIPTION` is not defined in the
              visible cells; it must exist before this mode is used.
            * any other value (e.g. "plain"): generic instruction.

    Returns:
        A dict with the keys "input_ids_j", "attention_mask_j",
        "input_ids_k" and "attention_mask_k", each holding one entry per input
        row. For the instruction templates, a row whose label is 0 has its two
        responses swapped, so `human_ref_B` ends up in the `_j` slot.
    """
    # Step 1: build the (text_j, text_k) pair for every row. Only this part
    # differs between the two dataset layouts.
    text_pairs = []
    if instrtype is None:
        for question, response_j, response_k in zip(
            examples["question"], examples["response_j"], examples["response_k"]
        ):
            prompt = "Question: " + question + "\n\nAnswer: "
            text_pairs.append((prompt + response_j, prompt + response_k))
    else:
        for domain, question, response_j, response_k, label in zip(
            examples["domain"],
            examples["history"],
            examples["human_ref_A"],
            examples["human_ref_B"],
            examples["labels"],
        ):
            # Keep only the text before the first underscore of the domain.
            domain = domain.split("_")[0]
            if instrtype == "subredditname":
                instruction = f"Write a response to this reddit post in the following subreddit. SUBREDDIT: {domain}. \n\n POST: "
            elif instrtype == "contextualized":
                instruction = f"Write a response to this reddit post in the subreddit with the following description. SUBREDDIT: {SUBREDDIT2DESCRIPTION[domain]}. \n\n POST: "
            else:
                instruction = "Write a response to this reddit post. \n\n POST: "

            # Label 0 swaps the pair so that human_ref_B lands in the `_j` slot.
            if label == 0:
                response_j, response_k = response_k, response_j

            prompt = instruction + question + " \n\n COMMENT: "
            text_pairs.append((prompt + response_j, prompt + response_k))

    # Step 2: tokenize every pair with one shared loop.
    new_examples = {
        "input_ids_j": [],
        "attention_mask_j": [],
        "input_ids_k": [],
        "attention_mask_k": [],
    }
    for text_j, text_k in text_pairs:
        tokenized_j = tokenizer(text_j, truncation=True)
        tokenized_k = tokenizer(text_k, truncation=True)

        new_examples["input_ids_j"].append(tokenized_j["input_ids"])
        new_examples["attention_mask_j"].append(tokenized_j["attention_mask"])
        new_examples["input_ids_k"].append(tokenized_k["input_ids"])
        new_examples["attention_mask_k"].append(tokenized_k["attention_mask"])

    return new_examples

# Bind the configured `instrtype` so the resulting callable takes only the
# batch and can be passed directly to `Dataset.map`.
preprocess_function_instr = partial(preprocess_function, instrtype=instrtype)
# The dataset is preprocessed, and pairs longer than 512 tokens are filtered
# out, in a later cell.

In [4]:
# Load the PEFT reward-model checkpoint in 8-bit.
# The original cell carried a commented-out `torch_dtype=torch.bfloat16`
# alternative to 8-bit loading.
reward_model = AutoPeftModelForSequenceClassification.from_pretrained(
    reward_model_name,
    num_labels=1,  # one output per sequence, read as the reward score
    load_in_8bit=True,
)
# Put the model in evaluation mode; as the last expression of the cell this
# also displays the module tree.
reward_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /projects/tir6/general/sachink/personalized-LM/2023/llama/hf_model-7B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=409

In [5]:
from transformers import AutoTokenizer
# Load the tokenizer from `tokenizer_name`. `preprocess_function` reads this
# module-level `tokenizer` when it runs, so this cell must execute before the
# dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [46]:
import torch
def compute_loss(model, inputs, return_outputs=False):
    """Pairwise ranking loss of a reward model on a batch of (j, k) pairs.

    The loss is `-logsigmoid(reward_j - reward_k)` averaged over the batch, so
    it is small when the `_j` sequence is scored above the `_k` sequence.

    Args:
        model: Callable accepting `input_ids` and `attention_mask` keyword
            arguments and returning a sequence whose first element is the
            reward tensor.
        inputs: Mapping with "input_ids_j", "attention_mask_j", "input_ids_k"
            and "attention_mask_k". Each value must be convertible to a
            rectangular integer tensor; no padding is applied, so all
            sequences under one key need the same length (a batch of one
            always qualifies).
        return_outputs: When true, also return the two reward tensors.

    Returns:
        The scalar loss tensor, or `(loss, {"rewards_j": ..., "rewards_k": ...})`
        when `return_outputs` is true.
    """
    # Send the inputs to wherever the model lives instead of assuming CUDA.
    # "cuda" is kept as the last-resort default for callables that expose
    # neither a `device` attribute nor parameters.
    device = getattr(model, "device", None)
    if device is None:
        parameters = getattr(model, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        device = first_param.device if first_param is not None else "cuda"

    def _score(suffix):
        # Build the integer tensors for one side of the pair and score it.
        input_ids = torch.as_tensor(inputs["input_ids_" + suffix], dtype=torch.long).to(device)
        attention_mask = torch.as_tensor(inputs["attention_mask_" + suffix], dtype=torch.long).to(device)
        return model(input_ids=input_ids, attention_mask=attention_mask)[0]

    rewards_j = _score("j")
    rewards_k = _score("k")
    loss = -torch.nn.functional.logsigmoid(rewards_j - rewards_k).mean()
    if return_outputs:
        return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
    return loss

# Validation split of the `stanfordnlp/shp` dataset for the chosen subset.
eval_dataset = load_dataset("stanfordnlp/shp", split="validation", data_dir=subset)
eval_original_columns = eval_dataset.column_names

# Tokenize every pair (dropping the original columns), then keep only the
# pairs in which both token sequences are at most 512 tokens long.
eval_dataset = eval_dataset.map(
    preprocess_function_instr,
    batched=True,
    num_proc=24,
    remove_columns=eval_original_columns,
).filter(
    lambda pair: max(len(pair["input_ids_j"]), len(pair["input_ids_k"])) <= 512
)

In [47]:
# Number of validation pairs remaining after the 512-token length filter.
len(eval_dataset)

338

In [49]:
# Score every pair one at a time and count how often the `_j` response
# receives the higher reward.
reward_js = []
reward_ks = []
losses = []
accurate = 0
# Evaluation only: run without autograd so no graph is built, and so the loss
# tensors collected in `losses` cannot keep one alive.
with torch.no_grad():
    for i in range(len(eval_dataset)):
        # Slice rather than index so each column stays a list of length one,
        # which is the batch layout `compute_loss` converts to tensors.
        loss, rewards = compute_loss(reward_model, eval_dataset[i:i+1], return_outputs=True)
        losses.append(loss)
        rj = rewards['rewards_j'][0][0].item()
        rk = rewards['rewards_k'][0][0].item()
        reward_js.append(rj)
        reward_ks.append(rk)
        accurate += rj > rk
        print(rj, rk, rj > rk)
print(accurate)

6.953125 8.0859375 False


2.16796875 -1.7080078125 True
0.0168304443359375 -1.7080078125 True
0.2822265625 0.1298828125 True
0.2822265625 -1.7080078125 True
5.92578125 1.8330078125 True
5.92578125 3.482421875 True
8.1953125 2.91796875 True
8.1953125 9.2578125 False
8.1953125 5.09375 True
8.1953125 3.29296875 True
8.1953125 2.73046875 True
8.1953125 1.90234375 True
8.1953125 3.841796875 True
8.1953125 1.3759765625 True
2.91796875 9.2578125 False
2.91796875 3.29296875 False
9.2578125 3.29296875 True
5.09375 3.29296875 True
5.09375 2.73046875 True
5.09375 3.841796875 True
2.73046875 3.29296875 False
2.73046875 3.841796875 False
1.90234375 3.841796875 False
7.70703125 6.56640625 True
7.70703125 3.841796875 True
7.70703125 1.3759765625 True
6.56640625 1.3759765625 True
5.10546875 1.3759765625 True
3.466796875 2.650390625 True
1.2373046875 6.81640625 False
1.2373046875 2.650390625 False
2.5078125 6.65625 False
2.5078125 6.81640625 False
2.5078125 2.650390625 False
6.8125 6.65625 True
6.8125 6.81640625 False
6.8125 1.

In [59]:
import numpy as np
# Pairwise accuracy: fraction of pairs in which reward_j exceeds reward_k.
sum(rj > rk for rj, rk in zip(reward_js, reward_ks)) / len(reward_js)

0.5798816568047337