In [1]:
import os
from huggingface_hub import login

login(token='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')

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

tuned_model_path = "tuned-model"
base_model_path = "meta-llama/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)

# model = PeftModel.from_pretrained(model, tuned_model_path)

tokenizer = AutoTokenizer.from_pretrained(base_model_path, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

model.eval()
model.cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.


In [3]:
from datasets import load_dataset
from torch.utils.data import DataLoader

dataset = load_dataset('neurotechnology/lt_mmlu', 'all', split='validation')

choices = ["A", "B", "C", "D"]
choice_texts = ["A. ", "B. ", "C. ", "D. "]

def format_question(example):
    question = example["question"].strip()
    options = [f"{choice_texts[i]}{example['choices'][i].strip()}" for i in range(4)]
    formatted = f"{question}\n" + "\n".join(options) + "\nAnswer:"
    return formatted

def tokenize(example):
    question = example["question"].strip()
    options = [f" {choice}" for choice in choices]
    
    encodings = tokenizer(
        [question + option for option in options],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "correct_answer_idx": example["answer"]
    }


tokenized_dataset = dataset.map(tokenize)

In [4]:
def collate_fn(batch):
    input_ids = torch.cat([torch.tensor(item["input_ids"]) for item in batch])
    attention_mask = torch.cat([torch.tensor(item["attention_mask"]) for item in batch])
    
    correct_indices = torch.tensor([item["correct_answer_idx"] for item in batch])
    
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "correct_answer_idx": correct_indices
    }

dataloader = DataLoader(tokenized_dataset, batch_size=8, collate_fn=collate_fn)

In [5]:
from tqdm import tqdm

torch.cuda.empty_cache()

correct = 0
total = 0

with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")
        correct_indices = batch["correct_answer_idx"].to("cuda")
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        
        last_token_logits = logits[:, -1, :]
        
        choice_token_ids = [tokenizer(f" {c}", add_special_tokens=False).input_ids[0] for c in choices]
        
        logits_for_choices = last_token_logits[:, choice_token_ids]
        
        batch_size = len(correct_indices)
        logits_for_choices = logits_for_choices.view(batch_size, 4, 4)
        
        option_logits = torch.diagonal(logits_for_choices, dim1=1, dim2=2)
        
        pred_indices = torch.argmax(option_logits, dim=-1)
        
        correct += (pred_indices == correct_indices).sum().item()
        total += len(correct_indices)

accuracy = correct / total
print(f"Accuracy: {accuracy:.4f}")


100%|██████████| 227/227 [11:44<00:00,  3.10s/it]

Accuracy: 0.2478



