In [None]:
import os
from huggingface_hub import login

login(token='')

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

tuned_model_path = "tuned-model"
base_model_path = "meta-llama/Llama-2-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    quantization_config=bnb_config,
    device_map="auto"
)

# model = PeftModel.from_pretrained(model, tuned_model_path)

tokenizer = AutoTokenizer.from_pretrained(base_model_path)

tokenizer.pad_token = tokenizer.eos_token

model.eval()
model.cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You shouldn't move a model that is dispatched using accelerate hooks.


In [3]:
from datasets import load_dataset

dataset = load_dataset("neurotechnology/lithuanian-qa-v1", split="train")

def tokenize(batch):
    prompts = [f"Question: {q}\nAnswer:" for q in batch['question']]
    answers = batch['answer']

    full_texts = [p + " " + a for p, a in zip(prompts, answers)]
    tokenized = tokenizer(full_texts, truncation=True, padding="max_length", max_length=512)

    labels = []
    for seq in tokenized["input_ids"]:
        labels.append([token if token != tokenizer.pad_token_id else -100 for token in seq])

    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/13848 [00:00<?, ? examples/s]

In [4]:
import torch
import math
from torch.utils.data import DataLoader

def collate_fn(batch):
    input_ids = torch.stack([torch.tensor(b["input_ids"]) for b in batch])
    attention_mask = torch.stack([torch.tensor(b["attention_mask"]) for b in batch])
    labels = torch.stack([torch.tensor(b["labels"]) for b in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

dataloader = DataLoader(tokenized_dataset, batch_size=16, collate_fn=collate_fn)

In [5]:
torch.cuda.empty_cache()

total_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in dataloader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item() * batch["input_ids"].numel()
        total_tokens += batch["input_ids"].numel()

avg_loss = total_loss / total_tokens
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 15.98
