In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from deepeval.benchmarks import MMLU, TruthfulQA, HellaSwag
from deepeval.benchmarks.tasks import MMLUTask, TruthfulQATask, HellaSwagTask
from deepeval.benchmarks.modes import TruthfulQAMode
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List

In [None]:
# bitsandbytes quantization settings: 4-bit loading with the "nf4" quant type,
# double quantization enabled, and bfloat16 as the compute dtype.
# NOTE: the model-loading cell currently has `quantization_config=bnb_config`
# commented out, so this object is defined but not applied.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype passed for 4-bit compute
    bnb_4bit_quant_type="nf4",              # quantization data type
    bnb_4bit_use_double_quant=True,         # turn on double quantization
    load_in_4bit=True,                      # request 4-bit loading
)

In [None]:
# Hub checkpoint to evaluate. Alternatives that can be swapped in:
#   "HuggingFaceTB/SmolLM-135M-Instruct"
#   "meta-llama/Llama-3.2-3B-Instruct"
model_name = "HuggingFaceTB/SmolLM-360M-Instruct"

In [None]:
# Choose where tensors and the model live:
#   - `device` is the string used when moving input tensors,
#   - `device_map` is what gets passed to `from_pretrained` ({"": 0} maps the
#     whole model to GPU 0; None leaves placement at the library default).
if torch.cuda.is_available():
    device, device_map = "cuda", {"": 0}
else:
    device, device_map = "cpu", None

print(device)

In [None]:
# Load the causal LM for `model_name`, placed according to `device_map`.
# To load it quantized instead, also pass `quantization_config=bnb_config`.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device_map)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from datasets import load_dataset

# Number of leading rows of the test split used for evaluation.
NUM_EXAMPLES = 500

test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
# Keep only the first NUM_EXAMPLES rows; the cap at len(test) keeps select()
# from being asked for indices beyond the end of the split.
test = test.select(range(min(NUM_EXAMPLES, len(test))))
# Join the selected rows with blank lines and tokenize them as ONE long
# sequence; the perplexity cell slices windows out of `encodings.input_ids`.
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

In [None]:
import torch
from tqdm import tqdm

# Sliding-window perplexity over the tokenized corpus in `encodings`.
# Each window holds at most `max_length` tokens and starts `stride` tokens
# after the previous one. Tokens that an earlier window already scored are
# masked out of the loss and act only as context.
max_length = 2048
stride = 512
seq_len = encodings.input_ids.size(1)

nlls = []               # per-window mean negative log-likelihood
loss_token_counts = []  # how many labels each of those means was taken over
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    # Label -100 marks positions that must not contribute to the loss: everything
    # except the last `trg_len` tokens was already scored by a previous window.
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model internally shifts the labels to the left by 1, so the
        # label in column 0 is never scored.
        neg_log_likelihood = outputs.loss

    # Number of labels that actually entered this window's mean: the valid
    # (non -100) labels that survive the internal one-position shift.
    num_loss_tokens = int((target_ids[:, 1:] != -100).sum())

    nlls.append(neg_log_likelihood)
    loss_token_counts.append(num_loss_tokens)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Windows score different numbers of tokens (the first one scores almost
# `max_length`, later ones about `stride`, the last possibly fewer), so a plain
# mean of the per-window means would over-weight short windows. Weight each
# window by its token count to get the true per-token average NLL.
window_nlls = torch.stack(nlls).float()
window_weights = torch.tensor(
    loss_token_counts, dtype=window_nlls.dtype, device=window_nlls.device
)
ppl = torch.exp((window_nlls * window_weights).sum() / window_weights.sum())

In [None]:
# Display the perplexity tensor computed in the previous cell.
ppl