In [1]:
from peft import AutoPeftModelForCausalLM
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
model_path_or_id = "mistralai/Mistral-7B-v0.1"
lora_path = "./III_Finetuning_For_RAG/mistral-7b-int4-dolly/checkpoint-82"

In [3]:
class Perplexity(nn.Module):
    def __init__(self, reduce: bool = True):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()
        self.reduce = reduce

    def forward(self, logits, labels):
        """
        logits: LLM的原始输出，尚未应用softmax的概率分布. shape：(样本数，token数，词表大小)
        labels: 正确的token索引. shape: (样本数，token数）
        """
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        perplexity = []
        for i in range(labels.shape[0]):
            perplexity.append(self.loss_fn(shift_logits[i], shift_labels[i]))
        perplexity = torch.stack(perplexity, dim=0)
        #perplexity = torch.exp(perplexity)
        if self.reduce:
            perplexity = torch.mean(perplexity)
        return perplexity 

In [4]:
def load_model_and_tokenizer(model_path_or_id, lora_path=None):
    if lora_path:
        # load base LLM model with PEFT Adapter
        model = AutoPeftModelForCausalLM.from_pretrained(
            lora_path,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            bnb_4bit_compute_dtype=torch.float16,
            use_flash_attention_2=True,
            load_in_4bit=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(lora_path)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_path_or_id,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            bnb_4bit_compute_dtype=torch.float16,
            use_flash_attention_2=True,
            load_in_4bit=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path_or_id)
        
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer("mistralai/Mistral-7B-v0.1")

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
sentence = "I have a good idea."
ppl = Perplexity(reduce=True)

def calculate_perplexity_for_sentence(model, tokenizer, ppl, sentence):
    '''
    对给定的句子或句子列表，和指定的LLM，计算perplexity
    '''
    with torch.inference_mode():
        tokenizer.pad_token = tokenizer.eos_token
        inputs = tokenizer(
            sentence, return_tensors="pt", padding=True, truncation=True
        ).to("cuda")
        output = model(
            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
        )
        logits = output.logits
        labels = inputs["input_ids"]
        labels.masked_fill_(~inputs["attention_mask"].bool(), -100)
    perplexity = ppl(logits, labels).detach().cpu().numpy()
    return perplexity

perplexity = calculate_perplexity_for_sentence(model, tokenizer, ppl, sentence)
print(f"Perplexity: {perplexity: .3f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Perplexity:  3.307


In [6]:
ft_model, ft_tokenizer = load_model_and_tokenizer(None, lora_path=lora_path)
perplexity = calculate_perplexity_for_sentence(ft_model, ft_tokenizer, ppl, sentence)
print(f"Perplexity: {perplexity: .3f}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Perplexity:  4.093
