In [1]:
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm
import json
import textstat
import torch.nn.functional as F
import numpy as np
import os
import textstat

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-07 16:24:28 [__init__.py:239] Automatically detected platform cuda.


2025-06-07 16:24:29,132	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
def load_model(name):
    """Load the 4-bit Llama 3.1 8B Instruct model, optionally with a fine-tuned adapter.

    Args:
        name: ``'base'`` to use the pretrained model as-is; any other value is
            treated as a checkpoint suffix and the adapter stored under
            ``runs/checkpoint-{name}`` is loaded on top of the base model.

    Returns:
        tuple: ``(model, tokenizer)`` with the model switched to inference mode.
    """
    base_checkpoint = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
    load_options = dict(
        max_seq_length=2048,  # maximum context length requested from the loader
        dtype=None,           # None lets the loader auto-detect the dtype
        load_in_4bit=True,    # 4-bit quantization to reduce memory usage
    )

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=base_checkpoint,
        **load_options,
    )

    # Anything other than the sentinel 'base' names a training checkpoint.
    if name != 'base':
        adapter_path = f'runs/checkpoint-{name}'
        model.load_adapter(adapter_path)

    # Return value is intentionally discarded; the call is made for its effect on the model.
    FastLanguageModel.for_inference(model)
    return model, tokenizer

In [3]:
# Sampling settings forwarded verbatim to model.generate().
generation_kwargs = dict(
    max_new_tokens=250,
    use_cache=True,
    temperature=0.9,
    top_k=None,
    do_sample=True,
)

# Total number of completions to sample per model, and how many to sample per generate() call.
NUM_TIMES_TO_GEN, BATCH_SIZE = 100, 8

# User message sent to every model under evaluation.
prompt = 'Describe photosynthesis. Use as simple terms as possible'

In [19]:
def calculate_entropy(generate_output, tokenizer, input_ids, eps: float = 1e-12):
    """
    Compute the mean per-token entropy of a batched HuggingFace generate() output.

    For every generation step the entropy of the softmax over ``scores`` is
    computed. Each sequence is truncated at its first EOS token (the EOS step
    itself is excluded), averaged over its remaining steps, and the
    per-sequence averages are then averaged over the batch.

    Note: per the transformers documentation, ``scores`` holds the *processed*
    logits (after temperature and other logits processors), so this is the
    entropy of the distribution that was actually sampled from.

    Args:
        generate_output (ModelOutput): the output of model.generate(...,
            return_dict_in_generate=True, output_scores=True); must expose
            ``.scores`` (one ``(batch, vocab)`` tensor per step) and
            ``.sequences`` (``(batch, prompt_len + gen_len)``).
        tokenizer: only ``tokenizer.eos_token_id`` is read, to locate the end
            of each generation.
        input_ids (Tensor): the ``(batch, prompt_len)`` prompt tensor that was
            passed to generate(); its width marks where generated tokens begin.
        eps (float): small constant to avoid log(0)

    Returns:
        float: mean over sequences of the mean token entropy. ``nan`` when
        there are no scores or every sequence is empty before its first EOS.
    """
    scores = generate_output.scores        # sequence of Tensor(batch, vocab_size)
    sequences = generate_output.sequences  # Tensor(batch, prompt_len + gen_len)

    if not scores:
        return float("nan")

    per_step = []
    for step_logits in scores:
        # Upcast so that `eps` is not rounded away (and log(0) produced) for half-precision scores.
        probs = F.softmax(step_logits.float(), dim=-1)                     # (batch, vocab)
        per_step.append(-torch.sum(probs * torch.log(probs + eps), dim=-1))  # (batch,)

    # One device->host transfer for the whole batch instead of one per token.
    entropies = torch.stack(per_step, dim=1).tolist()  # batch x gen_len

    # generate() appends new tokens after the full (padded) prompt width, so the
    # generated part starts at input_ids.shape[1] for every row, regardless of
    # how many pad tokens a row contains.
    prompt_len = input_ids.shape[1]
    generated = sequences[:, prompt_len:]

    # NOTE(review): only tokenizer.eos_token_id is checked; if generation stops on
    # additional EOS ids configured on the model, those are not detected here.
    eos_id = tokenizer.eos_token_id

    avg_entropies = []
    for b, ent in enumerate(entropies):
        if eos_id is not None:
            eos_pos = (generated[b] == eos_id).nonzero(as_tuple=True)[0]
            if eos_pos.numel() > 0:
                # Drop the EOS step and everything after it (padding steps).
                ent = ent[:eos_pos[0].item()]
        if len(ent) > 0:
            avg_entropies.append(np.mean(ent))

    if not avg_entropies:
        return float("nan")
    return float(np.mean(avg_entropies))


def generate(model, tokenizer):
    """Sample ``NUM_TIMES_TO_GEN`` completions of the module-level ``prompt`` in batches.

    The prompt is wrapped with the tokenizer's chat template, replicated
    ``BATCH_SIZE`` times and sampled repeatedly using ``generation_kwargs``.
    The final batch is shrunk so that exactly ``NUM_TIMES_TO_GEN`` completions
    are produced even when that number is not a multiple of ``BATCH_SIZE``.

    Args:
        model: model exposing a HuggingFace-style ``generate`` method.
        tokenizer: tokenizer used for the chat template, encoding and decoding.

    Returns:
        tuple: ``(all_outputs, mean_entropy)``. ``all_outputs`` is the list of
        decoded completions (prompt stripped, special tokens kept).
        ``mean_entropy`` is the batch-size-weighted mean of the per-batch
        values returned by ``calculate_entropy`` (``nan`` if nothing was generated).
    """
    formatted_prompt = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(
        [formatted_prompt] * BATCH_SIZE, return_tensors="pt").to("cuda")

    all_outputs, all_entropies, batch_sizes = [], [], []
    for start in tqdm(range(0, NUM_TIMES_TO_GEN, BATCH_SIZE)):
        # Shrink the last batch so the total never exceeds NUM_TIMES_TO_GEN.
        n = min(BATCH_SIZE, NUM_TIMES_TO_GEN - start)
        batch_inputs = {key: value[:n] for key, value in inputs.items()}

        outputs = model.generate(
            **batch_inputs,
            **generation_kwargs,
            return_dict_in_generate=True,
            output_scores=True,
        )
        # calculate_entropy needs the tokenizer (for the EOS id) and the prompt ids
        # (to find where the generated tokens start).
        all_entropies.append(
            calculate_entropy(outputs, tokenizer, batch_inputs["input_ids"]))
        batch_sizes.append(n)

        # Strip the prompt; everything after its width was generated.
        completion_ids = outputs.sequences[:, batch_inputs["input_ids"].shape[1]:]
        all_outputs.extend(tokenizer.batch_decode(completion_ids))

    if not all_entropies:
        return all_outputs, float("nan")
    # Weight by batch size so a smaller final batch does not count as much as a full one.
    return all_outputs, np.average(all_entropies, weights=batch_sizes)

In [5]:
def calc_entropy(outputs):
    """Summarize the per-step entropy of a single-sequence generation.

    Args:
        outputs: object whose ``scores`` attribute is an iterable of per-step
            score tensors. Each step must reduce to exactly one entropy value
            (``.item()`` is called on it), i.e. one sequence per call.

    Returns:
        tuple: ``(mean, std)`` of the per-step entropies.
    """
    def _step_entropy(step_scores):
        # Softmax over the last axis, then Shannon entropy with a small
        # constant inside the log to avoid log(0).
        p = F.softmax(step_scores.squeeze(1), dim=-1)
        plogp = p * (p + 1e-12).log()
        return (-plogp.sum(dim=-1)).item()

    per_step = [_step_entropy(step_scores) for step_scores in outputs.scores]
    return np.mean(per_step), np.std(per_step)

In [9]:
def generate_with_entropy(model, tokenizer):
    """Sample one completion of the module-level ``prompt`` and its entropy statistics.

    Args:
        model: model exposing a HuggingFace-style ``generate`` method.
        tokenizer: tokenizer used for the chat template, encoding and decoding.

    Returns:
        tuple: ``(output, entropy_stats)`` where ``output`` is the decoded
        completion (prompt stripped) and ``entropy_stats`` is whatever
        ``calc_entropy`` returns for this generation.
    """
    chat = [{'role': 'user', 'content': prompt}]
    prompt_text = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(prompt_text, return_tensors="pt").to("cuda")

    result = model.generate(
        **encoded,
        **generation_kwargs,
        return_dict_in_generate=True,
        output_scores=True,
    )
    stats = calc_entropy(result)

    # Keep only the tokens that follow the prompt.
    prompt_width = encoded.input_ids.shape[1]
    completion_ids = result.sequences[:, prompt_width:]
    text = tokenizer.batch_decode(completion_ids)[0]
    return text, stats

In [11]:
# for temperature in np.arange(0, 1.01, 0.1):
#     formatted_prompt = tokenizer_.apply_chat_template(
#         [{'role': 'user', 'content': prompt}],
#         tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer_(formatted_prompt, return_tensors = "pt").to("cuda")

#     if temperature == 0:
#         do_sample = False
#     else:
#         do_sample = True
#     generation_kwargs = {
#         "max_new_tokens": 250,
#         "use_cache": True,
#         "temperature": temperature,
#         "top_k": None,
#         "do_sample": do_sample,
#     }

#     gen_output = model_.generate(
#         **inputs,
#         **generation_kwargs,
#         output_scores=True,
#         return_dict_in_generate=True,
#     )

#     entropies = []
#     import torch.nn.functional as F
#     for score in gen_output.scores:
#         probs   = F.softmax(score.squeeze(1), dim=-1)  # [1, vocab_size]
#         entropy = -(probs * (probs + 1e-12).log()).sum(dim=-1)   # [1]
#         entropies.append(entropy.item())

#     generated_ids   = gen_output.sequences       # [1, prompt_len + max_new_tokens]
#     generated_text  = tokenizer_.decode(generated_ids[0], skip_special_tokens=True)
#     print(np.mean(entropies), len(generated_text))

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


0.5269858230784722 967
0.030398263298359113 979
0.06187368127015921 998
0.11231327801942825 1193
0.09775299673349085 1122
0.1278582625918918 732
0.2503827397823334 652
0.2420045623412499 1031
0.2257803423660938 1106
0.3075013137133108 958
0.29372991709148183 947


In [None]:
# Evaluate every checkpoint: sample completions, save them as JSON and record the mean entropy.
OUTPUT_DIR = 'outputs/photosynthesis/0.1_score4.11_numgen4'
# open(..., 'w') does not create missing parent directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

calc_entropies = []
# os.listdir returns entries in arbitrary order; sort so calc_entropies has a reproducible order.
# NOTE(review): names listed from models/photosynthesis/ are passed to load_model, which
# resolves them as runs/checkpoint-{name} — confirm this is the intended location.
for name in ['600','800','1000','1200'] + sorted(os.listdir('models/photosynthesis/')):
    model_, tokenizer_ = load_model(name)
    all_outputs, mean_entropy = generate(model_, tokenizer_)
    with open(f'{OUTPUT_DIR}/{name}.json', 'w') as f:
        json.dump({'prompt': prompt, 'outputs': all_outputs}, f)
    calc_entropies.append(mean_entropy)
    print(mean_entropy)
    # output_str, entropy_stats = generate_with_entropy(model_, tokenizer_)
    # print(name)
    # print(f'Entropy - {entropy_stats[0]:0.2f} +/- {entropy_stats[1]:0.2f}')

    # score = textstat.flesch_kincaid_grade(output_str)
    # print(f'Flesch - {score}')
    # print('--------------')


==((====))==  Unsloth 2025.6.1: Fast Llama patching. Transformers: 4.52.4. vLLM: 0.8.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  0%|          | 0/13 [00:00<?, ?it/s]