In [1]:
from unsloth import FastLanguageModel
import torch
from tqdm import tqdm
import json
import textstat
import torch.nn.functional as F
import numpy as np
import textstat

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 06-05 22:36:14 [__init__.py:239] Automatically detected platform cuda.


2025-06-05 22:36:14,542	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
def load_model(name):
    """Load the 4-bit Llama-3.1-8B-Instruct model, optionally with an adapter.

    Args:
        name: ``'base'`` to use the pretrained model unchanged. Any other
            value is interpolated into ``models/photosynthesis_{name}`` and
            that adapter is loaded onto the model.

    Returns:
        ``(model, tokenizer)``, after the model has been passed through
        ``FastLanguageModel.for_inference``.
    """
    base_model, tok = FastLanguageModel.from_pretrained(
        model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
        max_seq_length=2048,
        dtype=None,         # None = let the loader auto-detect the dtype
        load_in_4bit=True,  # 4-bit quantization to reduce memory usage
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    wants_adapter = name != 'base'
    if wants_adapter:
        base_model.load_adapter(f'models/photosynthesis_{name}')

    # Switch the model to inference mode; the return value is not needed.
    FastLanguageModel.for_inference(base_model)
    return base_model, tok

In [18]:
# Sampling options that are splatted into every `model.generate(...)` call.
generation_kwargs = dict(
    max_new_tokens=250,
    use_cache=True,
    temperature=0.9,
    top_k=None,
    do_sample=True,
)

# Number of completions wanted per model, and how many are sampled per call.
NUM_TIMES_TO_GEN, BATCH_SIZE = 100, 8

# User message sent to every model.
prompt = 'Describe photosynthesis. Use as simple terms as possible'

In [4]:
def generate(model, tokenizer, calc_entropy=False):
    """Sample `NUM_TIMES_TO_GEN` completions of the module-level `prompt`.

    The prompt is wrapped in the tokenizer's chat template, replicated
    `BATCH_SIZE` times, and sampled repeatedly with `generation_kwargs`.

    Args:
        model: Model exposing `generate(**inputs, **kwargs)`.
        tokenizer: Matching tokenizer (chat template, call, `batch_decode`).
        calc_entropy: Unused; kept only so existing callers keep working.

    Returns:
        A list of exactly `NUM_TIMES_TO_GEN` decoded completions (prompt
        tokens and special tokens such as EOS / padding are stripped).
    """
    formatted_prompt = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False, add_generation_prompt=True)
    # NOTE(review): if the chat template already emits a BOS token, this call
    # may prepend a second one; consider add_special_tokens=False after
    # confirming how the adapters were trained.
    inputs = tokenizer(
        [formatted_prompt] * BATCH_SIZE, return_tensors="pt").to("cuda")
    prompt_width = inputs.input_ids.shape[1]

    all_outputs = []
    for _ in tqdm(range(0, NUM_TIMES_TO_GEN, BATCH_SIZE)):
        sequences = model.generate(**inputs, **generation_kwargs)
        # Keep only the newly generated tokens, then decode once.
        completions = sequences[:, prompt_width:]
        # skip_special_tokens drops EOS and the pad tokens appended to
        # sequences that finished before the longest one in the batch.
        all_outputs.extend(
            tokenizer.batch_decode(completions, skip_special_tokens=True))

    # The final batch can overshoot when BATCH_SIZE does not divide
    # NUM_TIMES_TO_GEN; trim so the sample size matches the constant.
    return all_outputs[:NUM_TIMES_TO_GEN]

In [5]:
def calc_entropy(outputs, pad_or_eos, prompt_lens):
    """Average per-token entropy of each generated sequence in a batch.

    Args:
        outputs: Result of `model.generate(..., return_dict_in_generate=True,
            output_scores=True)`. Must expose `scores` (a sequence of
            per-step `(B, V)` score tensors) and `sequences` (`(B, L_full)`
            token ids).
        pad_or_eos: Token id that marks the end of a generated sequence.
        prompt_lens: Unused; kept for backward compatibility. The generated
            region is located from the number of score steps instead, which
            stays aligned with `scores` even when prompts are padded.

    Returns:
        A list of `B` floats: for each example, the mean entropy (in nats)
        of the score distributions over its generated tokens, counting
        tokens up to and including the first `pad_or_eos` (or all steps if
        it never appears).
    """
    # Entropy of each step's distribution, computed one step at a time so the
    # full (T_max, B, V) tensor is never materialised. `.float()` keeps the
    # computation in float32; `entr` returns 0 for zero-probability entries
    # (e.g. tokens filtered to -inf), so no epsilon is needed.
    entropy_per_step = torch.stack(
        [torch.special.entr(F.softmax(step.float(), dim=-1)).sum(dim=-1)
         for step in outputs.scores],
        dim=0,
    ).cpu()                                              # (T_max, B)
    T_max = entropy_per_step.shape[0]

    # One score tensor is produced per generated token, so the generated
    # region is always the last T_max columns of `sequences`.
    sequences = outputs.sequences
    L_full = sequences.shape[1]
    generated = sequences[:, L_full - T_max:].cpu()      # (B, T_max)

    # keep[b, t] is True for every step up to and including the first
    # end marker: i.e. no end marker occurs strictly before step t.
    is_end = (generated == pad_or_eos).long()
    ends_before = is_end.cumsum(dim=1) - is_end
    keep = ends_before == 0                              # (B, T_max)
    gen_lens = keep.sum(dim=1)                           # (B,), >= 1

    # Sum entropies over the kept steps only and normalise by their count.
    kept_entropy = entropy_per_step.T.masked_fill(~keep, 0.0)
    avg_per_example = kept_entropy.sum(dim=1) / gen_lens

    return avg_per_example.tolist()

In [12]:
def generate_with_entropy(model, tokenizer):
    """Sample completions of `prompt` and measure their mean token entropy.

    The prompt is wrapped in the tokenizer's chat template, replicated
    `BATCH_SIZE` times, and sampled repeatedly with `generation_kwargs`
    while collecting per-step scores for `calc_entropy`.

    Args:
        model: Model exposing `generate(**inputs, **kwargs)`.
        tokenizer: Matching tokenizer; its `eos_token_id` is used as the
            end-of-sequence marker for the entropy calculation.

    Returns:
        `(all_outputs, all_entropies)`: two index-aligned lists of length
        `NUM_TIMES_TO_GEN` holding the decoded completions (special tokens
        stripped) and each completion's average per-token entropy.
    """
    formatted_prompt = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        tokenize=False, add_generation_prompt=True)
    # NOTE(review): if the chat template already emits a BOS token, this call
    # may prepend a second one; consider add_special_tokens=False after
    # confirming how the adapters were trained.
    inputs = tokenizer(
        [formatted_prompt] * BATCH_SIZE, return_tensors="pt").to("cuda")
    prompt_lens = (inputs["attention_mask"] == 1).sum(dim=1).tolist()
    prompt_width = inputs.input_ids.shape[1]

    all_outputs = []
    all_entropies = []
    for _ in tqdm(range(0, NUM_TIMES_TO_GEN, BATCH_SIZE)):
        outputs = model.generate(
            **inputs, **generation_kwargs,
            return_dict_in_generate=True, output_scores=True)
        all_entropies.extend(
            calc_entropy(outputs, tokenizer.eos_token_id, prompt_lens))
        # Keep only the newly generated tokens. Special tokens (EOS and the
        # padding after it) are dropped so they do not leak into text-level
        # metrics computed on the returned strings.
        completions = outputs.sequences[:, prompt_width:]
        all_outputs.extend(
            tokenizer.batch_decode(completions, skip_special_tokens=True))

    # The final batch can overshoot when BATCH_SIZE does not divide
    # NUM_TIMES_TO_GEN; trim both lists identically to keep them aligned.
    return all_outputs[:NUM_TIMES_TO_GEN], all_entropies[:NUM_TIMES_TO_GEN]

In [None]:
# For each model variant: sample completions, then report the mean/std of the
# per-completion token entropy and of the Flesch-Kincaid grade level.
entropy_per_run = []  # one list of per-completion entropies per model variant
for name in ['base', '0.1_longtrain/lora',  '1_1k_8gen/lora_1k', '1_1k_8gen/lora_2.2k']:
    model_, tokenizer_ = load_model(name)
    # all_outputs = generate(model_, tokenizer_)
    # with open(f'outputs/photosynthesis/{name}.json', 'w') as f:
    #     json.dump({'prompt': prompt, 'outputs': all_outputs}, f)
    all_outputs_, all_entropies_ = generate_with_entropy(model_, tokenizer_)
    # Store the entropies (not the generated texts) so the list matches its name.
    entropy_per_run.append(all_entropies_)
    print(f'{name}: Entropy - {np.mean(all_entropies_):0.2f} +/- {np.std(all_entropies_):0.2f}')

    scores = [textstat.flesch_kincaid_grade(r) for r in all_outputs_]
    print(f'Flesch - {np.mean(scores):0.2f} +/- {np.std(scores):0.2f}')
    print('--------------')


==((====))==  Unsloth 2025.5.9: Fast Llama patching. Transformers: 4.52.3. vLLM: 0.8.2.
   \\   /|    NVIDIA A10G. Num GPUs = 1. Max memory: 22.069 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
