In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


INFO 07-07 21:09:49 __init__.py:190] Automatically detected platform cuda.


2025-07-07 21:09:49,409	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
MODEL_NAME = "/home/edwardhu/workspace/shuozhe/open-rs/data/Qwen2.5-1.5B_noKL_new/checkpoint-500"
# MODEL_NAME = "/nfs/shuozhe/saved_model/Qwen2.5-1.5B"
NUM_GPUS = 4
SYSTEM_PROMPT = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer, and put your final answer within \\boxed{{}} . The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Note that respond by English, NOT use other languages."""

# ------------ 1.  shared tokenizer ---------------
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# ------------ 2.  inference engine (vLLM) --------
llm = LLM(
    model=MODEL_NAME,
    dtype="bfloat16",
    tensor_parallel_size=NUM_GPUS,
    gpu_memory_utilization=0.7,
    trust_remote_code=True,
)

sampler = SamplingParams(
    temperature=0.7,
    max_tokens=1024,
    # any other decoding knobs you need
)

INFO 07-07 21:09:58 config.py:542] This model supports multiple tasks: {'classify', 'reward', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 07-07 21:09:58 config.py:1401] Defaulting to use mp for distributed inference
INFO 07-07 21:09:58 config.py:1556] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 07-07 21:09:58 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/home/edwardhu/workspace/shuozhe/open-rs/data/Qwen2.5-1.5B_noKL_new/checkpoint-500', speculative_config=None, tokenizer='/home/edwardhu/workspace/shuozhe/open-rs/data/Qwen2.5-1.5B_noKL_new/checkpoint-500', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dty

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.39it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  4.37it/s]



[1;36m(VllmWorkerProcess pid=3203788)[0;0m INFO 07-07 21:10:03 model_runner.py:1115] Loading model weights took 0.7894 GB
INFO 07-07 21:10:03 model_runner.py:1115] Loading model weights took 0.7894 GB
[1;36m(VllmWorkerProcess pid=3203780)[0;0m INFO 07-07 21:10:03 model_runner.py:1115] Loading model weights took 0.7894 GB
[1;36m(VllmWorkerProcess pid=3203783)[0;0m INFO 07-07 21:10:03 model_runner.py:1115] Loading model weights took 0.7894 GB
[1;36m(VllmWorkerProcess pid=3203788)[0;0m INFO 07-07 21:10:09 worker.py:267] Memory profiling takes 5.79 seconds
[1;36m(VllmWorkerProcess pid=3203788)[0;0m INFO 07-07 21:10:09 worker.py:267] the current vLLM instance can use total_gpu_memory (79.15GiB) x gpu_memory_utilization (0.70) = 55.41GiB
[1;36m(VllmWorkerProcess pid=3203788)[0;0m INFO 07-07 21:10:09 worker.py:267] model weights take 0.79GiB; non_torch_memory takes 0.35GiB; PyTorch activation peak memory takes 0.06GiB; the rest of the memory reserved for KV Cache is 54.20GiB.
[1;

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:17<00:00,  2.01it/s]

[1;36m(VllmWorkerProcess pid=3203780)[0;0m INFO 07-07 21:10:31 model_runner.py:1562] Graph capturing finished in 17 secs, took 1.06 GiB
[1;36m(VllmWorkerProcess pid=3203788)[0;0m [1;36m(VllmWorkerProcess pid=3203783)[0;0m INFO 07-07 21:10:31 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 27.66 seconds
INFO 07-07 21:10:31 model_runner.py:1562] Graph capturing finished in 17 secs, took 1.06 GiB
INFO 07-07 21:10:31 model_runner.py:1562] Graph capturing finished in 17 secs, took 1.06 GiB
INFO 07-07 21:10:31 model_runner.py:1562] Graph capturing finished in 17 secs, took 1.06 GiB





In [3]:
# ------------ 1. Prompt builder -------------------
def build_chat_prompt(user_msg: str) -> str:
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user",   "content": user_msg},
    ]
    return tok.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# ------------ 2. Entropy Helper -------------------
class EntropyMeter:
    def __init__(self, model_name: str, dtype=torch.bfloat16):
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map="auto",      # Adjust if needed: {"": "cpu"} for safety
            trust_remote_code=True
        )
        self.pad_id = tok.pad_token_id or tok.eos_token_id

    @torch.inference_mode()
    def entropy_for_batch(
        self,
        prompts: List[str],
        completions: List[str],
    ) -> Dict[str, torch.Tensor]:
        """Return per-token entropy (B, C) and mean entropy (B,)"""
        full_texts = [p + c for p, c in zip(prompts, completions)]
        enc = tok(full_texts, return_tensors="pt", padding=True).to(self.model.device)
        input_ids, attn_mask = enc["input_ids"], enc["attention_mask"]

        # forward
        logits = self.model(input_ids=input_ids, attention_mask=attn_mask).logits
        probs = torch.softmax(logits, dim=-1)
        log_probs = torch.log_softmax(logits, dim=-1)
        token_entropy = -(probs * log_probs).sum(-1)      # (B, seq_len)

        # ── isolate only the *completion* tokens ──────────────────────────────
        prompt_lens = [len(tok(p)["input_ids"]) for p in prompts]
        mask = torch.zeros_like(token_entropy, dtype=torch.bool)
        for row, plen in enumerate(prompt_lens):
            mask[row, plen:] = True                # True → completion positions

        # keep only completion-token entropies
        token_entropy = token_entropy.masked_fill(~mask, float("nan"))  # or 0.0

        mean_entropy = (
            token_entropy.nan_to_num(0.0).sum(-1) / mask.sum(-1).clamp(min=1)
        )   # (B,)


        return {
            "token_entropy": token_entropy,
            "mean_entropy":  mean_entropy,
        }

# ------------ 3. Main Generation + Entropy -------------------
def generate_with_entropy_one_by_one(user_prompts: List[str]):
    # 3-a build chat-formatted prompts
    chat_prompts = [build_chat_prompt(u) for u in user_prompts]

    # 3-b generate completions (batched via vLLM)
    outs = llm.generate(chat_prompts, sampling_params=sampler)
    completions = [o.outputs[0].text for o in outs]

    # 3-c compute entropy one-by-one to avoid GPU OOM
    meter = EntropyMeter(MODEL_NAME)
    token_entropies, mean_entropies = [], []

    for prompt, completion in zip(chat_prompts, completions):
        ent = meter.entropy_for_batch([prompt], [completion])
        token_entropies.append(ent["token_entropy"][0])   # strip batch dim
        mean_entropies.append(ent["mean_entropy"][0])
        del ent
        # torch.cuda.empty_cache()  # optional but helps avoid buildup

    return {
        "token_entropy": token_entropies,                # List[Tensor(seq_len)]
        "mean_entropy": torch.stack(mean_entropies),     # Tensor(B,)
    }

In [None]:
# batch = [
#         "Where are all the aliens located?",
#         "What is the full name of the person who invented invisible unicorns?",
#     ]

import json
import random

# Load the dataset
with open("SelfAware.json", "r") as f:
    data = json.load(f)

# Extract the list of questions
examples = data["example"]

# Filter for unanswerable questions
unanswerable_questions = [item["question"] for item in examples if not item.get("answerable", True)]

# set random seed for reproducibility
random.seed(70)

# Randomly select 10
sample_questions = random.sample(unanswerable_questions, 100)

# Format them for use
batch = sample_questions

result = generate_with_entropy_one_by_one(batch)
print(result["mean_entropy"].mean())  # Print the mean entropy for each prompt

Processed prompts: 100%|██████████| 100/100 [00:10<00:00,  9.58it/s, est. speed input: 1295.64 toks/s, output: 4207.84 toks/s]


tensor(0.8438, device='cuda:0', dtype=torch.bfloat16)


: 

<!-- DS-qwen1.5b tensor(1.2578, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2578, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2500, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2656, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2656, device='cuda:0', dtype=torch.bfloat16) -->



<!-- tensor(1.2109, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2500, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2266, device='cuda:0', dtype=torch.bfloat16) -->
<!-- tensor(1.2500, device='cuda:0', dtype=torch.bfloat16) -->


In [10]:
# batch = [
#         "Where are all the aliens located?",
#         "What is the full name of the person who invented invisible unicorns?",
#     ]

import json
import random

# Load the dataset
with open("SelfAware.json", "r") as f:
    data = json.load(f)

# Extract the list of questions
examples = data["example"]

# Filter for unanswerable questions
unanswerable_questions = [item["question"] for item in examples if not item.get("answerable", True)]

# set random seed for reproducibility
random.seed(42)

# Randomly select 10
sample_questions = random.sample(unanswerable_questions, 10)

# Format them for use
batch = sample_questions

result = generate_with_entropy(batch)
print(result["mean_entropy"].mean())  # Print the mean entropy for each prompt

Processed prompts: 100%|██████████| 10/10 [00:04<00:00,  2.44it/s, est. speed input: 294.27 toks/s, output: 1126.15 toks/s]


tensor(1.6250, dtype=torch.bfloat16)
