In [None]:
# ✅ Fixed and updated deepseek-math-7b-instruct inference script (with attention_mask support)

import os
import json
import re
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from math import log2
from google.colab import drive

# Report which CUDA runtime and how many GPUs PyTorch can see before any
# model is loaded.
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("GPU device count:", torch.cuda.device_count())


CUDA available: True
CUDA version: 12.4
GPU device count: 1


In [None]:

# Mount Google Drive at /content/drive; the dataset and output paths
# configured in this notebook live underneath that mount point.
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:

# Slice of the dataset handled by this run, as 0-based line numbers of the
# input JSONL file.
# NOTE(review): load_dataset keeps lines with start <= i < end, so this
# setting covers rows 901..949 (49 rows) even though the tag reads
# "901-950" -- confirm adjacent chunks do not leave a row unprocessed.
start_index = 901
end_index = 950
# Suffix embedded in the output file name so each chunk gets its own file.
range_tag = f"{start_index}-{end_index}"

# Project root on the mounted Google Drive.
BASE_PATH = "/content/drive/MyDrive/Cluster-proj"
# Input questions, one JSON object per line.
INPUT_PATH = f"{BASE_PATH}/dataset/openai-gsm8k/train.jsonl"
# Destination JSON file for this chunk's generations and token statistics.
OUTPUT_PATH = f"{BASE_PATH}/output/llm_steps/whole_logits/deepseek-math-7b-gsm-{range_tag}.json"


In [None]:

def load_dataset(path, start, end):
    """Load a contiguous slice of records from a JSON Lines file.

    Args:
        path: Path to a file holding one JSON document per line.
        start: 0-based index of the first line to keep (inclusive).
        end: 0-based index at which to stop (exclusive).

    Returns:
        list: The parsed JSON value of every line ``i`` with
        ``start <= i < end``, in file order. Empty when the range is empty
        or lies entirely past the end of the file.

    Raises:
        json.JSONDecodeError: If a line inside the range is not valid JSON.
    """
    subset = []
    # Decode explicitly as UTF-8 so the parsed text does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= end:
                # Everything from here on is out of range; stop reading.
                break
            if i >= start:
                subset.append(json.loads(line))
    return subset

def load_model_and_tokenizer(model_name):
    """Load a causal language model and its tokenizer, ready for generation.

    Args:
        model_name: Identifier handed unchanged to every ``from_pretrained``
            loader used here.

    Returns:
        tuple: ``(model, tokenizer)``. The model is loaded in bfloat16 with
        ``device_map="auto"`` and put in eval mode. Its generation config is
        reloaded from ``model_name`` with the pad token id set to the EOS
        token id, and the tokenizer's pad token is set to its EOS token.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token on the tokenizer side.
    tokenizer.pad_token = tokenizer.eos_token

    load_kwargs = {"device_map": "auto", "torch_dtype": torch.bfloat16}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    model = model.eval()

    # Mirror the same pad == EOS choice in the generation config.
    gen_config = GenerationConfig.from_pretrained(model_name)
    gen_config.pad_token_id = gen_config.eos_token_id
    model.generation_config = gen_config

    return model, tokenizer

def top_k_logits_and_softmax(logits_tensor, k=3):
    """Summarise the ``k`` highest-scoring entries of a logits vector.

    Args:
        logits_tensor: 1-D tensor of logits. A tensor with more dimensions
            has its leading axis squeezed (only effective when that axis
            has size 1).
        k: Number of top entries to report. Clamped to the length of the
            vector so that short vectors do not make ``torch.topk`` raise.

    Returns:
        dict with keys:
            ``indices``: positions of the top-k logits, highest first.
            ``logits``: the corresponding logit values.
            ``softmax``: their probabilities under a softmax over the
                *whole* vector (not renormalised over the top-k).
            ``entropy``: ``-sum(p * log2(p))`` over the returned top-k
                probabilities only, i.e. a truncated entropy in bits.
            ``information_content``: ``-log2(p)`` per returned entry, with
                ``0.0`` substituted when ``p`` is exactly zero.
    """
    if logits_tensor.ndim > 1:
        logits_tensor = logits_tensor.squeeze(0)

    # Work in float32: half/bfloat16 inputs would otherwise yield
    # probabilities with only a few significant digits. The upcast is exact,
    # so the reported logit values are unchanged.
    logits_tensor = logits_tensor.float()

    # torch.topk raises if k exceeds the size of the dimension.
    k = min(k, logits_tensor.shape[-1])

    topk = torch.topk(logits_tensor, k)
    probs = F.softmax(logits_tensor, dim=-1)

    topk_indices = topk.indices.cpu().tolist()
    topk_logits = topk.values.cpu().tolist()
    topk_softmax = probs[topk.indices].cpu().tolist()

    # Zero-probability entries are skipped (entropy) or mapped to 0.0
    # (information content) so that log2(0) is never evaluated.
    entropy = -sum(p * log2(p) for p in topk_softmax if p > 0)
    info_list = [-log2(p) if p > 0 else 0.0 for p in topk_softmax]
    return {
        "indices": topk_indices,
        "logits": topk_logits,
        "softmax": topk_softmax,
        "entropy": entropy,
        "information_content": info_list
    }



def generate_with_topk_logits(model, tokenizer, question):
    """Sample one answer for ``question`` and collect per-token statistics.

    The question is wrapped in a single user chat message, one completion of
    up to 512 new tokens is sampled, and the prompt plus completion is then
    run through the model once more to read out last-layer hidden states.

    Args:
        model: Causal LM exposing ``generate``, ``device`` and a forward call
            that accepts ``output_hidden_states=True``.
        tokenizer: Matching tokenizer providing ``apply_chat_template``,
            ``convert_ids_to_tokens``, ``convert_tokens_to_string`` and
            ``decode``.
        question: Question text inserted into the prompt.

    Returns:
        tuple: ``(answer, token_level)`` where ``answer`` is the decoded
        completion (special tokens skipped, stripped) and ``token_level`` is
        a list with one dict per generated token:
            ``token``: the token's stripped string form.
            ``chosen_prob``: softmax probability of the sampled token under
                that step's ``scores``.
            ``information_content``: ``-log2(chosen_prob)`` (0.0 if the
                probability is zero).
            ``topk_info``: output of ``top_k_logits_and_softmax`` with k=3.
            ``hidden_vector``: last-layer hidden state at the generated
                token's own position in the full sequence (the state
                computed with that token as input, not the state that
                predicted it).

    NOTE(review): per the transformers documentation, ``scores`` returned by
    ``generate`` are the processed scores, i.e. after the temperature /
    top-k / top-p settings passed below, so the probabilities describe the
    sampling distribution rather than the raw model distribution -- confirm
    that this is what downstream analysis expects.
    """
    messages = [{
        "role": "user",
        "content": f"What is the answer to the following math question with detailed reasoning? Please use <steps> and <final_result> tags.\n\n{question}"
    }]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(model.device)
    # A single, unpadded prompt: every position is a real token. Deriving the
    # mask from `inputs != pad_token_id` is unsafe when the pad token is the
    # same id as EOS, because genuine EOS tokens would be masked out.
    attention_mask = torch.ones_like(inputs)
    input_len = inputs.shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            attention_mask=attention_mask,
            max_new_tokens=512,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Keep only the newly generated part of the (single) returned sequence.
    generated_ids = outputs.sequences[0][input_len:]
    raw_tokens = tokenizer.convert_ids_to_tokens(generated_ids)
    clean_tokens = [tokenizer.convert_tokens_to_string([tok]).strip() for tok in raw_tokens]

    with torch.no_grad():
        full_inputs = torch.cat([inputs, generated_ids.unsqueeze(0)], dim=1)
        # No padding anywhere in this sequence, so attend to every position,
        # including the trailing EOS token when generation stopped on it.
        full_attention = torch.ones_like(full_inputs)
        full_outputs = model(
            input_ids=full_inputs,
            attention_mask=full_attention,
            output_hidden_states=True,
            return_dict=True
        )
        # Last layer, first (only) batch element, generated positions only.
        # Move to host once instead of once per token.
        gen_hidden_states = full_outputs.hidden_states[-1][0][input_len:].cpu()

    chosen_ids = generated_ids.tolist()

    token_level = []
    for i, token in enumerate(clean_tokens):
        logits_tensor = outputs.scores[i][0]
        topk_info = top_k_logits_and_softmax(logits_tensor, k=3)
        # Softmax in float32 so low-precision scores do not truncate the
        # probability of the sampled token.
        step_probs = F.softmax(logits_tensor.float(), dim=-1)
        chosen_prob = float(step_probs[chosen_ids[i]])
        info_content = -log2(chosen_prob) if chosen_prob > 0 else 0.0
        token_level.append({
            "token": token,
            "chosen_prob": chosen_prob,
            "information_content": info_content,
            "topk_info": topk_info,
            "hidden_vector": gen_hidden_states[i].tolist()
        })

    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return answer, token_level

def extract_final_result(text):
    """Extract the model's final answer from generated text.

    Two formats are tried in order and the first hit wins:

    1. ``$\\boxed{...}$`` (also with a doubled backslash), tolerating
       whitespace around the pieces; matched on a single line.
    2. ``<final_result>...</final_result>``, which may span several lines.

    Args:
        text: Generated answer text to search.

    Returns:
        The captured content with surrounding whitespace stripped, or
        ``None`` when neither format is present.
    """
    candidate_patterns = (
        # Boxed LaTeX answer wrapped in dollar signs.
        (r"\$\s*\\\\?boxed\s*{(.*?)}\s*\$", 0),
        # Fallback: explicit <final_result> tags.
        (r"<final_result>(.*?)</final_result>", re.DOTALL),
    )
    for pattern, flags in candidate_patterns:
        found = re.search(pattern, text, flags)
        if found is not None:
            return found.group(1).strip()
    return None


def extract_true_final_result(ans_str):
    """Pull the reference answer that follows the ``####`` marker.

    Args:
        ans_str: Reference answer text to search.

    Returns:
        The first run of non-whitespace characters after ``####`` (optional
        whitespace in between), or ``None`` if no such marker is found.
    """
    found = re.search(r'####\s*(\S+)', ans_str)
    if found is None:
        return None
    return found.group(1).strip()

def run_generation_pipeline(start_index, end_index, input_path, output_path, model_name, model, tokenizer, num_samples=3):
    """Generate sampled answers for a slice of the dataset and save them.

    For every question in the slice, ``num_samples`` independent completions
    are generated and stored together with their per-token statistics and
    the extracted final result. Everything is written to ``output_path`` as
    one JSON document at the end.

    Args:
        start_index: First dataset row to process (0-based, inclusive).
        end_index: Row at which to stop (exclusive).
        input_path: JSONL file with ``question`` and ``answer`` fields.
        output_path: Destination JSON file; its parent directory is created
            if it does not exist.
        model_name: Unused here; kept so existing callers keep working.
        model: Loaded causal LM passed on to ``generate_with_topk_logits``.
        tokenizer: Matching tokenizer.
        num_samples: Completions to sample per question (default 3, stored
            under ``sampling0`` .. ``sampling{num_samples - 1}``).

    Returns:
        None. Results are written to ``output_path``.
    """
    # Create the output directory up front so a missing folder is reported
    # before any generation, not after the whole run has finished.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    dataset = load_dataset(input_path, start_index, end_index)
    results = {}

    for idx, item in enumerate(dataset):
        global_id = start_index + idx
        qid = f"q_{global_id}"
        question = item["question"]
        true_whole_answer = item.get("answer", "")
        true_final_result = extract_true_final_result(true_whole_answer)
        results[qid] = {
            "question": question,
            "true_whole_answer": true_whole_answer,
            "true_final_result": true_final_result
        }

        for i in range(num_samples):
            try:
                full_answer, token_probs = generate_with_topk_logits(model, tokenizer, question)
                final_result = extract_final_result(full_answer)
                results[qid][f"sampling{i}"] = {
                    "whole_answer": full_answer,
                    "token_probs": token_probs,
                    "final_result": final_result
                }
            except Exception as e:
                # Best effort: log the failure and move on to the next sample
                # so one bad generation does not abort the whole chunk.
                print(f"[ERROR] Failed on {qid} sampling{i}: {e}")

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be fixed explicitly rather than left to the locale.
    # NOTE(review): json.dump emits non-finite floats as Infinity/NaN tokens
    # by default; if any stored logit is -inf, strict JSON parsers will
    # reject the file -- confirm that consumers tolerate this.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved: {output_path}")


In [None]:
# ✅ Main entry point: load the model and tokenizer once so that later cells
# can reuse them without reloading.
model_name = "deepseek-ai/deepseek-math-7b-instruct"
model, tokenizer = load_model_and_tokenizer(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

In [None]:

# Run generation for the configured slice, reusing the already-loaded model
# and tokenizer, and write the results to OUTPUT_PATH.
run_generation_pipeline(
    start_index=start_index,
    end_index=end_index,
    input_path=INPUT_PATH,
    output_path=OUTPUT_PATH,
    model_name=model_name,
    model=model,
    tokenizer=tokenizer
)

✅ Saved: /content/drive/MyDrive/Cluster-proj/output/llm_steps/whole_logits/deepseek-math-7b-gsm-901-950.json
