In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import random
import gc



In [None]:
import re
from collections import defaultdict

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path and the output path
# used elsewhere in this notebook both live under /content/drive/MyDrive.
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
import json
# Location of the input JSONL file on the mounted Drive.
# NOTE(review): the path suggests this is the GSM8K training split — confirm.
file_path = "/content/drive/MyDrive/Cluster-proj/dataset/openai-gsm8k/train.jsonl"

In [None]:
# Parse the JSONL file into a list of records, one per non-empty line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default text encoding.
dataset = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip stray blank lines, which json.loads would reject.
        if line.strip():
            dataset.append(json.loads(line))

In [None]:
# Checkpoint used for every generation in this notebook.
# Alternatives that were left commented out in this cell:
#   model_name = "meta-llama/Llama-2-13b-hf"
#       (with: tokenizer = LlamaTokenizer.from_pretrained(model_name, trust_remote_code=True))
#   model_name = 'google/gemma-3-27b-it'
model_name = "deepseek-ai/deepseek-llm-7b-chat"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the weights as float16 with device_map="cuda".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="cuda",
    trust_remote_code=True,
)
# Put the module in evaluation mode; eval() returns the module itself, so this
# is equivalent to chaining it onto the load call.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.97G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.85G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.6k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [None]:
# Sanity-check the runtime: report whether CUDA is visible, the name of GPU 0,
# and the device that holds the model's first parameter.
print("CUDA available:", torch.cuda.is_available())
print("CUDA device name:", torch.cuda.get_device_name(0))
print("Model loaded to:", next(model.parameters()).device)

In [None]:
# Work on the records at dataset indices 401 through 600 (the slice end is
# exclusive), i.e. at most 200 examples.
samples = dataset[401:601]

In [None]:
def generate_math_with_logits(question, max_new_tokens=1000, temperature=0.7, top_k=50, top_p=0.95):
    """Sample one step-by-step answer to ``question`` and record per-token probabilities.

    The question is wrapped in a fixed instruction prompt, tokenized with the
    module-level ``tokenizer`` and continued by sampling from the module-level
    ``model``.

    Args:
        question: Question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default is deliberately larger than the 400-token limit stated in
            the prompt so that answers are not cut off.
        temperature: Sampling temperature passed to ``model.generate``.
        top_k: Top-k cutoff passed to ``model.generate``.
        top_p: Nucleus (top-p) cutoff passed to ``model.generate``.

    Returns:
        A tuple ``(answer, token_probs)``:
        ``answer`` is the decoded continuation (special tokens skipped,
        surrounding whitespace stripped); ``token_probs`` is a list of
        ``(token_text, probability)`` pairs, one per generated token, where
        ``token_text`` is the whitespace-stripped string form of the token.

    Note:
        The probabilities are computed from ``output.scores``. Per the
        transformers generation docs these are the *processed* scores, i.e.
        after temperature / top-k / top-p have been applied, so each value is
        the probability under the distribution that was actually sampled from,
        not under the raw model logits.
    """
    prompt = (
        "Answer the following math question with clear, step-by-step reasoning.\n"
        "Each step must start with <step1>, <step2>, etc., and explain one logical operation.\n"
        "Enclose the final answer in <final_result> tags. It must be concise — a number or a few words only, not a full sentence.\n"
        "Keep the full response under 400 tokens.\n\n"
        f"Question: {question}\n"
        "Answer:\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(model.device)

    gen_args = {
        "max_new_tokens": max_new_tokens,
        "return_dict_in_generate": True,
        "output_scores": True,
        # EOS is reused as the padding token for generation.
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "do_sample": True,
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
    }

    output = model.generate(**inputs, **gen_args)

    # Keep only the continuation: drop the prompt tokens from the front.
    input_len = inputs["input_ids"].shape[-1]
    generated_ids = output.sequences[0][input_len:]

    raw_tokens = tokenizer.convert_ids_to_tokens(generated_ids)
    clean_tokens = [tokenizer.convert_tokens_to_string([tok]).strip() for tok in raw_tokens]

    # Pick out the sampled token's probability at each step on the device and
    # transfer only those scalars, instead of copying one full-vocabulary
    # distribution per generated token to the CPU. The scores are upcast to
    # float32 first so a half-precision model does not round the softmax.
    chosen = [
        F.softmax(score[0].float(), dim=-1)[tok_id]
        for score, tok_id in zip(output.scores, generated_ids)
    ]
    chosen_probs = torch.stack(chosen).tolist() if chosen else []

    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return answer, list(zip(clean_tokens, chosen_probs))


In [None]:
def run_math_with_logits(samples):
    """Generate three sampled answers for every item in ``samples``.

    Args:
        samples: Sequence of dicts, each with a ``"question"`` key and an
            optional ``"answer"`` key.

    Returns:
        A dict keyed by ``"q_<index>"`` (the item's position in ``samples``).
        Each value holds the ``"question"``, the reference answer under
        ``"true_ans"`` (empty string when missing) and, for every attempt that
        succeeded, a ``"sampling<i>"`` entry with ``"answer"`` and
        ``"token_probs"``. A failed attempt is reported on stdout and skipped.
    """
    collected = {}
    for position, record in enumerate(samples):
        # The position in `samples` doubles as a unique ID.
        key = f"q_{position}"
        prompt_text = record["question"]

        # Seed the entry with the question and its reference answer.
        entry = {
            "question": prompt_text,
            "true_ans": record.get("answer", ""),
        }
        collected[key] = entry

        for attempt in range(3):
            try:
                text, probs = generate_math_with_logits(prompt_text)
            except Exception as e:
                print(f"[ERROR] Failed to generate sampling{attempt} for qid={key}: {e}")
                continue
            entry[f"sampling{attempt}"] = {
                "answer": text,
                "token_probs": probs,
            }

    return collected




In [None]:
# Generate the sampled answers and per-token probabilities for every selected question.
results = run_math_with_logits(samples)

In [None]:
import json
import re
from collections import defaultdict

def extract_steps_from_token_probs(token_probs, case_sensitive=False):
    """Group ``(token, prob)`` pairs by the reasoning step they belong to.

    A step marker is the keyword token ``Step`` followed, within the next
    three tokens, by a token made only of digits. Consecutive digit tokens are
    joined, so a number that the tokenizer split into single digits
    (``"1", "0"``) is read as step 10 rather than step 1. The keyword, the
    number and any tokens between them are not recorded; every later token is
    appended to that step until the next marker.

    Args:
        token_probs: Sequence of ``(token_text, probability)`` pairs.
        case_sensitive: When False (default) the keyword is matched
            case-insensitively, so the ``step`` inside a ``<step1>`` tag (the
            format requested by the prompt in this notebook) is recognised as
            well as ``Step 1``. Pass True to match only the exact token
            ``"Step"``.

    Returns:
        A dict mapping step number (int) to a list of
        ``{"token": ..., "prob": ...}`` dicts in generation order. Tokens seen
        before the first marker are dropped, as is a keyword token that has no
        number within the lookahead window.
    """
    steps = defaultdict(list)
    current_step = None
    total = len(token_probs)
    i = 0
    while i < total:
        token, prob = token_probs[i]
        if case_sensitive:
            is_keyword = token == "Step"
        else:
            is_keyword = token.lower() == "step"

        if is_keyword:
            # Look for the first all-digit token among the next three tokens.
            number_at = None
            for j in range(1, 4):
                if i + j < total and re.fullmatch(r"\d+", token_probs[i + j][0]):
                    number_at = i + j
                    break
            if number_at is None:
                # Keyword without a nearby number: not a marker, skip the token.
                i += 1
                continue

            # Join any digit tokens that directly follow, so multi-digit step
            # numbers survive tokenizers that emit one token per digit.
            digits = token_probs[number_at][0]
            i = number_at + 1
            while i < total and re.fullmatch(r"\d+", token_probs[i][0]):
                digits += token_probs[i][0]
                i += 1
            current_step = int(digits)
            continue

        if current_step is not None:
            steps[current_step].append({"token": token, "prob": prob})
        i += 1
    return dict(steps)



In [None]:
# Attach a per-step breakdown of token probabilities to every sampled answer.
for example_id, example in results.items():
    for i in range(3):
        mode = f"sampling{i}"
        # A sample is absent when its generation raised and was skipped.
        if mode not in example or "token_probs" not in example[mode]:
            continue
        token_probs = example[mode]["token_probs"]
        steps = extract_steps_from_token_probs(token_probs)
        example[f"{mode}_step_token_probs"] = steps


In [None]:
# Persist all results as indented JSON on Drive.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of being left to the platform default.
# Note: json.dump converts the integer step keys to strings in the file.
with open("/content/drive/MyDrive/Cluster-proj/output/deepseek7b-math-401-600.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)