In [1]:
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
# teacher_extract.py
"""
Extract last-layer hidden states for reasoning (R) and answer (A)
from a trained teacher model, and save a single pkl.
"""
import os
import json
import numpy as np
from tqdm import tqdm


In [3]:
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

def load_best_model(checkpoint_path, model_name="gpt2"):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    if os.path.exists(checkpoint_path):
        state_dict = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(state_dict)
        model = model.to(device)
        model.eval()
        print(f"✅ 模型已加载并移动到 {device}：{checkpoint_path}")
        return model
    else:
        raise FileNotFoundError(f"模型文件未找到：{checkpoint_path}")


In [4]:
model = load_best_model("/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

✅ 模型已加载并移动到 cuda：/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt


In [5]:
train_path = "../data/gsm8k_aug_train.jsonl"

with open(train_path, "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f]

def encode_ids(text: str):
    return tokenizer(text, add_special_tokens=False).input_ids



In [6]:
teacher_rs = []
teacher_as = []
q_lens = []
r_lens = []
a_lens = []

hidden_dim = model.config.hidden_size

for idx, ex in enumerate(tqdm(train_data, desc="Extracting")):
    q = ex["question"].strip()
    reasoning = ex.get("cot_steps", [])
    a = ex["answer"].strip()

    q_text = f"{q}"
    r_text = "||"
    if isinstance(reasoning, (list, tuple)):
        r_text += " ".join([step.strip() for step in reasoning])
    else:
        r_text += reasoning.strip()

    a_text = f"####{a}{tokenizer.eos_token}"

    q_ids = tokenizer(q_text, add_special_tokens=False)["input_ids"]
    r_ids = tokenizer(r_text, add_special_tokens=False)["input_ids"]
    a_ids = tokenizer(a_text, add_special_tokens=False)["input_ids"]
    
    qn, rn, an = len(q_ids), len(r_ids), len(a_ids)

    full_ids = q_ids + r_ids + a_ids
    enc = torch.tensor([full_ids], dtype=torch.long, device=device)

    with torch.no_grad():
        out = model(enc, output_hidden_states=True)
        hs = out.hidden_states[-1][0]  # (seq_len, hidden_dim)
        
        assert hs.size(0) == qn + rn + an, f"Token length mismatch at idx {idx}"

    r_vec = hs[qn-1:qn-1+rn].cpu().numpy() if rn > 0 else np.zeros((0, hidden_dim,), dtype=np.float32)
    a_vec = hs[qn-1+rn:qn-1+rn+an].cpu().numpy() if an > 0 else np.zeros((0, hidden_dim,), dtype=np.float32)
    teacher_rs.append(r_vec.astype(np.float32))
    teacher_as.append(a_vec.astype(np.float32))

    q_lens.append(qn)
    r_lens.append(rn)
    a_lens.append(an)


Extracting:   0%|          | 0/385620 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Extracting: 100%|██████████| 385620/385620 [54:36<00:00, 117.71it/s] 


In [7]:
import pickle

# 要保存的数据
data_to_save = {
    'teacher_rs': teacher_rs,
    'teacher_as': teacher_as,
    'q_lens': q_lens,
    'r_lens': r_lens,
    'a_lens': a_lens
}

OUT_PKL = "/run/determined/NAS1/public/chengjintao/saved_teacher_hiddens/0824_teacher_hidden_states.pkl"

with open(OUT_PKL, 'wb') as f:
    pickle.dump(data_to_save, f)

print(f"Saved data to {OUT_PKL}")

Saved data to /run/determined/NAS1/public/chengjintao/saved_teacher_hiddens/0824_teacher_hidden_states.pkl


In [1]:
# teacher_extract_sample.py
import os
import json
import numpy as np
import pickle
from tqdm import tqdm
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from transformers import GPT2LMHeadModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

def load_best_model(checkpoint_path, model_name="gpt2"):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    if os.path.exists(checkpoint_path):
        map_location = "cuda" if torch.cuda.is_available() else "cpu"
        state_dict = torch.load(checkpoint_path, map_location=map_location)
        model.load_state_dict(state_dict)
        model = model.to(device)
        model.eval()
        print(f"✅ 模型已加载并移动到 {device}：{checkpoint_path}")
        return model
    else:
        raise FileNotFoundError(f"模型文件未找到：{checkpoint_path}")

model = load_best_model(
    "/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt"
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

train_path = "../data/gsm8k_aug_train.jsonl"
with open(train_path, "r", encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f]

# 保存的结果
saved_tokens = []   # 正确答案对应的 token ids
saved_hiddens = []  # 正确答案对应的 last hidden states
meta_info = []      # 保存 question idx, sample idx 等元信息

num_samples = 20
temperature = 1.2
hidden_dim = model.config.hidden_size

for idx, ex in enumerate(tqdm(train_data, desc="Sampling")):
    q = ex["question"].strip()
    gt_answer = f"####{ex['answer'].strip()}{tokenizer.eos_token}"

    # prompt = 问题
    q_ids = tokenizer(q, return_tensors="pt").input_ids.to(device)

    # 一次性并行采样 20 个序列
    with torch.no_grad():
        gen_ids = model.generate(
            q_ids.repeat(num_samples, 1),  # [20, q_len]
            do_sample=True,
            temperature=temperature,
            top_p=0.95,
            max_new_tokens=128,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )  # shape [20, seq_len]

    # 解码结果
    decoded = tokenizer.batch_decode(gen_ids[:, q_ids.size(1):], skip_special_tokens=True)

    # 遍历每个采样，保留正确答案的 hidden states
    for s_idx, (seq_ids, text) in enumerate(zip(gen_ids, decoded)):
        text = f"{text}{tokenizer.eos_token}"
        if gt_answer in text:  # 粗暴匹配答案正确
            with torch.no_grad():
                out = model(seq_ids.unsqueeze(0), output_hidden_states=True)
                hs = out.hidden_states[-1][0].cpu().numpy()  # (seq_len, hidden_dim)

            saved_tokens.append(seq_ids.cpu().numpy())
            saved_hiddens.append(hs.astype(np.float32))
            meta_info.append({"q_idx": idx, "sample_idx": s_idx, "answer": gt_answer})

# 保存
OUT_PKL = "/run/determined/NAS1/public/chengjintao/saved_teacher_hiddens/0824_teacher_autoregressive_sampling.pkl"
data_to_save = {
    "tokens": saved_tokens,
    "hiddens": saved_hiddens,
    "meta": meta_info,
}

with open(OUT_PKL, "wb") as f:
    pickle.dump(data_to_save, f)

print(f"✅ Saved sampled results to {OUT_PKL}")


✅ 模型已加载并移动到 cuda：/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt


Sampling:   0%|          | 0/385620 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Sampling:   0%|          | 3/385620 [01:11<2559:01:51, 23.89s/it]


KeyboardInterrupt: 