In [1]:
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [2]:
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

def load_best_model(checkpoint_path, model_name="gpt2"):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    if os.path.exists(checkpoint_path):
        state_dict = torch.load(checkpoint_path, map_location='cpu')
        model.load_state_dict(state_dict)
        model = model.to(device)
        model.eval()
        print(f"✅ 模型已加载并移动到 {device}：{checkpoint_path}")
        return model
    else:
        raise FileNotFoundError(f"模型文件未找到：{checkpoint_path}")
    
model = load_best_model("/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt")

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id


✅ 模型已加载并移动到 cuda：/run/determined/NAS1/public/chengjintao/teacher_checkpoints/0824_teacher_best_model.pt


In [3]:
question = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)

with torch.no_grad():
    output_ids = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=100,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

answer = tokenizer.decode(output_ids[0], skip_special_tokens=False)
print("模型输出：")
print(answer)

模型输出：
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?||<<48/2=24>> <<48+24=72>>####72<|endoftext|>


In [4]:
import json
import re
from tqdm import tqdm

test_path = "../data/gsm8k_aug_train.jsonl"

with open(test_path, "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f]


def extract_answer(generated_text):

    match = re.search(r'####(.+)<', generated_text)

    if match:
        return match.group(1).strip()
    else:
        return None  # 或抛出异常


generate_kwargs = dict(
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=False,
    # top_p=0.95,
    # temperature=0.7,
)

correct = 0

for i, item in enumerate(tqdm(test_data, desc="Evaluating")):
    question = item["question"]
    gold = item["answer"]

    prompt = f"{question}"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generate_kwargs)

    text = tokenizer.decode(outputs[0], skip_special_tokens=False)
    pred = extract_answer(text)

    if pred == gold:
        correct += 1

    # 打印前5条用于调试
    if i < 5:
        print("="*50)
        print(f"Question: {question}")
        print(f"Gold Answer: {gold}")
        print(f"Model Output:\n{text}")
        print(f"Predicted Answer: {pred}")

# 统计准确率
acc = 100 * correct / len(test_data)
print(f"Accuracy: {acc:.2f}% ({correct}/{len(test_data)})")


Evaluating:   0%|          | 2/385620 [00:00<28:44:40,  3.73it/s]

Question: Out of 600 employees in a company, 30% got promoted while 10% received bonus. How many employees did not get either a promotion or a bonus?
Gold Answer: 360
Model Output:
Out of 600 employees in a company, 30% got promoted while 10% received bonus. How many employees did not get either a promotion or a bonus?||<<600*30/100=180>> <<600*10/100=60>> <<180+60=240>> <<600-240=360>>####360<|endoftext|>
Predicted Answer: 360
Question: Rebecca sets aside 20% of her weekly salary to savings. If she manages to save $150 during a week, how much does she earn?
Gold Answer: 750
Model Output:
Rebecca sets aside 20% of her weekly salary to savings. If she manages to save $150 during a week, how much does she earn?||<<150/0.2=750>>####750<|endoftext|>
Predicted Answer: 750


Evaluating:   0%|          | 4/385620 [00:00<19:44:59,  5.42it/s]

Question: There are 20 students in a class. Each student has 5 pencils. How many pencils are there in total?
Gold Answer: 100
Model Output:
There are 20 students in a class. Each student has 5 pencils. How many pencils are there in total?||<<20*5=100>>####100<|endoftext|>
Predicted Answer: 100
Question: Stephanie is creating gift baskets. She needs 3 bottles of wine for each basket. If she is making 10 gift baskets, how many bottles of wine does she need?
Gold Answer: 30
Model Output:
Stephanie is creating gift baskets. She needs 3 bottles of wine for each basket. If she is making 10 gift baskets, how many bottles of wine does she need?||<<3*10=30>>####30<|endoftext|>
Predicted Answer: 30


Evaluating:   0%|          | 5/385620 [00:01<31:58:14,  3.35it/s]

Question: John begins his hike at 8:30 AM and finishes at 6:30 PM. He rests for 20 minutes at noon, 15 minutes in the afternoon and 30 minutes before finishing his hike. How many hours did he spend hiking?
Gold Answer: 8.92
Model Output:
John begins his hike at 8:30 AM and finishes at 6:30 PM. He rests for 20 minutes at noon, 15 minutes in the afternoon and 30 minutes before finishing his hike. How many hours did he spend hiking?||<<20+15+30=65>> <<65/60=1.083333333333>> <<1.083+1.083=2.4166666666666667>>####2.4<|endoftext|>
Predicted Answer: 2.4


Evaluating:   0%|          | 266/385620 [01:18<31:26:00,  3.41it/s]


KeyboardInterrupt: 