In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
def load_model(model_dir):
    """Load a tokenizer and seq2seq model from ``model_dir`` for inference.

    Args:
        model_dir: Location passed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model, device)`` tuple. ``device`` is the string
        ``"cuda"`` when ``torch.cuda.is_available()`` is true, otherwise
        ``"cpu"``; the model has already been moved to that device and
        switched to eval mode.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # The slow (non-"fast") tokenizer implementation is requested explicitly.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    model = model.to(device)
    model.eval()

    return tokenizer, model, device

In [3]:
def generate_answer(tokenizer, model, device, question, context, max_input_len=512, max_target_len=64, num_beams=5):
    """Generate an answer string for ``question`` given ``context``.

    The question and context are merged into a single prompt, tokenized,
    moved to ``device``, passed to ``model.generate``, and the first
    returned sequence is decoded back to text.

    Args:
        tokenizer: Callable that encodes text and exposes ``decode``
            (presumably the tokenizer returned by ``load_model`` -- confirm
            against the caller).
        model: Object exposing ``generate`` (presumably the seq2seq model
            returned by ``load_model``).
        device: Target passed to the encoded batch's ``.to(...)``.
        question: Question text inserted into the prompt.
        context: Context text inserted into the prompt.
        max_input_len: ``max_length`` used when tokenizing; truncation is on.
        max_target_len: ``max_length`` passed to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.

    Returns:
        The result of ``tokenizer.decode`` on the first generated sequence,
        with special tokens skipped.
    """
    # The prompt layout, including the two spaces before "context:", is kept
    # exactly as-is.
    source_text = f"question: {question}  context: {context}"

    encoded = tokenizer(
        source_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=max_input_len,
    )
    encoded = encoded.to(device)

    input_ids = encoded.input_ids
    generation_options = dict(
        attention_mask=encoded.attention_mask,
        max_length=max_target_len,
        num_beams=num_beams,
        early_stopping=True,
    )
    generated = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [6]:
# Load the tokenizer/model stored under ./final_model once for this demo.
model_dir = "./final_model"
tokenizer, model, device = load_model(model_dir)

# Example input. Context: "I am Li Ming; my father is Li An; my mother is
# Wang Er."  Question: "Who is Li An's wife?"
ctx = "我是李明，我的爸爸是李安，我的妈妈是王二。"
q = "李安的老婆是谁？"

ans = generate_answer(
    tokenizer,
    model,
    device,
    question=q,
    context=ctx,
)
# Printed label means "Generated answer:".
print("生成答案：", ans)

生成答案： 王二
