In [1]:
import json
import os

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class QADataset(Dataset):
    """Question-answering dataset read from a JSON-lines file.

    Every non-blank line of ``file_path`` is parsed as one JSON object; the
    objects are expected to provide the ``question``, ``context`` and
    ``answer`` keys that ``__getitem__`` reads.

    Args:
        file_path: Path to the JSON-lines file (read as UTF-8).
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors="pt"`` and must
            expose ``pad_token_id``.
        max_input_len: Length the encoded "question + context" prompt is
            padded / truncated to.
        max_output_len: Length the encoded answer is padded / truncated to.
    """

    def __init__(self, file_path, tokenizer, max_input_len=512, max_output_len=64):
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len

        # Skip blank lines (e.g. a trailing newline at end of file) instead of
        # letting json.loads("") raise.
        with open(file_path, "r", encoding="utf-8") as f:
            self.samples = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict of 1-D tensors.

        Returns:
            The tokenizer's outputs for the prompt (each with the leading
            batch dimension removed) plus a ``labels`` tensor in which padding
            positions are set to -100.
        """
        sample = self.samples[idx]
        input_text = f"问题: {sample['question']} 上下文: {sample['context']}"
        target_text = sample["answer"]

        model_inputs = self.tokenizer(
            input_text,
            max_length=self.max_input_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        labels = self.tokenizer(
            target_text,
            max_length=self.max_output_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops only the batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse a length-1 sequence dimension.
        label_ids = labels["input_ids"].squeeze(0).clone()

        # Padding must not contribute to the loss: -100 is the index ignored by
        # the cross-entropy loss used for the labels.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            label_ids[label_ids == pad_id] = -100

        item = {key: val.squeeze(0) for key, val in model_inputs.items()}
        item["labels"] = label_ids
        return item


In [4]:
def train(model, tokenizer, train_dataset, val_dataset=None, epochs=3, batch_size=8, lr=3e-5, device="cuda"):
    """Train ``model`` with AdamW and save a checkpoint after every epoch.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``; it must also provide
            ``save_pretrained``.
        tokenizer: Tokenizer saved next to the model via ``save_pretrained``.
        train_dataset: Dataset whose batches contain ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_dataset: Optional dataset of the same form, evaluated once per
            epoch when provided and non-empty.
        epochs: Number of passes over ``train_dataset``.
        batch_size: Batch size for both data loaders.
        lr: AdamW learning rate.
        device: Device the model and batches are moved to.

    Returns:
        ``(train_losses, val_losses)``: per-epoch mean batch losses.
        ``val_losses`` stays empty when no validation set is used.

    Raises:
        ValueError: If ``train_dataset`` yields no batches.
    """

    def _batch_loss(batch):
        # Move the tensors the model consumes to the target device and run one
        # forward pass; shared by the training and validation loops.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        return outputs.loss

    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    if len(train_loader) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("train_dataset is empty; nothing to train on")

    # Always defined, so the validation step can test it directly.
    val_loader = None
    if val_dataset:
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Per-epoch loss history.
    train_losses = []
    val_losses = []

    for epoch in range(1, epochs + 1):
        print(f"\nEpoch {epoch}/{epochs}")
        print("-" * 30)

        # -----------------------------
        # Training
        # -----------------------------
        model.train()
        running_loss = 0.0
        progress_bar = tqdm(train_loader, total=len(train_loader))

        for batch in progress_bar:
            optimizer.zero_grad()
            loss = _batch_loss(batch)
            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            running_loss += step_loss
            progress_bar.set_description(f"loss: {step_loss:.6f}")

        epoch_train_loss = running_loss / len(train_loader)
        train_losses.append(epoch_train_loss)
        print(f"Epoch {epoch} Training Loss: {epoch_train_loss:.6f}")

        # -----------------------------
        # Validation
        # -----------------------------
        if val_loader is not None:
            model.eval()
            val_loss_total = 0.0
            with torch.no_grad():
                for batch in tqdm(val_loader, desc="Validation"):
                    val_loss_total += _batch_loss(batch).item()

            val_loss = val_loss_total / len(val_loader)
            val_losses.append(val_loss)
            print(f"Epoch {epoch} Validation Loss: {val_loss:.6f}")

        # -----------------------------
        # Save the model (every epoch)
        # -----------------------------
        save_dir = f"./qa_t5_epoch{epoch}"
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"模型已保存到 {save_dir}")

    return train_losses, val_losses

In [6]:
# -----------------------------
# Main setup: pick the device, then load the pretrained tokenizer and model
# -----------------------------
model_name = "langboat/mengzi-t5-base"

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device: ", device)

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# Print the model architecture between two banner lines.
print("\n===== 模型信息 =====", model, "===================", sep="\n")

device:  cuda

===== 模型信息 =====
T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048,

In [11]:

# -------------------------
# Datasets: build the train / dev splits and preview a few raw samples
# -------------------------
train_dataset = QADataset("./data/DuReaderQG/train.json", tokenizer)
val_dataset = QADataset("./data/DuReaderQG/dev.json", tokenizer)

print(f"\n训练集大小: {len(train_dataset)} 条")
print(f"验证集大小: {len(val_dataset)} 条")

# Show the first three training samples (fewer if the set is smaller).
print("\n===== 训练集样本展示 =====")
for sample_no, sample in enumerate(train_dataset.samples[:3], start=1):
    context = sample["context"]
    # Only the first 80 characters of the context are shown; mark the cut.
    suffix = "..." if len(context) > 80 else ""

    print(f"\n样本 {sample_no}:")
    print("问题:", sample["question"])
    print("上下文:", context[:80], suffix)
    print("答案:", sample["answer"])

    # Encode the same prompt format the dataset uses, truncated for preview.
    prompt = f"问题: {sample['question']} 上下文: {sample['context']}"
    encoded = tokenizer(prompt, max_length=64, truncation=True)
    print("input_ids:", encoded["input_ids"][:20], "...")


训练集大小: 14520 条
验证集大小: 984 条

===== 训练集样本展示 =====

样本 1:
问题: 仙剑奇侠传3第几集上天界
上下文: 第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸 ...
答案: 第35集
input_ids: [143, 13, 7, 1707, 1467, 992, 3979, 707, 100, 379, 645, 647, 9724, 1252, 7, 2868, 180, 13, 3389, 2838] ...

样本 2:
问题: 燃气热水器哪个牌子好
上下文: 选择燃气热水器时，一定要关注这几个问题：1、出水稳定性要好，不能出现忽热忽冷的现象2、快速到达设定的需求水温3、操作要智能、方便4、安全性要好，要装有安全报警装 ...
答案: 方太
input_ids: [143, 13, 7, 9147, 14702, 2347, 8245, 67, 7, 2868, 180, 13, 7, 188, 9147, 14702, 40, 3, 683, 668] ...

样本 3:
问题: 乔丹打了多少个赛季
上下文: 迈克尔.乔丹在NBA打了15个赛季。他在84年进入nba，期间在1993年10月6日第一次退役改打棒球，95年3月18日重新回归，在99年1月13日第二次退役， ...
答案: 15个
input_ids: [143, 13, 7, 19918, 176, 8986, 19940, 7, 2868, 180, 13, 7, 25659, 2250, 19918, 8, 18820, 5719, 672, 19940] ...


In [12]:
# Run the fine-tuning loop; every argument is passed by keyword so the
# hyper-parameters are easy to spot and tweak.
train_losses, val_losses = train(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    epochs=30,
    batch_size=8,
    lr=3e-5,
    device=device,
)


Epoch 1/30
------------------------------


loss: 36.347759:   0%|          | 1/1815 [00:22<11:10:19, 22.17s/it]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# -----------------------------
# Plot, save and display the loss curves
# -----------------------------
# Use an explicit Figure/Axes pair so the figure can be saved through its own
# handle rather than through pyplot's "current figure" state.
fig, ax = plt.subplots()
ax.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss")
if val_losses:
    ax.plot(range(1, len(val_losses) + 1), val_losses, label="Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss Curve")
ax.legend()
ax.grid(True)

# Save BEFORE showing: once plt.show() has run, the current figure is released
# and a later plt.savefig() would write an empty image.
fig.savefig("loss_curve.png")
print("Loss 曲线已保存为 loss_curve.png")

# Display in the notebook.
plt.show()