In [34]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from tqdm import tqdm


In [35]:
class SimpleTokenizer:
    def __init__(self):
        self.word2idx = {"<pad>":0, "<bos>":1, "<eos>":2, "<unk>":3}
        self.idx2word = {0:"<pad>", 1:"<bos>", 2:"<eos>", 3:"<unk>"}

    def build_vocab(self, sentences, min_freq=1):
        freq = {}
        for sent in sentences:
            for w in sent.split():
                freq[w] = freq.get(w,0)+1
        for w, c in freq.items():
            if c >= min_freq:
                idx = len(self.word2idx)
                self.word2idx[w] = idx
                self.idx2word[idx] = w

    def encode(self, sentence):
        return [self.word2idx.get(w,3) for w in sentence.split()]

    def decode(self, ids):
        return " ".join([self.idx2word[i] for i in ids if i > 2])

    @property
    def vocab_size(self):
        return len(self.word2idx)


In [36]:
def load_parallel_data(zh_file, en_file):
    with open(zh_file, encoding="utf-8") as f:
        zh = [l.strip() for l in f.readlines()]
    with open(en_file, encoding="utf-8") as f:
        en = [l.strip() for l in f.readlines()]
    assert len(zh) == len(en)
    return zh, en


In [37]:
class TranslationDataset(Dataset):
    def __init__(self, zh_list, en_list, zh_tok, en_tok):
        self.zh = zh_list
        self.en = en_list
        self.zh_tok = zh_tok
        self.en_tok = en_tok

    def __getitem__(self, idx):
        src_ids = [1] + self.zh_tok.encode(self.zh[idx]) + [2]
        tgt_ids = [1] + self.en_tok.encode(self.en[idx]) + [2]
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

    def __len__(self):
        return len(self.zh)


In [38]:
def collate_fn(batch):
    src, tgt = zip(*batch)
    src = pad_sequence(src, batch_first=True, padding_value=0)
    tgt = pad_sequence(tgt, batch_first=True, padding_value=0)
    return src, tgt


In [39]:
class TransformerMT(nn.Module):
    def __init__(self, vocab_src, vocab_tgt, d_model=256, nhead=4, num_layers=4):
        super().__init__()

        self.src_embed = nn.Embedding(vocab_src, d_model)
        self.tgt_embed = nn.Embedding(vocab_tgt, d_model)
        self.pos = PositionalEncoding(d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers
        )

        self.fc = nn.Linear(d_model, vocab_tgt)

    def forward(self, src, tgt):
        src = self.pos(self.src_embed(src))
        tgt = self.pos(self.tgt_embed(tgt))

        tgt_mask = nn.Transformer.generate_square_subsequent_mask(
            tgt.size(1)
        ).to(src.device)

        out = self.transformer(
            src.transpose(0,1),
            tgt.transpose(0,1),
            tgt_mask=tgt_mask
        )

        out = out.transpose(0,1)
        out = self.fc(out)
        return out


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position * div)
        pe[:, 1::2] = torch.cos(position * div)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)


In [40]:
def train_model(model, loader, optimizer, criterion, epochs=10):
    model.train()
    for ep in range(epochs):
        total = 0
        for src, tgt in tqdm(loader):
            src, tgt = src.to(device), tgt.to(device)

            tgt_in = tgt[:, :-1]
            tgt_out = tgt[:, 1:].reshape(-1)

            logits = model(src, tgt_in).reshape(-1, model.fc.out_features)
            loss = criterion(logits, tgt_out)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total += loss.item()
        print(f"Epoch {ep+1} Loss = {total/len(loader):.4f}")


In [41]:
def translate(model, sentence, zh_tok, en_tok, max_len=40):
    model.eval()
    src = torch.tensor([[1] + zh_tok.encode(sentence) + [2]]).to(device)

    tgt = torch.tensor([[1]]).to(device)

    for _ in range(max_len):
        logits = model(src, tgt)
        next_token = logits[0,-1].argmax().item()
        tgt = torch.cat([tgt, torch.tensor([[next_token]]).to(device)], dim=1)
        if next_token == 2:
            break
    return en_tok.decode(tgt[0].tolist())


In [42]:
zh_sentences = [
    "我 爱 学习 人工智能",
    "今天天气 很 好",
    "我 喜欢 看 电影",
    "你 在 做 什么",
    "我们 明天 去 北京",
    "这是 一个 很 好 的 想法",
    "请问 洗手间 在 哪里",
    "我 在 找 工作",
    "他 是 一名 工程师",
    "她 喜欢 吃 苹果",
]

en_sentences = [
    "I love studying artificial intelligence",
    "The weather is very good today",
    "I like watching movies",
    "What are you doing",
    "We will go to Beijing tomorrow",
    "This is a very good idea",
    "Excuse me where is the restroom",
    "I am looking for a job",
    "He is an engineer",
    "She likes eating apples",
]

# 扩展成 500 条
ZH = []
EN = []
for i in range(50):
    for z, e in zip(zh_sentences, en_sentences):
        ZH.append(z)
        EN.append(e)

# 写入文件
with open("train.zh", "w", encoding="utf-8") as f:
    for line in ZH:
        f.write(line + "\n")

with open("train.en", "w", encoding="utf-8") as f:
    for line in EN:
        f.write(line + "\n")

print("训练数据 train.zh / train.en 已成功生成，共", len(ZH), "行")


训练数据 train.zh / train.en 已成功生成，共 500 行


In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# ① 加载数据
zh, en = load_parallel_data("train.zh", "train.en")

# ② 构建词表
zh_tok = SimpleTokenizer()
en_tok = SimpleTokenizer()
zh_tok.build_vocab(zh)
en_tok.build_vocab(en)

# ③ 构建 Dataset & Loader
ds = TranslationDataset(zh, en, zh_tok, en_tok)
loader = DataLoader(ds, batch_size=32, shuffle=True, collate_fn=collate_fn)

# ④ 初始化模型
model = TransformerMT(
    vocab_src=zh_tok.vocab_size,
    vocab_tgt=en_tok.vocab_size,
    d_model=256,
    nhead=4,
    num_layers=4
).to(device)

optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0)


# ⑤ 训练
train_model(model, loader, optimizer, criterion, epochs=10)

# ⑥ 测试翻译
print(translate(model, "我 喜欢 学习 人工智能", zh_tok, en_tok))


100%|██████████| 16/16 [00:00<00:00, 41.68it/s]


Epoch 1 Loss = 2.6941


100%|██████████| 16/16 [00:00<00:00, 35.97it/s]


Epoch 2 Loss = 0.4069


100%|██████████| 16/16 [00:00<00:00, 38.43it/s]


Epoch 3 Loss = 0.0716


100%|██████████| 16/16 [00:00<00:00, 35.80it/s]


Epoch 4 Loss = 0.0295


100%|██████████| 16/16 [00:00<00:00, 36.18it/s]


Epoch 5 Loss = 0.0193


100%|██████████| 16/16 [00:00<00:00, 43.16it/s]


Epoch 6 Loss = 0.0153


100%|██████████| 16/16 [00:00<00:00, 40.43it/s]


Epoch 7 Loss = 0.0128


100%|██████████| 16/16 [00:00<00:00, 34.43it/s]


Epoch 8 Loss = 0.0112


100%|██████████| 16/16 [00:00<00:00, 35.31it/s]


Epoch 9 Loss = 0.0097


100%|██████████| 16/16 [00:00<00:00, 43.27it/s]

Epoch 10 Loss = 0.0086
I love studying artificial intelligence



