In [None]:
!pip install sentencepiece
!pip install sacrebleu
!pip install tqdm
!pip install datasets

In [None]:
from google.colab import drive
# Mount Google Drive so the project folder under /content/drive is reachable from this runtime.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import torch
import time
import math
import sentencepiece as spm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sacrebleu import corpus_bleu
from tqdm import tqdm
from datasets import load_dataset

import sys

# Drive folder that holds Trans.py; it becomes both the working directory and an import root.
PROJECT_DIR = "/content/drive/MyDrive/Colab Notebooks/ICT303/Transformer"
os.chdir(PROJECT_DIR)         # relative paths used later ("data/...", "best_model.pth") resolve here
sys.path.append(PROJECT_DIR)  # makes the local Trans module importable
from Trans import Transformer


# 加载 IWSLT2017 中英文数据集

In [None]:
# Load the IWSLT2017 Chinese-English corpus.
dataset = load_dataset("iwslt2017", "iwslt2017-zh-en", trust_remote_code=True)

# Official train / validation / test splits.
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]


def _parallel_texts(split):
    """Return ``(zh_texts, en_texts)`` for one split, reading each row exactly once.

    Every row is expected to expose ``row["translation"]["zh"]`` and
    ``row["translation"]["en"]``; both lists keep the row order of ``split``.
    """
    zh_texts, en_texts = [], []
    for row in split:
        pair = row["translation"]
        zh_texts.append(pair["zh"])
        en_texts.append(pair["en"])
    return zh_texts, en_texts


# Extract the raw sentences of every split.
zh_train, en_train = _parallel_texts(train_data)
zh_val, en_val = _parallel_texts(val_data)
zh_test, en_test = _parallel_texts(test_data)

print(f"训练集大小: {len(zh_train)}")
print(f"验证集大小: {len(zh_val)}")
print(f"测试集大小: {len(zh_test)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

iwslt2017.py:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

zh-en.zip:   0%|          | 0.00/26.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/231266 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8549 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/879 [00:00<?, ? examples/s]

训练集大小: 231266
验证集大小: 879
测试集大小: 8549


# 训练 SentencePiece 分词器

In [None]:
# Dump the training sentences to disk, one sentence per line, for SentencePiece.
os.makedirs("data", exist_ok=True)  # open(..., "w") does not create a missing directory

with open("data/train.zh", "w", encoding="utf-8") as f:
    f.write("\n".join(zh_train))

with open("data/train.en", "w", encoding="utf-8") as f:
    f.write("\n".join(en_train))

# Train one SentencePiece model per language.
# The rest of the notebook treats id 0 as padding (padding masks, ignore_index=0), but
# SentencePiece's default layout puts <unk> at id 0 and disables <pad>. Assign the special
# ids explicitly so that 0 is a real <pad> token and <unk> no longer collides with it.
# Any checkpoint trained with an older tokenizer must be retrained after this cell is re-run.
for _corpus_path, _model_prefix in (("data/train.zh", "data/sp_zh"), ("data/train.en", "data/sp_en")):
    spm.SentencePieceTrainer.train(
        input=_corpus_path,
        model_prefix=_model_prefix,
        vocab_size=32000,
        pad_id=0,
        unk_id=1,
        bos_id=2,
        eos_id=3,
    )




In [None]:
# Load the trained SentencePiece models (Chinese and English).
sp_zh = spm.SentencePieceProcessor(model_file="data/sp_zh.model")
sp_en = spm.SentencePieceProcessor(model_file="data/sp_en.model")

# Smoke test: print the subword pieces of one sample sentence per language.
for _processor, _sample in ((sp_zh, "你好世界"), (sp_en, "Hello World")):
    print(_processor.encode(_sample, out_type=str))

['▁', '你好', '世界']
['▁Hello', '▁World']


# 创建 PyTorch 数据集

In [None]:
# Number of sentence pairs per mini-batch; shared by the train, validation and test DataLoaders.
batch_size = 64

In [None]:
class TranslationDataset(Dataset):
    """Parallel-sentence dataset that yields fixed-length token-id tensors.

    Each item is a dict with two ``torch.long`` tensors of length ``max_len``:

    * ``"source"``: the encoded source sentence, truncated and right-padded with 0.
    * ``"target"``: ``[BOS] + encoded target + [EOS]``, truncated so that BOS and EOS
      always survive, then right-padded with 0.

    BOS/EOS are required by teacher forcing: the training loop feeds ``target[:, :-1]``
    and predicts ``target[:, 1:]``, so without BOS the first real token is never
    predicted and without EOS the model never learns where to stop.

    Args:
        src_texts: sequence of source sentences.
        tgt_texts: sequence of target sentences, aligned with ``src_texts``.
        src_tokenizer: object with ``encode(text, out_type=int) -> list[int]``.
        tgt_tokenizer: same interface; its ``bos_id()`` / ``eos_id()`` methods, when
            present and non-negative, provide the BOS/EOS ids.
        max_len: length of every returned tensor.
    """

    PAD_ID = 0  # padding value; matches the masks and ignore_index used by the training code

    def __init__(self, src_texts, tgt_texts, src_tokenizer, tgt_tokenizer, max_len=128):
        self.source_texts = src_texts
        self.target_texts = tgt_texts
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.source_texts)

    @staticmethod
    def _special_id(tokenizer, getter_name):
        """Return the tokenizer's special id, or -1 when it is undefined or disabled."""
        getter = getattr(tokenizer, getter_name, None)
        return getter() if callable(getter) else -1

    def _fit(self, ids):
        """Truncate ``ids`` to ``max_len`` and right-pad with ``PAD_ID``."""
        ids = ids[:self.max_len]
        return ids + [self.PAD_ID] * (self.max_len - len(ids))

    def __getitem__(self, idx):
        src = self.src_tokenizer.encode(self.source_texts[idx], out_type=int)
        tgt = self.tgt_tokenizer.encode(self.target_texts[idx], out_type=int)

        bos_id = self._special_id(self.tgt_tokenizer, "bos_id")
        eos_id = self._special_id(self.tgt_tokenizer, "eos_id")
        prefix = [bos_id] if bos_id >= 0 else []
        suffix = [eos_id] if eos_id >= 0 else []

        # Truncate the sentence body first so the EOS marker is never cut off.
        room = max(self.max_len - len(prefix) - len(suffix), 0)
        tgt = prefix + tgt[:room] + suffix

        return {
            "source": torch.tensor(self._fit(src), dtype=torch.long),
            "target": torch.tensor(self._fit(tgt), dtype=torch.long),
        }


# Build one dataset per split (Chinese source -> English target).
train_dataset = TranslationDataset(zh_train, en_train, sp_zh, sp_en)
val_dataset = TranslationDataset(zh_val, en_val, sp_zh, sp_en)
test_dataset = TranslationDataset(zh_test, en_test, sp_zh, sp_en)

# Build the DataLoaders.
# Only the training loader drops the incomplete last batch. Validation and test keep every
# sample, otherwise the reported loss/BLEU would silently exclude the tail of each split.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False)


In [None]:
# Report the number of samples per dataset, then the number of batches per loader.
for _sized in (train_dataset, val_dataset, test_dataset, train_loader, val_loader, test_loader):
    print(len(_sized))


231266
879
8549
3613
13
133


# 训练 Transformer 模型

In [None]:

# Hyperparameters.
vocab_size = 32000  # same value as the vocab_size used when training the SentencePiece models
d_model = 512
num_heads = 8
num_layers = 6
epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, loss function, optimiser and learning-rate schedule.
model = Transformer(vocab_size, d_model, num_layers, num_heads)
model = model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # positions whose label is 0 (padding) add no loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)  # halve the LR every 5 scheduler steps


In [None]:
def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: a timestamp previously obtained from ``time.time()``.

    Returns:
        A string such as ``'2m 5s'``; the seconds part is truncated to an integer.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return f"{minutes}m {int(seconds)}s"

In [None]:
def create_padding_mask(seq, pad_token_id=0):
    """Return a boolean mask that is True wherever ``seq`` is not padding.

    Two singleton axes are inserted after the first dimension, so a
    ``[batch_size, seq_len]`` input yields ``[batch_size, 1, 1, seq_len]``,
    which broadcasts over attention heads and query positions.
    """
    is_real_token = seq.ne(pad_token_id)
    return is_real_token[:, None, None]  # [B, 1, 1, L] for a [B, L] input
def create_causal_mask(size):
    """Return a ``[size, size]`` boolean lower-triangular mask.

    Entry ``[i, j]`` is True exactly when ``j <= i``, i.e. position ``i`` may look at
    itself and at earlier positions only.
    """
    positions = torch.arange(size)
    return positions.unsqueeze(0) <= positions.unsqueeze(1)  # [L, L]


In [None]:
def _teacher_forcing_loss(batch):
    """Run one teacher-forced forward pass on ``batch`` and return its loss tensor.

    The decoder is fed ``target[:, :-1]`` and scored against ``target[:, 1:]``.
    Uses the notebook-level ``model``, ``criterion`` and ``device``. Shared by the
    training and validation loops so both always build their masks the same way.
    """
    src = batch["source"].to(device)
    tgt = batch["target"].to(device)
    decoder_input, labels = tgt[:, :-1], tgt[:, 1:]

    src_mask = create_padding_mask(src)  # [B, 1, 1, L_src]
    tgt_pad_mask = create_padding_mask(decoder_input)  # [B, 1, 1, L_tgt - 1]
    causal_mask = create_causal_mask(decoder_input.size(1)).to(device)  # [L_tgt - 1, L_tgt - 1]
    tgt_mask = tgt_pad_mask & causal_mask.unsqueeze(0).unsqueeze(1)  # [B, 1, L_tgt - 1, L_tgt - 1]

    logits = model(src, decoder_input, src_mask=src_mask, tgt_mask=tgt_mask)
    # reshape (not view) so a non-contiguous model output cannot raise here.
    return criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))


def train():
    """Train the notebook-level ``model`` for ``epochs`` epochs.

    Each epoch runs one optimisation pass over ``train_loader`` and one gradient-free
    pass over ``val_loader``, steps the LR scheduler once, and prints the average
    losses together with the elapsed time. Whenever the average validation loss
    improves, the model's state_dict is written to ``best_model.pth``.
    """
    model.to(device)
    best_val_loss = float("inf")

    for epoch in range(epochs):
        start_time = time.time()

        # Training pass.
        model.train()
        total_train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _teacher_forcing_loss(batch)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # Validation pass.
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                total_val_loss += _teacher_forcing_loss(batch).item()

        scheduler.step()
        avg_train_loss = total_train_loss / len(train_loader)
        # max(..., 1) avoids a ZeroDivisionError when the validation loader yields no batch.
        avg_val_loss = total_val_loss / max(len(val_loader), 1)
        print(f"[{timeSince(start_time)}]Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Keep the checkpoint with the lowest validation loss seen so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "best_model.pth")
            print("✅ 最优模型已保存 (best_model.pth)")


In [None]:
# Start training (each epoch took roughly an hour in the run captured in the output below).
train()

[58m 5s]Epoch 1/10, Train Loss: 3.9275, Val Loss: 4.0653
✅ 最优模型已保存 (best_model.pth)
[58m 5s]Epoch 2/10, Train Loss: 3.3092, Val Loss: 3.7724
✅ 最优模型已保存 (best_model.pth)
[58m 7s]Epoch 3/10, Train Loss: 2.8317, Val Loss: 3.6748
✅ 最优模型已保存 (best_model.pth)


# 评估模型（BLEU 分数）

In [None]:
def _strip_special_ids(ids, pad_id, bos_id, eos_id):
    """Return ``ids`` cut at the first EOS, with PAD and BOS ids removed."""
    cleaned = []
    for token_id in ids:
        if token_id == eos_id:
            break
        if token_id == pad_id or token_id == bos_id:
            continue
        cleaned.append(token_id)
    return cleaned


def evaluate_model(model, dataloader, criterion, tokenizer, device, max_len=100, num_show=5, src_tokenizer=None):
    """Compute the teacher-forced loss and the greedy-decoding BLEU score on ``dataloader``.

    Args:
        model: callable as ``model(src, tgt_in, src_mask=..., tgt_mask=...)`` returning
            logits shaped ``[B, T, vocab]``.
        dataloader: yields dicts with ``"source"`` and ``"target"`` id tensors, as
            produced by ``TranslationDataset``.
        criterion: loss applied to flattened logits and shifted targets.
        tokenizer: target-side SentencePiece processor; provides ``bos_id()``,
            ``eos_id()`` and ``decode(ids)``.
        device: device on which evaluation runs.
        max_len: maximum number of tokens generated per sentence.
        num_show: number of examples printed for inspection.
        src_tokenizer: optional source-side processor used only to print the source
            sentence; when omitted the source line is skipped.

    Returns:
        ``(avg_loss, bleu_score)``.

    Raises:
        ValueError: if the tokenizer has no BOS id or the dataloader yields no batch.
    """
    model.eval()
    model.to(device)

    pad_id = 0  # padding value used by the dataset and by create_padding_mask's default
    bos_id, eos_id = tokenizer.bos_id(), tokenizer.eos_id()
    if bos_id < 0:
        raise ValueError("tokenizer has no BOS id; greedy decoding needs a start token")

    total_loss = 0.0
    num_batches = 0
    num_shown = 0
    all_predictions = []
    all_references = []

    with torch.no_grad():
        for batch in dataloader:
            src = batch["source"].to(device)  # [B, L_src]
            tgt = batch["target"].to(device)  # [B, L_tgt]
            num_rows = src.size(0)
            src_mask = create_padding_mask(src).to(device)

            # Greedy decoding. Start from BOS: a PAD start would be masked out by the
            # padding mask and leave the first query with nothing to attend to.
            outputs = torch.full((num_rows, 1), bos_id, dtype=torch.long, device=device)
            finished = torch.zeros(num_rows, dtype=torch.bool, device=device)
            for _ in range(max_len):
                causal = create_causal_mask(outputs.size(1)).to(device).unsqueeze(0).unsqueeze(1)
                step_mask = create_padding_mask(outputs).to(device) & causal

                out = model(src, outputs, src_mask=src_mask, tgt_mask=step_mask)  # [B, T, vocab]
                next_token = out[:, -1, :].argmax(-1)  # [B]
                # Rows that already emitted EOS only receive padding from now on.
                next_token = next_token.masked_fill(finished, pad_id)
                outputs = torch.cat([outputs, next_token.unsqueeze(1)], dim=1)

                finished = finished | (next_token == eos_id)
                if finished.all():
                    break

            # Teacher-forced loss on the reference translation.
            tgt_input = tgt[:, :-1]
            causal = create_causal_mask(tgt_input.size(1)).to(device).unsqueeze(0).unsqueeze(1)
            tgt_mask = create_padding_mask(tgt_input).to(device) & causal
            pred_out = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
            loss = criterion(pred_out.reshape(-1, pred_out.size(-1)), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            # Turn ids back into text, dropping PAD/BOS and everything after EOS.
            for j in range(num_rows):
                pred_text = tokenizer.decode(_strip_special_ids(outputs[j].tolist(), pad_id, bos_id, eos_id))
                ref_text = tokenizer.decode(_strip_special_ids(tgt[j].tolist(), pad_id, bos_id, eos_id))
                all_predictions.append(pred_text)
                all_references.append(ref_text)

                # Print the first num_show examples.
                if num_shown < num_show:
                    num_shown += 1
                    if src_tokenizer is not None:
                        src_ids = [t for t in src[j].tolist() if t != pad_id]
                        print(f"🔹 Source:    {src_tokenizer.decode(src_ids)}")
                    print(f"🔸 Reference: {ref_text}")
                    print(f"🔻 Predicted: {pred_text}")
                    print("")

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches; nothing to evaluate")

    # `corpus_bleu` is the name imported at the top of the notebook (from sacrebleu import corpus_bleu).
    bleu = corpus_bleu(all_predictions, [all_references])
    avg_loss = total_loss / num_batches

    print(f"🧪 Eval Loss: {avg_loss:.4f} | 🏅 BLEU score: {bleu.score:.2f}")
    return avg_loss, bleu.score


In [None]:
# Evaluate on the test split. The tokenizer argument decodes English predictions and
# references, so pass the English SentencePiece model (no variable named `tokenizer`
# is defined anywhere in this notebook).
evaluate_model(model, test_loader, criterion, sp_en, device)



In [None]:
# Restore the best checkpoint saved during training and switch to inference mode.
# The tokenizers (sp_zh / sp_en), `device` and `model` come from the earlier cells.
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict contains, so a tampered checkpoint file cannot execute arbitrary code.
_best_state = torch.load("best_model.pth", map_location=device, weights_only=True)
model.load_state_dict(_best_state)
model.to(device)
model.eval()


# 4. Build the attention masks
def create_mask(src, tgt):
    """Return ``(src_mask, tgt_mask)`` for a source/target pair of id tensors.

    ``src_mask`` is True at non-padding (non-zero) source positions, with two singleton
    axes inserted after the batch axis. ``tgt_mask`` is a ``[1, 1, T, T]`` boolean
    lower-triangular mask, where ``T = tgt.size(1)``.

    The causal mask is created on ``tgt``'s own device rather than on a notebook-level
    global, so the function does not depend on hidden state.
    """
    src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
    tgt_len = tgt.size(1)
    tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len), device=tgt.device)).bool()
    tgt_mask = tgt_mask.unsqueeze(0).unsqueeze(1)
    return src_mask, tgt_mask

# 5. Inference (greedy decoding)
def greedy_decode(model, src_sentence, max_len=50, start_id=sp_en.bos_id(), end_id=sp_en.eos_id()):
    """Translate one Chinese sentence by greedy decoding and return the target token ids.

    Args:
        model: object exposing ``encoder(src, src_mask)`` and
            ``decoder(ys, memory, src_mask, tgt_mask)``; it should already be in eval mode.
        src_sentence: raw source text, tokenised with ``sp_zh``.
        max_len: maximum length of the returned sequence, start token included.
        start_id: id that seeds the decoder. The default is the BOS id of the English
            (target-side) tokenizer, evaluated once when the function is defined.
        end_id: id that stops decoding. The default is the target-side EOS id.

    Returns:
        A list of ids that begins with ``start_id`` and includes ``end_id`` if it was generated.
    """
    src_ids = sp_zh.encode(src_sentence, out_type=int)
    src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)
    src_mask = (src_tensor != 0).unsqueeze(1).unsqueeze(2)

    # Inference only: no_grad stops autograd from recording a graph at every decoding step.
    with torch.no_grad():
        memory = model.encoder(src_tensor, src_mask)
        ys = torch.full((1, 1), start_id, dtype=torch.long, device=device)

        for _ in range(max_len - 1):
            cur_len = ys.size(1)
            tgt_mask = torch.tril(torch.ones((cur_len, cur_len), device=device)).bool().unsqueeze(0).unsqueeze(1)
            out = model.decoder(ys, memory, src_mask, tgt_mask)
            next_word = out[:, -1, :].argmax(dim=-1).item()  # highest-scoring id at the last position
            ys = torch.cat([ys, torch.tensor([[next_word]], device=device)], dim=1)
            if next_word == end_id:
                break
    return ys[0].tolist()

# 6. 解码输出



In [None]:
# Demo: translate a single Chinese sentence with the restored checkpoint.
src_sentence = "今天天气怎么样"

# Greedy decoding returns target-side token ids; the English tokenizer turns them back into text.
output_ids = greedy_decode(model, src_sentence)
translated_sentence = sp_en.decode(output_ids)
print("翻译结果:", translated_sentence)

翻译结果: today. So what's going on today? What's the weather. What's going to the weather. What? What's the weather. What? What's the weather. What? What? What's the weather


In [None]:
# Sanity check: encode a sample Chinese string with sp_zh and print the resulting ids.
# NOTE(review): the variable is called tgt_ids, but sp_zh is the source-side tokenizer
# everywhere else in this notebook; confirm whether sp_en was intended.
tgt_ids = sp_zh.encode("目标句子", out_type=int)
print(tgt_ids)

[4, 696, 5772]
