In [14]:
import os

# 从transformer.ipynb中导入transformer类
try:
    import nbimporter
    import transformer as tr
    print("导入成功！")
    print("可用的类和函数：")
    print([name for name in dir(tr) if not name.startswith('_')])
except Exception as e:
    print(f"导入失败：{e}")
    print("请检查transformer.ipynb是否在当前目录下")


导入成功！
可用的类和函数：
['Decoder', 'DecoderLayer', 'Encoder', 'EncoderLayer', 'FeedForward', 'LayerNorm', 'MultiHeadAttention', 'PositionalEncoding', 'ScaleDotProductAttention', 'Transformer', 'TransformerEmbedding', 'get_ipython', 'math', 'nn', 'plt', 'torch']


# 二、数据加载和处理

## 2.1 加载数据集

### 2.1.1 数据集

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

dataset = load_dataset("bentrevett/multi30k")

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})


### 2.1.2 分词器

In [16]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordLevel
from tokenizers.trainers import BpeTrainer, WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

class HFVocabBuilder:
    def __init__(self, tokenizer_type='wordlevel'):
        self.tokenizer_type = tokenizer_type
        self.source_tokenizer = None
        self.target_tokenizer = None
        
    def build_vocab(self, train_data, src="de", trg="en", min_freq=1):
        """使用HuggingFace tokenizers构建词汇表"""
        
        # 提取源语言和目标语言文本
        source_texts = []
        target_texts = []
        
        for example in train_data:
            source_texts.append(example[src])
            target_texts.append(example[trg])
        
        # 构建源语言tokenizer
        self.source_tokenizer = self._build_single_tokenizer(source_texts, min_freq)
        
        # 构建目标语言tokenizer
        self.target_tokenizer = self._build_single_tokenizer(target_texts, min_freq)
        
        # 创建兼容接口
        self.source = self._create_vocab_interface(self.source_tokenizer)
        self.target = self._create_vocab_interface(self.target_tokenizer)
        
        print(f"源语言词汇表大小: {self.source_tokenizer.get_vocab_size()}")
        print(f"目标语言词汇表大小: {self.target_tokenizer.get_vocab_size()}")
    
    def _build_single_tokenizer(self, texts, min_freq):
        """构建单个tokenizer"""
        if self.tokenizer_type == 'wordlevel':
            # 使用WordLevel模型
            tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
            trainer = WordLevelTrainer(
                special_tokens=["<pad>", "<unk>", "<sos>", "<eos>"],
                min_frequency=min_freq
            )
        else:
            # 使用BPE模型
            tokenizer = Tokenizer(BPE(unk_token="<unk>"))
            trainer = BpeTrainer(
                special_tokens=["<pad>", "<unk>", "<sos>", "<eos>"],
                min_frequency=min_freq
            )
        
        # 设置预处理器
        tokenizer.pre_tokenizer = Whitespace()
        
        # 设置后处理器（添加特殊标记）
        tokenizer.post_processor = TemplateProcessing(
            single="<sos> $A <eos>",
            special_tokens=[("<sos>", 2), ("<eos>", 3)]
        )
        
        # 训练tokenizer
        tokenizer.train_from_iterator(texts, trainer)
        
        return tokenizer
    
    def _create_vocab_interface(self, tokenizer):
        """创建与torchtext兼容的接口"""
        vocab_obj = type('Vocab', (), {})()
        
        # 获取词汇表
        vocab = tokenizer.get_vocab()
        vocab_obj.stoi = vocab
        vocab_obj.itos = {v: k for k, v in vocab.items()}
        vocab_obj.__len__ = lambda: len(vocab)
        
        return type('Field', (), {'vocab': vocab_obj})()

In [17]:
# 使用示例
hf_builder = HFVocabBuilder()
hf_builder.build_vocab(train_data=dataset['train'], src="de", trg="en", min_freq=2)

# 获取特殊标记索引
src_pad_idx = hf_builder.source.vocab.stoi['<pad>']
trg_pad_idx = hf_builder.target.vocab.stoi['<pad>']
trg_sos_idx = hf_builder.target.vocab.stoi['<sos>']
print(src_pad_idx, trg_pad_idx, trg_sos_idx)

enc_voc_size = hf_builder.source_tokenizer.get_vocab_size()
dec_voc_size = hf_builder.target_tokenizer.get_vocab_size()

源语言词汇表大小: 8060
目标语言词汇表大小: 6203
0 0 2


In [18]:
tmp_text = dataset['train'][0]['de']
print("original text:")
print(tmp_text)
print("tokenized text:")
tmp_text_encoded = hf_builder.source_tokenizer.encode(tmp_text).ids 
print(tmp_text_encoded)
print("decoded text:")
print(hf_builder.source_tokenizer.decode(tmp_text_encoded))
print("special tokens:")
print(hf_builder.source_tokenizer.encode("<pad>", add_special_tokens=False).ids)
print(hf_builder.source_tokenizer.encode("<unk>", add_special_tokens=False).ids)
print(hf_builder.source_tokenizer.encode("<sos>", add_special_tokens=False).ids)
print(hf_builder.source_tokenizer.encode("<eos>", add_special_tokens=False).ids)

original text:
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
tokenized text:
[2, 21, 86, 223, 32, 88, 22, 97, 7, 16, 116, 7956, 3260, 4, 3]
decoded text:
Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche .
special tokens:
[0]
[1]
[2]
[3]


### 2.1.3 数据加载器

In [19]:
def custom_collate_fn(
    batch,
    src_name,
    trg_name,
    src_tokenizer,
    trg_tokenizer,
    device="cpu"
):
    # 找到批次中最长的序列
    batch_max_length = max(max(len(item[src_name]), len(item[trg_name]))+2 for item in batch)

    # 填充并准备输入和目标
    inputs_lst, targets_lst = [], []

    for item in batch:
        # 使用tokenizer进行tokenize
        item[src_name] = src_tokenizer.encode(item[src_name]).ids
        item[trg_name] = trg_tokenizer.encode(item[trg_name]).ids
        # 填充到最大长度
        item[src_name] = item[src_name] + src_tokenizer.encode("<pad>", add_special_tokens=False).ids * (batch_max_length - len(item[src_name]))
        item[trg_name] = item[trg_name] + trg_tokenizer.encode("<pad>", add_special_tokens=False).ids * (batch_max_length - len(item[trg_name]))

        inputs_lst.append(torch.tensor(item[src_name]))
        targets_lst.append(torch.tensor(item[trg_name]))

    # 将输入和目标的列表转换为张量，并转移到目标设备
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return {
        src_name: inputs_tensor,
        trg_name: targets_tensor
    }

class HuggingFaceMulti30k:
    def __init__(self, dataset, batch_size=32, tokenizer=None, src_name="de", trg_name="en"):
        self.batch_size = batch_size
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.src_name = src_name
        self.trg_name = trg_name
        
    def get_dataloaders(self):
        my_collate_fn = lambda x: custom_collate_fn(x, self.src_name, self.trg_name, self.tokenizer.source_tokenizer, self.tokenizer.target_tokenizer)
        train_loader = DataLoader(
            self.dataset['train'], 
            batch_size=self.batch_size, 
            shuffle=True,
            collate_fn=my_collate_fn
        )
        
        val_loader = DataLoader(
            self.dataset['validation'], 
            batch_size=self.batch_size,
            collate_fn=my_collate_fn
        )
        
        test_loader = DataLoader(
            self.dataset['test'], 
            batch_size=self.batch_size,
            collate_fn=my_collate_fn
        )
        
        return train_loader, val_loader, test_loader

In [20]:
dataset_loader = HuggingFaceMulti30k(dataset, batch_size=128, tokenizer=hf_builder, src_name="de", trg_name="en")
train_loader, valid_loader, test_loader = dataset_loader.get_dataloaders()

for batch in train_loader:
    test_batch_sample = batch["de"][0]
    print(test_batch_sample)
    print(hf_builder.source_tokenizer.decode([test_batch_sample[0].numpy()]))
    break

tensor([  2,  14,  17,   7,  13,  48,  94,  31, 144,  52,   6,  90,   4,   3,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])



# 三、模型训练

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model parameter setting
batch_size = 128
max_len = 256
d_model = 512
n_layers = 6
n_heads = 8
ffn_hidden = 2048
drop_prob = 0.1

# optimizer parameter setting
init_lr = 1e-5
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 10
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

In [22]:
import math
import time

from torch import nn, optim
from torch.optim import Adam

from bleu import get_bleu

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.kaiming_uniform(m.weight.data)


model = tr.Transformer(src_pad_idx=src_pad_idx,
                    trg_pad_idx=trg_pad_idx,
                    trg_sos_idx=trg_sos_idx,
                    d_model=d_model,
                    enc_voc_size=enc_voc_size,
                    dec_voc_size=dec_voc_size,
                    max_len=max_len,
                    ffn_hidden=ffn_hidden,
                    n_heads=n_heads,
                    n_layers=n_layers,
                    drop_prob=drop_prob).to(device)

print(f'The model has {count_parameters(model):,} trainable parameters')
model.apply(initialize_weights)

The model has 54,623,291 trainable parameters


  nn.init.kaiming_uniform(m.weight.data)


Transformer(
  (encoder): Encoder(
    (emb): TransformerEmbedding(
      (token_emb): Embedding(8060, 512)
      (pos_emb): PositionalEncoding()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (att): MultiHeadAttention(
          (attention): ScaleDotProductAttention()
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (w_concat): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm()
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): FeedForward(
          (layers): Sequential(
            (0): Linear(in_features=512, out_features=2048, bias=True)
            (1): ReLU()
            (2): Linear(in_features=2048, out_features=512, bias=True)
            (3): Dropout(p=0.1, inplace=False)
          )


In [23]:
optimizer = Adam(params=model.parameters(),
                 lr=init_lr,
                 weight_decay=weight_decay,
                 eps=adam_eps)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 factor=factor,
                                                 patience=patience)

criterion = nn.CrossEntropyLoss(ignore_index=src_pad_idx)

In [24]:
def train(model, iterator, optimizer, criterion, clip, src_name, trg_name):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch[src_name]
        trg = batch[trg_name]

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_reshape = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output_reshape, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        print('step :', round((i / len(iterator)) * 100, 2), '% , loss :', loss.item())

        # for debug
        # break

    return epoch_loss / len(iterator)



In [25]:
def evaluate(model, iterator, criterion, tokenizer, src_name, trg_name):
    model.eval()
    epoch_loss = 0
    batch_bleu = []
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch[src_name]
            trg = batch[trg_name]
            output = model(src, trg[:, :-1])
            # batch_size, trg_len - 1, vocab_size -> batch_size * (trg_len - 1), vocab_size
            output_reshape = output.contiguous().view(-1, output.shape[-1])
            # batch_size, trg_len - 1 -> batch_size * (trg_len - 1)
            trg = trg[:, 1:].contiguous().view(-1)

            loss = criterion(output_reshape, trg)
            epoch_loss += loss.item()

            total_bleu = []
            # print(f"batch {i}")
            # print(f"src shape: {src.shape}")
            # print(f"trg shape: {trg.shape}")
            # print(f"output shape: {output.shape}")
            # print(f"output shape: {output_reshape.shape}")
            for j in range(batch[trg_name].shape[0]):
                # try:
                trg_words = tokenizer.target_tokenizer.decode(batch[trg_name][j].numpy())
                output_words = output[j].max(dim=1)[1]
                output_words = tokenizer.target_tokenizer.decode(output_words.numpy())
                bleu = get_bleu(hypotheses=output_words.split(), reference=trg_words.split())
                total_bleu.append(bleu)
                # except:
                #     pass

            total_bleu = sum(total_bleu) / len(total_bleu)
            batch_bleu.append(total_bleu)

    batch_bleu = sum(batch_bleu) / len(batch_bleu)
    return epoch_loss / len(iterator), batch_bleu

In [26]:
total_epoch = 1
best_loss = float('inf')

train_losses, test_losses, bleus = [], [], []
for step in range(total_epoch):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, clip, src_name="de", trg_name="en")
    valid_loss, bleu = evaluate(model, valid_loader, criterion, hf_builder, src_name="de", trg_name="en")
    end_time = time.time()

    if step > warmup:
        scheduler.step(valid_loss)

    train_losses.append(train_loss)
    test_losses.append(valid_loss)
    bleus.append(bleu)
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # os.makedirs("saved", exist_ok=True)
    # if valid_loss < best_loss:
    #     best_loss = valid_loss
    #     torch.save(model.state_dict(), 'saved/model-{0}.pt'.format(valid_loss))

    os.makedirs("result", exist_ok=True)
    f = open('result/train_loss.txt', 'w')
    f.write(str(train_losses))
    f.close()

    f = open('result/bleu.txt', 'w')
    f.write(str(bleus))
    f.close()

    f = open('result/test_loss.txt', 'w')
    f.write(str(test_losses))
    f.close()

    print(f'Epoch: {step + 1} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\tVal Loss: {valid_loss:.3f} |  Val PPL: {math.exp(valid_loss):7.3f}')
    print(f'\tBLEU Score: {bleu:.3f}')


step : 0.0 % , loss : 9.852019309997559
Epoch: 1 | Time: 0m 38s
	Train Loss: 0.043 | Train PPL:   1.044
	Val Loss: 9.495 |  Val PPL: 13295.891
	BLEU Score: 0.029
step : 0.0 % , loss : 9.52609634399414
Epoch: 2 | Time: 0m 32s
	Train Loss: 0.042 | Train PPL:   1.043
	Val Loss: 9.120 |  Val PPL: 9133.191
	BLEU Score: 0.009
step : 0.0 % , loss : 9.236044883728027
Epoch: 3 | Time: 0m 35s
	Train Loss: 0.041 | Train PPL:   1.042
	Val Loss: 8.796 |  Val PPL: 6607.358
	BLEU Score: 0.000
step : 0.0 % , loss : 9.070938110351562
Epoch: 4 | Time: 0m 32s
	Train Loss: 0.040 | Train PPL:   1.041
	Val Loss: 8.534 |  Val PPL: 5087.274
	BLEU Score: 0.000
step : 0.0 % , loss : 8.722808837890625
Epoch: 5 | Time: 0m 35s
	Train Loss: 0.038 | Train PPL:   1.039
	Val Loss: 8.327 |  Val PPL: 4132.746
	BLEU Score: 0.000
step : 0.0 % , loss : 8.617325782775879
Epoch: 6 | Time: 0m 34s
	Train Loss: 0.038 | Train PPL:   1.039
	Val Loss: 8.164 |  Val PPL: 3510.891
	BLEU Score: 0.000
step : 0.0 % , loss : 8.3988933563