In [1]:
!pip install  spacy  -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
!pip install /home/mw/input/transformer50805080/en_core_web_sm-any-py3-none-any.whl -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
!pip install /home/mw/input/transformer50805080/fr_core_news_sm-any-py3-none-any.whl -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

[33mDEPRECATION: Loading egg at /opt/conda/lib/python3.11/site-packages/papermill-2.3.1-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mLooking in indexes: https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
Collecting spacy
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/0a/0a/bb90e9aa0b3c527876627567d82517aabab08006ccf63796c33b0242254d/spacy-3.8.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.0/33.0 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collec

In [2]:
# --- 外部库导入 ---

from torch.autograd import Variable

import copy
import math
import numpy as np
import os
import pandas as pd
import re
import spacy
import time
import torch
import torch.nn as nn
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim


In [3]:
# === 新增单元格：自定义词汇表 ===

from collections import Counter
from tqdm import tqdm

class Vocabulary:
    """Bidirectional token/index mapping built from a corpus.

    The four special tokens ``<unk>``, ``<pad>``, ``<sos>`` and ``<eos>``
    are registered first and therefore receive indices 0-3, in that order.

    Args:
        tokenizer: Callable that turns a sentence into an iterable of tokens.
        min_freq: Minimum corpus count a token needs to enter the vocabulary.
    """

    def __init__(self, tokenizer, min_freq=1):
        self.tokenizer = tokenizer
        self.min_freq = min_freq
        # Special tokens are seeded before any corpus token.
        self.specials = ['<unk>', '<pad>', '<sos>', '<eos>']
        self.stoi = {}
        self.itos = {}
        for position, special in enumerate(self.specials):
            self.stoi[special] = position
            self.itos[position] = special

    def __len__(self):
        """Return the number of known tokens, special tokens included."""
        return len(self.stoi)

    def build_vocab(self, sentence_list):
        """Count tokens over ``sentence_list`` and register the frequent ones.

        Tokens are assigned indices in first-seen order; tokens that are
        already registered (e.g. the special tokens) are left untouched.
        """
        print("Building vocabulary...")
        frequencies = Counter()
        # tqdm renders a progress bar while the corpus is tokenized.
        for sentence in tqdm(sentence_list):
            frequencies.update(self.tokenizer(sentence))

        # Drop rare tokens; everything else gets the next free index.
        for token, occurrences in frequencies.items():
            if occurrences < self.min_freq or token in self.stoi:
                continue
            next_index = len(self.stoi)
            self.stoi[token] = next_index
            self.itos[next_index] = token
        print("Vocabulary built.")

    def numericalize(self, text):
        """Tokenize ``text`` and map each token to its index.

        Tokens missing from the vocabulary map to the ``<unk>`` index.
        """
        pieces = self.tokenizer(text)
        fallback = self.stoi['<unk>']
        return [self.stoi.get(piece, fallback) for piece in pieces]

In [4]:
# === 内容来自: sublayers.py ===

'''1.1.2 注意力层'''
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        heads: Number of attention heads.
        d_model: Model (embedding) width; must be divisible by ``heads``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``d_model`` is not a positive multiple of ``heads``.
            Without this check the per-head width ``d_model // heads`` is
            silently truncated, and ``forward`` either fails inside ``view``
            or reshapes the projections into a wrong head layout.
    """

    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        if heads <= 0 or d_model % heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by heads ({heads})"
            )
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def attention(self, q, k, v, d_k, mask=None, dropout=None):
        """Compute softmax(q @ k^T / sqrt(d_k)) @ v.

        ``mask`` (if given) gets a head axis inserted at dim 1; positions
        where it equals 0 are filled with -1e9 so they receive ~0 weight
        after the softmax. ``dropout`` (if given) is applied to the weights.
        """
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        # Hide padded / disallowed positions from the softmax.
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        scores = F.softmax(scores, dim=-1)
        if dropout is not None:
            scores = dropout(scores)
        return torch.matmul(scores, v)

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, then merge heads.

        Inputs are reshaped to (batch, seq, heads, d_k), so they are expected
        to have ``d_model`` features in the last dimension. Returns a tensor
        of shape (batch, seq_q, d_model).
        """
        bs = q.size(0)
        # Linear projections, then split the feature axis into h heads.
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
        # Move the head axis in front of the sequence axis.
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)
        attended = self.attention(q, k, v, self.d_k, mask, self.dropout)
        # Concatenate the heads back together and apply the output projection.
        concat = attended.transpose(1, 2).contiguous().view(bs, -1, self.d_model)
        return self.out(concat)
'''1.1.3 前馈层'''
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Hidden width of the inner layer (2048 by default).
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
'''1.1.4 残差连接与层归一化'''
class Norm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Uses ``Tensor.std`` with its default settings and adds ``eps`` to the
    standard deviation (not the variance) before dividing.

    Args:
        d_model: Size of the last (feature) dimension.
        eps: Small constant added to the denominator to avoid division by zero.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # Two learnable parameters: per-feature gain and offset.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` along the last dim."""
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [5]:
# === 内容来自: layers.py ===

'''1.1.5 编码器和解码器结构'''
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then normalised (post-norm).

    Args:
        d_model: Feature width of the layer.
        heads: Number of attention heads.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        # Sub-module registration order is kept as-is: it determines the
        # order of ``parameters()`` and of the ``state_dict`` keys.
        self.norm_1, self.norm_2 = Norm(d_model), Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1, self.dropout_2 = nn.Dropout(dropout), nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply self-attention (restricted by ``mask``) then the feed-forward net."""
        x = self.norm_1(x + self.dropout_1(self.attn(x, x, x, mask)))
        x = self.norm_2(x + self.dropout_2(self.ff(x)))
        return x
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then normalised (post-norm).

    Args:
        d_model: Feature width of the layer.
        heads: Number of attention heads.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        # Sub-module registration order is kept as-is: it determines the
        # order of ``parameters()`` and of the ``state_dict`` keys.
        self.norm_1, self.norm_2, self.norm_3 = Norm(d_model), Norm(d_model), Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Run the three sub-layers in sequence.

        Args:
            x: Decoder-side input to this layer.
            e_outputs: Encoder output, used as keys and values of the
                second attention.
            src_mask: Mask passed to the cross-attention.
            trg_mask: Mask passed to the self-attention.
        """
        # Self-attention over the target sequence.
        x = self.norm_1(x + self.dropout_1(self.attn_1(x, x, x, trg_mask)))
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        x = self.norm_2(x + self.dropout_2(self.attn_2(x, e_outputs, e_outputs, src_mask)))
        # Position-wise feed-forward.
        x = self.norm_3(x + self.dropout_3(self.ff(x)))
        return x

In [6]:
# === 内容来自: embed.py ===

class Embedder(nn.Module):
    """Thin wrapper around ``nn.Embedding`` mapping token ids to vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embedding vector of every index in ``x``."""
        embedded = self.embed(x)
        return embedded
'''1.1.1 嵌入表示层'''
class PositionalEncoder(nn.Module):
    """Adds fixed sinusoidal position encodings to scaled token embeddings.

    A table of shape (1, max_seq_len, d_model) is precomputed and stored as
    the buffer ``pe``. Inputs longer than ``max_seq_len`` no longer fail with
    a broadcasting error: their encodings are computed on the fly for that
    call, and the stored buffer (and thus the ``state_dict``) is unchanged.

    Args:
        d_model: Embedding width.
        max_seq_len: Number of positions precomputed at construction time.
        dropout: Dropout probability applied to the final sum.
    """

    def __init__(self, d_model, max_seq_len=80, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Constant (non-trainable) table indexed by position and feature.
        self.register_buffer('pe', self._build_table(max_seq_len, d_model))

    @staticmethod
    def _build_table(length, d_model):
        """Return the (1, length, d_model) sinusoid table.

        Even feature ``i`` holds ``sin(pos / 10000 ** (i / d_model))`` and the
        following odd feature holds the matching cosine. Computation is done
        in float64 and then cast to the default dtype.
        """
        position = torch.arange(length, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        angles = position / (10000.0 ** (even_dims / d_model))
        table = torch.zeros(length, d_model, dtype=torch.float64)
        table[:, 0::2] = torch.sin(angles)
        # For odd d_model the last even feature has no cosine partner.
        table[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return table.to(torch.get_default_dtype()).unsqueeze(0)

    def forward(self, x):
        """Scale ``x`` by sqrt(d_model), add position encodings, apply dropout.

        ``x`` is indexed as (batch, seq_len, d_model): the sequence length is
        read from dimension 1.
        """
        # Make the embeddings relatively larger than the position signal.
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        if seq_len <= self.pe.size(1):
            pe = self.pe[:, :seq_len, :]
        else:
            # Longer than the precomputed table: build the encodings for this
            # call only, on the same device/dtype as the stored buffer.
            pe = self._build_table(seq_len, self.d_model).to(
                device=x.device, dtype=self.pe.dtype
            )
        return self.dropout(x + pe)

In [7]:
# === 内容来自: tokenizer.py ===

class tokenize(object):
    """Sentence cleaner and tokenizer backed by a spaCy pipeline.

    Args:
        lang: Name of the spaCy model passed to ``spacy.load``.
    """

    def __init__(self, lang):
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        """Clean ``sentence``, lowercase it and return its token strings.

        The substitutions run in order: listed punctuation becomes a space,
        runs of spaces collapse to one, and repeated ``!``, ``,`` and ``?``
        collapse to a single character. Tokens equal to a single space are
        dropped from the result.
        """
        substitutions = (
            (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
            (r"[ ]+", " "),
            (r"\!+", "!"),
            (r"\,+", ","),
            (r"\?+", "?"),
        )
        cleaned = str(sentence)
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)

        pieces = []
        for tok in self.nlp.tokenizer(cleaned.lower()):
            if tok.text != " ":
                pieces.append(tok.text)
        return pieces

In [None]:
# === 内容来自: batch.py (已修正设备不匹配问题) ===

def nopeak_mask(size):
    """Return a (1, size, size) boolean look-ahead mask.

    Entry ``[0, i, j]`` is True when ``j <= i`` (position ``i`` may attend to
    position ``j``) and False above the diagonal.
    """
    positions = torch.arange(size)
    # Column index <= row index  <=>  on or below the main diagonal.
    allowed = positions.unsqueeze(0) <= positions.unsqueeze(1)
    return allowed.unsqueeze(0)

def create_masks(src, trg, src_pad, trg_pad):
    """Build the attention masks for a source batch and optional target batch.

    Args:
        src: Source token-id tensor.
        trg: Target token-id tensor, or None when only the source mask is needed.
        src_pad: Padding index used in ``src``.
        trg_pad: Padding index used in ``trg``.

    Returns:
        ``(src_mask, trg_mask)``. ``src_mask`` marks non-padding source
        positions, with an extra axis inserted at dim -2. ``trg_mask``
        combines the target padding mask with the look-ahead mask from
        ``nopeak_mask``; it is None when ``trg`` is None.
    """
    src_mask = (src != src_pad).unsqueeze(-2)
    if trg is None:
        return src_mask, None

    padding_mask = (trg != trg_pad).unsqueeze(-2)
    # nopeak_mask is created on the CPU; move it to trg's device before the
    # element-wise AND (a no-op when it is already there).
    look_ahead = nopeak_mask(trg.size(1)).to(trg.device)
    return src_mask, padding_mask & look_ahead

In [9]:
# === 内容来自: models.py ===

def get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones
'''1.1.5 编码器和解码器结构'''
class Encoder(nn.Module):
    """Transformer encoder: embedding, positional encoding, N layers, final norm.

    Args:
        vocab_size: Size of the source vocabulary.
        d_model: Model width.
        N: Number of stacked ``EncoderLayer`` copies.
        heads: Attention heads per layer.
        dropout: Dropout probability.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        # N deep copies of one freshly built layer.
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Encode source token ids ``src`` using attention mask ``mask``."""
        x = self.pe(self.embed(src))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
class Decoder(nn.Module):
    """Transformer decoder: embedding, positional encoding, N layers, final norm.

    Args:
        vocab_size: Size of the target vocabulary.
        d_model: Model width.
        N: Number of stacked ``DecoderLayer`` copies.
        heads: Attention heads per layer.
        dropout: Dropout probability.
    """

    def __init__(self, vocab_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        # N deep copies of one freshly built layer.
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Decode target token ids ``trg`` against the encoder output ``e_outputs``."""
        x = self.pe(self.embed(trg))
        for layer in self.layers:
            x = layer(x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a linear projection onto the target vocabulary.

    Args:
        src_vocab: Source vocabulary size.
        trg_vocab: Target vocabulary size (also the width of the output logits).
        d_model: Model width.
        N: Number of layers in both encoder and decoder.
        heads: Attention heads per layer.
        dropout: Dropout probability.
    """

    def __init__(self, src_vocab, trg_vocab, d_model, N, heads, dropout):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads, dropout)
        self.decoder = Decoder(trg_vocab, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return unnormalised target-vocabulary logits for each target position."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)

In [10]:
# === 内容来自: process.py (已重写，不依赖torchtext) ===
import torch
import random

def read_data(src_file, trg_file):
    """Read two UTF-8 text files and return their lines.

    Each file is read whole, stripped of leading/trailing whitespace and
    split on ``'\\n'``.

    Args:
        src_file: Path of the source-language file.
        trg_file: Path of the target-language file.

    Returns:
        ``(src_data, trg_data)``: two lists of lines.

    Raises:
        FileNotFoundError: If either file is missing. The error is printed
            and then re-raised; previously ``quit()`` was called, which is an
            interactive-only helper and exits with a success status.
    """
    def _read_lines(path):
        # The context manager closes the handle even if reading fails.
        with open(path, encoding='utf-8') as handle:
            return handle.read().strip().split('\n')

    try:
        src_data = _read_lines(src_file)
        trg_data = _read_lines(trg_file)
    except FileNotFoundError as e:
        print(f"Error: {e}. File not found.")
        raise
    return src_data, trg_data

def prepare_data(src_file, trg_file, src_lang, trg_lang, max_strlen=80):
    """Load a parallel corpus, build both vocabularies and numericalize it.

    Args:
        src_file: Path of the source-language text file.
        trg_file: Path of the target-language text file.
        src_lang: spaCy model name for the source tokenizer.
        trg_lang: spaCy model name for the target tokenizer.
        max_strlen: Sentence pairs are kept only if both sides have fewer
            than this many whitespace-separated words.

    Returns:
        ``(src_numerical, trg_numerical, SRC_VOCAB, TRG_VOCAB)``: the two
        lists of index sequences and the two ``Vocabulary`` objects.
    """
    def _short_enough(sentence):
        # Length is measured in whitespace-separated words, not model tokens.
        return len(sentence.split()) < max_strlen

    # 1. Read the raw lines.
    src_data, trg_data = read_data(src_file, trg_file)

    # 2. Drop pairs where either side is too long.
    print(f"Original data size: {len(src_data)}")
    filtered_data = [
        pair for pair in zip(src_data, trg_data)
        if _short_enough(pair[0]) and _short_enough(pair[1])
    ]
    src_data, trg_data = zip(*filtered_data)
    print(f"Filtered data size: {len(src_data)}")

    # 3. One tokenizer and one vocabulary per language.
    src_tokenizer = tokenize(src_lang)
    trg_tokenizer = tokenize(trg_lang)
    SRC_VOCAB = Vocabulary(src_tokenizer.tokenizer)
    TRG_VOCAB = Vocabulary(trg_tokenizer.tokenizer)
    for vocab, corpus in ((SRC_VOCAB, src_data), (TRG_VOCAB, trg_data)):
        vocab.build_vocab(corpus)

    # 4. Convert every sentence to a list of indices.
    print("Numericalizing data...")
    src_numerical = [SRC_VOCAB.numericalize(sentence) for sentence in tqdm(src_data)]
    trg_numerical = [TRG_VOCAB.numericalize(sentence) for sentence in tqdm(trg_data)]

    return src_numerical, trg_numerical, SRC_VOCAB, TRG_VOCAB

def data_generator(src_numerical, trg_numerical, batch_size, src_pad_idx, trg_pad_idx,
                   sos_idx=None, eos_idx=None):
    """Yield shuffled, padded ``(src_tensor, trg_tensor)`` mini-batches.

    Pairs are shuffled once, cut into ``len(data) // batch_size`` full
    batches (a trailing partial batch is dropped), and each batch is sorted
    by source length in descending order. Every target sequence is wrapped
    in ``<sos>`` ... ``<eos>`` and both sides are right-padded to the longest
    sequence in the batch.

    Args:
        src_numerical: List of source index sequences.
        trg_numerical: List of target index sequences (without sos/eos).
        batch_size: Number of sentence pairs per batch.
        src_pad_idx: Padding index for the source side.
        trg_pad_idx: Padding index for the target side.
        sos_idx: Index of the start-of-sentence token. When None it is read
            from the module-level ``TRG_VOCAB`` (the original behaviour).
        eos_idx: Index of the end-of-sentence token. When None it is read
            from the module-level ``TRG_VOCAB`` (the original behaviour).

    Yields:
        Two ``torch.long`` tensors of shape (batch_size, max_len_in_batch).
    """
    # Pair up source and target, then shuffle the pairs together.
    data_pairs = list(zip(src_numerical, trg_numerical))
    random.shuffle(data_pairs)

    num_batches = len(data_pairs) // batch_size
    if num_batches <= 0:
        return

    # Resolve the special-token indices once instead of once per batch.
    # The global lookup is kept only as a fallback for existing callers.
    if sos_idx is None:
        sos_idx = TRG_VOCAB.stoi['<sos>']
    if eos_idx is None:
        eos_idx = TRG_VOCAB.stoi['<eos>']

    for i in range(num_batches):
        batch_pairs = sorted(
            data_pairs[i * batch_size:(i + 1) * batch_size],
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        src_list, trg_list_raw = zip(*batch_pairs)

        # Wrap each target sequence in <sos> ... <eos>.
        trg_list = [[sos_idx] + list(trg) + [eos_idx] for trg in trg_list_raw]

        # Manual right-padding to the longest sequence in this batch.
        max_src_len = max(len(s) for s in src_list)
        max_trg_len = max(len(t) for t in trg_list)
        src_padded = [list(s) + [src_pad_idx] * (max_src_len - len(s)) for s in src_list]
        trg_padded = [t + [trg_pad_idx] * (max_trg_len - len(t)) for t in trg_list]

        yield (
            torch.tensor(src_padded, dtype=torch.long),
            torch.tensor(trg_padded, dtype=torch.long),
        )

# 这个函数现在只用于翻译阶段
def tokenize_en_sentence(src_sentence, SRC_VOCAB):
    """Convert a raw source sentence into a list of vocabulary indices.

    When ``SRC_VOCAB`` provides ``numericalize`` (as ``Vocabulary`` does), the
    sentence is tokenized with the vocabulary's own tokenizer. This avoids
    reloading the spaCy model on every call and keeps inference tokenization
    tied to the tokenizer stored on the vocabulary. Objects that only expose
    ``stoi`` fall back to the original path, which loads
    ``en_core_web_sm`` for the call.

    Args:
        src_sentence: Sentence to encode.
        SRC_VOCAB: Source vocabulary.

    Returns:
        List of integer indices; unknown tokens map to the ``<unk>`` index.
    """
    numericalize = getattr(SRC_VOCAB, 'numericalize', None)
    if callable(numericalize):
        return numericalize(src_sentence)

    # Legacy fallback: build a fresh English tokenizer for this call.
    spacy_en = tokenize('en_core_web_sm')
    unk_idx = SRC_VOCAB.stoi['<unk>']
    return [SRC_VOCAB.stoi.get(tok, unk_idx) for tok in spacy_en.tokenizer(src_sentence)]

In [None]:
# === 内容来自: main.py (已重写，并支持GPU和更多测试语句) ===

# 1. Select the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"正在使用的计算设备 (Using device): {device}")

# 2. Data locations and model hyper-parameters.
# NOTE(review): the two corpus paths are absolute paths on the hosting
# platform; adjust them when running elsewhere.
src_file = '/home/mw/input/transformer50805080/english.txt'
trg_file = '/home/mw/input/transformer50805080/french.txt'
src_lang = 'en_core_web_sm'   # spaCy model used to tokenize the source side
trg_lang = 'fr_core_news_sm'  # spaCy model used to tokenize the target side
max_strlen = 80   # sentence-length cut-off passed to prepare_data
batch_size = 32
epochs = 2
d_model = 512     # model / embedding width
heads = 8         # attention heads per layer
N = 6             # number of encoder layers and of decoder layers
dropout = 0.1

# 3. Load the corpus, build both vocabularies and numericalize the sentences.
src_numerical, trg_numerical, SRC_VOCAB, TRG_VOCAB = prepare_data(
    src_file, trg_file, src_lang, trg_lang, max_strlen
)
src_pad_idx = SRC_VOCAB.stoi['<pad>']
trg_pad_idx = TRG_VOCAB.stoi['<pad>']
src_vocab_size = len(SRC_VOCAB)
trg_vocab_size = len(TRG_VOCAB)

# 4. Build the model, move it to the device and initialise the weights.
model = Transformer(src_vocab_size, trg_vocab_size, d_model, N, heads, dropout)
model.to(device)  # move all parameters and buffers to the selected device
# Xavier-uniform initialisation for every parameter with more than one dimension.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)
# NOTE(review): this rebinds the name `optim`, which the import cell bound to
# the `torch.optim` module; from here on `optim` is the Adam optimizer instance.
optim = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

# 5. 模型训练
def train_model(epochs):
    """Train the module-level ``model`` for ``epochs`` passes over the data.

    Relies on module-level names: ``model``, ``optim``, ``device``,
    ``src_numerical``, ``trg_numerical``, ``batch_size``, ``src_pad_idx`` and
    ``trg_pad_idx``. A running average loss is shown on the progress bar and
    a summary line is printed after every epoch.
    """
    model.train()
    start = time.time()

    for epoch in range(epochs):
        total_loss = 0
        num_batches = len(src_numerical) // batch_size

        progress_bar = tqdm(
            data_generator(src_numerical, trg_numerical, batch_size, src_pad_idx, trg_pad_idx),
            total=num_batches,
            desc=f"Epoch {epoch+1}/{epochs}",
        )

        for step, (src_batch, trg_batch) in enumerate(progress_bar, start=1):
            src_batch = src_batch.to(device)
            trg_batch = trg_batch.to(device)

            # Teacher forcing: the decoder sees tokens [0, T-1) and is
            # trained to predict tokens [1, T).
            decoder_input = trg_batch[:, :-1]
            targets = trg_batch[:, 1:].contiguous().view(-1)

            src_mask, trg_mask = create_masks(src_batch, decoder_input, src_pad_idx, trg_pad_idx)
            preds = model(src_batch, decoder_input, src_mask, trg_mask)

            optim.zero_grad()
            # Padding positions in the targets do not contribute to the loss.
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), targets, ignore_index=trg_pad_idx)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            current_avg_loss = total_loss / step
            progress_bar.set_postfix({"avg_loss": f"{current_avg_loss:.3f}"})

        epoch_time = time.time() - start
        final_epoch_loss = total_loss / num_batches
        print(f"Epoch {epoch+1} finished. Total time: {epoch_time//60:.0f}m {epoch_time%60:.0f}s, Final Avg Loss: {final_epoch_loss:.3f}")
        # Restart the timer so the next epoch is timed on its own.
        start = time.time()

# 6. 模型测试
def translate(src_sentence, max_len=80):
    """Greedy-decode a translation of ``src_sentence`` with the global ``model``.

    The function is safe to call on its own: it runs under
    ``torch.no_grad()`` and switches the model to eval mode for the duration
    of the call, restoring the previous train/eval mode afterwards. Before,
    a call outside the driver loop decoded with dropout active.

    Relies on module-level names: ``model``, ``device``, ``SRC_VOCAB``,
    ``TRG_VOCAB`` and ``src_pad_idx``.

    Args:
        src_sentence: Source sentence to translate.
        max_len: Maximum number of output positions, the leading ``<sos>``
            included.

    Returns:
        The decoded tokens joined by single spaces. As before, the string
        starts with ``<sos>`` and ends with ``<eos>`` when one was generated.
    """
    sos_idx = TRG_VOCAB.stoi['<sos>']
    eos_idx = TRG_VOCAB.stoi['<eos>']

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            src = torch.LongTensor(tokenize_en_sentence(src_sentence, SRC_VOCAB)).unsqueeze(0).to(device)
            src_mask = (src != src_pad_idx).unsqueeze(-2)
            e_outputs = model.encoder(src, src_mask)

            outputs = torch.zeros(max_len, dtype=torch.long, device=device)
            outputs[0] = sos_idx

            # Index of the last filled position. Tracked separately so the
            # final slice is valid even when the loop body never runs
            # (max_len == 1), where the loop variable used to be unbound.
            last = 0
            for i in range(1, max_len):
                # Look-ahead mask over the i tokens decoded so far.
                trg_mask = nopeak_mask(i).to(device)
                out = model.out(model.decoder(outputs[:i].unsqueeze(0), e_outputs, src_mask, trg_mask))
                # Greedy choice: softmax is monotonic, so the argmax of the
                # logits equals the argmax of the probabilities.
                next_idx = out[0, -1].argmax().item()
                outputs[i] = next_idx
                last = i
                if next_idx == eos_idx:
                    break
    finally:
        # Restore whatever mode the caller had the model in.
        model.train(was_training)

    return " ".join(TRG_VOCAB.itos[ix] for ix in outputs[:last + 1].tolist())

    # 运行
# Entry point: train the model, then translate a handful of sample sentences.
if __name__ == "__main__":
    # 1. Train the model.
    train_model(epochs)

    # 2. English sentences used for a quick qualitative check.
    test_sentences = [
        'Let me see.',
        'Hello, how are you?',
        'This is a test.',
        'I love to learn new things.',
        'What is your name?',
        'The weather is nice today.',
        'Where is the library?'
    ]

    print("\n" + "="*20 + " 开始翻译测试 " + "="*20)

    # 3. Translate every sentence and print source and translation.
    model.eval()  # put the model in evaluation mode
    with torch.no_grad():  # no gradient tracking while evaluating
        for sentence in test_sentences:
            print(f"\n原始 (en): '{sentence}'")
            translation = translate(sentence)
            print(f"翻译 (fr): {translation}")
            print("-" * 50)

正在使用的计算设备 (Using device): cuda
Original data size: 154883
Filtered data size: 154883
Building vocabulary...


100%|██████████| 154883/154883 [00:02<00:00, 62002.77it/s]


Vocabulary built.
Building vocabulary...


100%|██████████| 154883/154883 [00:03<00:00, 40486.60it/s]


Vocabulary built.
Numericalizing data...


100%|██████████| 154883/154883 [00:02<00:00, 74318.76it/s]
100%|██████████| 154883/154883 [00:02<00:00, 59345.05it/s]
Epoch 1/2: 100%|██████████| 4840/4840 [03:21<00:00, 23.97it/s, avg_loss=4.173]


Epoch 1 finished. Total time: 3m 22s, Final Avg Loss: 4.173


Epoch 2/2: 100%|██████████| 4840/4840 [03:20<00:00, 24.19it/s, avg_loss=3.143]


Epoch 2 finished. Total time: 3m 20s, Final Avg Loss: 3.143


原始 (en): 'Let me see.'
翻译 (fr): <sos> laissez moi le faire . <eos>
--------------------------------------------------

原始 (en): 'Hello, how are you?'
翻译 (fr): <sos> qu' est ce que tu es en train de faire ? <eos>
--------------------------------------------------

原始 (en): 'This is a test.'
翻译 (fr): <sos> c' est un bon problème . <eos>
--------------------------------------------------

原始 (en): 'I love to learn new things.'
翻译 (fr): <sos> j' aime les choses . <eos>
--------------------------------------------------

原始 (en): 'What is your name?'
翻译 (fr): <sos> quel est votre nom ? <eos>
--------------------------------------------------

原始 (en): 'The weather is nice today.'
翻译 (fr): <sos> le temps est un problème . <eos>
--------------------------------------------------

原始 (en): 'Where is the library?'
翻译 (fr): <sos> où est la maison ? <eos>
--------------------------------------------------


## 实验总结  

### 1. 实验背景与目标  

本实验复现书中基于论文 “Attention Is All You Need” 的 Transformer 模型，用于英法机器翻译。原始代码基于较早版本的 PyTorch 和 `torchtext`，在当前主流的深度学习环境（PyTorch 2.0+）中已无法直接运行。我尝试在 heywhale 平台上使用免费算力运行，但该平台不允许自行配置镜像，只能使用官方镜像，而官方镜像并未完全适配 requirements 中列出的包：即使选用了 Python 3.8.5、torch 1.6 的镜像，仍然会出现环境配置问题。因此，我放弃了对原始环境配置的完全复现。  

**核心目标：** 在不依赖 `torchtext` 旧版 API 的前提下，重构数据处理和模型训练流程，使其能够在现代 PyTorch 框架下，并利用 GPU 进行高效训练。  

### 2. 核心挑战与解决过程  

在将旧代码向新框架迁移的过程中，我遇到并解决了一系列典型问题：  

1.  **`torchtext` API 废弃问题**：  
    * **挑战**：旧代码的核心依赖 `torchtext.data` 在新版本中已被移除，导致程序在导入库时就出现 `OSError` 链接错误。  
    * **解决方案**：我放弃了使用兼容层 `torchtext.legacy` 的方案，选择彻底解耦对 `torchtext` 的依赖。通过手动实现了三个核心组件，增强了代码的自主性和可移植性：  
        * **自定义词汇表 (`Vocabulary` Class)**：手动管理从词元到索引的映射，包括特殊标记（`<unk>`, `<pad>` 等）。  
        * **自定义数据处理流**：重写了 `prepare_data` 函数，负责读取源文件、分词、过滤和数值化。  
        * **自定义批次生成器 (`data_generator`)**：实现了动态填充（Padding）和批次迭代功能，替代了原有的 `Iterator`。  

2.  **环境依赖与底层库冲突**：  
    * **挑战**：在解决了代码层面的问题后，程序在数据预处理阶段出现内核无故重启。但服务器拥有 80GB 内存和 48GB 显存，排除了内存不足的可能。  
    * **解决方案**：通过分析，我将问题定位在 PyTorch 和 `spaCy` 两个库对 GPU 资源的底层冲突上。最终通过**重新安装纯 CPU 版本的 `spaCy`**，强制其在 CPU 上执行轻量级的分词任务，将 GPU 资源完全留给 PyTorch 进行模型计算，从而根除了冲突，解决了内核崩溃问题。  

3.  **GPU 适配与设备不匹配**：  
    * **挑战**：在启用 GPU 训练后，出现了经典的 `RuntimeError: Expected all tensors to be on the same device...` 错误。  
    * **解决方案**：我系统性地检查了代码，确保了：  
        * 模型参数通过 `model.to(device)` 被正确地移动到 GPU。  
        * 在训练和推理的每一步，所有输入张量（包括数据和掩码）都通过 `.to(device)` 被显式地迁移到与模型相同的设备上。  

### 3. 实验结果分析  

在模型成功运行后，我对少量测试语句进行了翻译评估。结果表明，在短短 2 个周期的训练后，模型表现符合预期，具体如下：  

* **已学到的能力**：  
    * **固定短语**：对于训练数据中频繁出现的固定短语（如 "What is your name?" -> "quel est votre nom ?"），模型能给出完全准确的翻译。  
    * **基本句式**：模型掌握了基础的句子结构，如 `Where is the...` -> `où est la...`。  

* **存在的不足**：  
    * **词汇掌握有限**：模型经常用一个学过的、但语义不符的词替换生僻词（如 `library` -> `maison`）。  
    * **语义理解偏差**：对于需要联系上下文的句子，模型理解能力不足，导致翻译结果与原意大相径庭（如 `Hello, how are you?` -> `qu' est ce que tu es en train de faire ?`，意为 “What are you doing?”）。  
    * **信息丢失**：在处理稍长的句子时，会忽略形容词、动词等关键信息，造成翻译的过度简化（如 `I love to learn new things.` -> `j' aime les choses .`，意为 “I like things”）。  

### 4. 结论与展望  

本次实验成功达成了核心目标：**我完整地复现了 Transformer 的代码，并将其从对旧版 `torchtext` 的依赖中解放出来，使其成为了一个能在现代 PyTorch 环境中稳定运行、并充分利用 GPU 加速的独立项目。**  

整个调试过程不仅解决了代码层面的兼容性问题，还深入到了底层库的资源冲突层面，为未来处理类似的环境问题积累了宝贵的经验。  

**未来可行的优化方向包括：**  
1.  **进行充分训练**：将训练周期从 2 个大幅增加到 20 个或更多，以充分挖掘模型的潜力。  
2.  **采用更优的解码策略**：将当前的贪心搜索（Greedy Search）替换为集束搜索（Beam Search），以生成更流畅、更全局最优的翻译结果。  
3.  **扩大训练数据集**：使用更大规模、更高质量的平行语料库进行训练。  
4.  **模型与超参数调优**：尝试调整模型的层数、头数、隐藏层维度以及优化器的学习率等。