In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator
from torchtext.legacy.datasets import Multi30k
import spacy
import random
import math
import time

# 导入你定义的Transformer类所在的文件
# 假设你的transformer类定义在transformer.py文件中
from transformer import Transformer

# 数据处理
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

def tokenize_de(text):
    """
    对德语文本进行分词
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    对英语文本进行分词
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

# 定义源语言和目标语言的Field
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

# 加载Multi30k数据集
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

# 构建词汇表
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# 定义数据迭代器
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

# 模型定义
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

model = Transformer(INPUT_DIM, OUTPUT_DIM, d_model=HID_DIM, num_layers=ENC_LAYERS, num_heads=ENC_HEADS, d_ff=ENC_PF_DIM, dropout=ENC_DROPOUT).to(device)

# 初始化模型参数
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

# 定义损失函数和优化器
PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters())

# 训练模型
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg[:,:-1])
        
        # output = [batch size, trg len - 1, output dim]
        # trg = [batch size, trg len]
        
        output_dim = output.shape[-1]
        
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:,1:].contiguous().view(-1)
        
        # output = [batch size * trg len - 1, output dim]
        # trg = [batch size * trg len - 1]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# 评估模型
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg[:,:-1])
            
            # output = [batch size, trg len - 1, output dim]
            # trg = [batch size, trg len]
            
            output_dim = output.shape[-1]
            
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:,1:].contiguous().view(-1)
            
            # output = [batch size * trg len - 1, output dim]
            # trg = [batch size * trg len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# 训练过程
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# 测试模型
model.load_state_dict(torch.load('transformer-model.pt'))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

OSError: /home/zhouke/anaconda3/envs/pytorch/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [1]:
import torch
import torchtext
print(torch.__version__)
print(torchtext.__version__)

OSError: /home/zhouke/anaconda3/envs/pytorch/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [2]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import Multi30k

# 定义分词器
tokenizer_src = get_tokenizer('spacy', language='de_core_news_sm')
tokenizer_trg = get_tokenizer('spacy', language='en_core_web_sm')

# 定义特殊符号
SPECIALS = ['<unk>', '<pad>', '<sos>', '<eos>']

# 加载数据集
train_iter, valid_iter, test_iter = Multi30k()

# 构建词汇表
def yield_tokens(data_iter, tokenizer, index):
    for src, trg in data_iter:
        if index == 0:
            yield tokenizer(src)
        else:
            yield tokenizer(trg)

src_vocab = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer_src, 0), specials=SPECIALS)
trg_vocab = build_vocab_from_iterator(yield_tokens(train_iter, tokenizer_trg, 1), specials=SPECIALS)

# 定义数据处理函数
def data_process(data_iter, src_vocab, trg_vocab, tokenizer_src, tokenizer_trg):
    data = []
    for src, trg in data_iter:
        src_tensor = torch.tensor([src_vocab[token] for token in tokenizer_src(src)], dtype=torch.long)
        trg_tensor = torch.tensor([trg_vocab[token] for token in tokenizer_trg(trg)], dtype=torch.long)
        data.append((src_tensor, trg_tensor))
    return data

train_data = data_process(train_iter, src_vocab, trg_vocab, tokenizer_src, tokenizer_trg)
valid_data = data_process(valid_iter, src_vocab, trg_vocab, tokenizer_src, tokenizer_trg)
test_data = data_process(test_iter, src_vocab, trg_vocab, tokenizer_src, tokenizer_trg)

# 定义数据加载器
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def generate_batch(data_batch):
    src_batch, trg_batch = [], []
    for src, trg in data_batch:
        src_batch.append(src)
        trg_batch.append(trg)
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=src_vocab['<pad>']).to(device)
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=trg_vocab['<pad>']).to(device)
    return src_batch, trg_batch

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

OSError: /home/zhouke/anaconda3/envs/pytorch/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs