In [1]:
# 导入相关模块
import time
import string
import jieba
import numpy as np

import torch
import torch.utils.data as DataSet
import torch.nn as nn
import torch.optim
from torch.autograd import Variable

In [2]:
### Read the parallel corpus
def read_corpus(path):
    """Load a tab-separated English/Chinese parallel corpus.

    Every non-blank line must contain at least two tab-separated fields: the
    English sentence followed by its Chinese translation. Additional fields
    on the line are ignored, and blank lines are skipped.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A tuple ``(English, Chinese)`` of equally long lists. Each English
        entry is the sentence without its final character, split on single
        spaces. Each Chinese entry is the jieba segmentation of the sentence,
        prefixed with the start marker ``'B'``.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    English = []
    Chinese = []

    # The context manager closes the file even when parsing fails midway.
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no pair.
                continue

            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"line {line_no}: expected 'english<TAB>chinese', got {line!r}"
                )
            eng, chs = fields[0], fields[1]

            # Drop the last character of the English sentence (presumably its
            # closing punctuation mark) before splitting into words.
            eng = eng[:-1]
            English.append(eng.split(' '))

            # Segment the Chinese sentence and prepend the start marker.
            Chinese.append(['B'] + jieba.lcut(chs))
    return English, Chinese

# Load the corpus, then show one aligned sentence pair as a sanity check
English, Chinese = read_corpus('./cmn.txt')
print(English[20000], Chinese[20000], sep='\n')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/home/agent_mxz/miniconda3/envs/cops3/lib/python3.10/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmp1g3960ad' -> '/tmp/jieba.cache'
Loading model cost 0.469 seconds.
Prefix dict has been built successfully.


['If', 'I', 'were', 'you,', "I'd", 'want', 'to', 'know', 'what', 'Tom', 'is', 'doing', 'right', 'now']
['B', '如果', '我', '是', '你', '，', '我', '不会', '想', '去', '知道', 'Tom', '现在', '正在', '做', '什么', '。']


In [3]:
### Build a token -> index vocabulary
def lang_encode(language):
    """Assign a unique positive integer to every distinct token.

    Tokens are numbered in order of first appearance, starting at 1. Index 0
    is never assigned, which leaves it free for use as a padding value.

    Args:
        language: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        dict mapping each token to its index.
    """
    lang2idx = {}
    for sentence in language:
        for token in sentence:
            if token not in lang2idx:
                # The next free index is one past the current vocabulary size.
                lang2idx[token] = len(lang2idx) + 1
    return lang2idx

# Build both vocabularies. The extra slot in each size accounts for index 0,
# which lang_encode never hands out.
eng2idx = lang_encode(English)
chs2idx = lang_encode(Chinese)
eng_vocab_size = 1 + len(eng2idx)
chs_vocab_size = 1 + len(chs2idx)

print('中文字典大小', chs_vocab_size)
print('英文字典大小', eng_vocab_size)

中文字典大小 13683
英文字典大小 7814


In [4]:
### Encode tokenised sentences as index sequences
def text_encode(lang2idx, language):
    """Translate every token of every sentence through ``lang2idx``.

    Args:
        lang2idx: Mapping from token to integer index.
        language: Sequence of sentences, each a sequence of tokens.

    Returns:
        A list with one list of indices per input sentence, in input order.

    Raises:
        KeyError: If a token is missing from ``lang2idx``.
    """
    return [[lang2idx[token] for token in sentence] for sentence in language]

# Encode both sides of the corpus, then show one sentence before and after encoding
chs_digit = text_encode(chs2idx, Chinese)
eng_digit = text_encode(eng2idx, English)
print("原始中文: ", Chinese[20000], "\n 中文编码后的结果: ", chs_digit[20000], sep='\n')
print("原始英文: ", English[20000], "\n 英文编码后的结果: ", eng_digit[20000], sep='\n')

原始中文: 
['B', '如果', '我', '是', '你', '，', '我', '不会', '想', '去', '知道', 'Tom', '现在', '正在', '做', '什么', '。']

 中文编码后的结果: 
[1, 917, 12, 35, 5, 79, 12, 16, 257, 36, 73, 202, 429, 496, 129, 299, 3]
原始英文: 
['If', 'I', 'were', 'you,', "I'd", 'want', 'to', 'know', 'what', 'Tom', 'is', 'doing', 'right', 'now']

 英文编码后的结果: 
[1057, 5, 734, 1013, 673, 302, 484, 66, 993, 29, 199, 1017, 142, 112]


In [5]:
### Build the training inputs and targets
def _pad_to(seq, width):
    """Return ``seq`` cut or zero-padded on the right to exactly ``width`` items."""
    width = max(width, 0)
    clipped = list(seq[:width])
    return clipped + [0] * (width - len(clipped))


def generate_XY(chs_digit, eng_digit, max_len):
    """Turn encoded sentence pairs into fixed-width inputs and scalar targets.

    For the i-th pair the target is the last Chinese index. The input row is
    the remaining Chinese indices in ``max_len - 1`` slots, followed by the
    English indices in ``max_len`` slots, so each row holds
    ``2 * max_len - 1`` values. Short sequences are right-padded with 0.
    Longer sequences are truncated so that every row has the same width.

    Args:
        chs_digit: List of encoded Chinese sentences (lists of ints).
        eng_digit: List of encoded English sentences, aligned with
            ``chs_digit``.
        max_len: Maximum number of tokens kept per sentence.

    Returns:
        ``(X, Y)`` where ``X`` is the list of input rows and ``Y`` the list
        of target indices.

    Raises:
        IndexError: If a Chinese sentence is empty or ``eng_digit`` is
            shorter than ``chs_digit``.
    """
    X = []
    Y = []
    for i, c_digit in enumerate(chs_digit):
        # The final Chinese token is what the model has to predict.
        Y.append(c_digit[-1])
        # Everything before the final token is the decoder input.
        x1 = _pad_to(c_digit[:-1], max_len - 1)
        x2 = _pad_to(eng_digit[i], max_len)
        X.append(x1 + x2)
    return X, Y

# Build the model inputs/targets and inspect one example: the Chinese part
# occupies the first 39 slots of a row, the English part the last 40.
X, Y = generate_XY(chs_digit, eng_digit, 40)
print("原始中文: ", Chinese[20000], sep='\n')
print("变量X_chs: ", X[20000][:39], sep='\n')
print("变量X_eng: ", X[20000][-40:], sep='\n')
print("变量Y: ", Y[20000], sep='\n')

原始中文: 
['B', '如果', '我', '是', '你', '，', '我', '不会', '想', '去', '知道', 'Tom', '现在', '正在', '做', '什么', '。']
变量X_chs: 
[1, 917, 12, 35, 5, 79, 12, 16, 257, 36, 73, 202, 429, 496, 129, 299, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
变量X_eng: 
[1057, 5, 734, 1013, 673, 302, 484, 66, 993, 29, 199, 1017, 142, 112, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
变量Y: 
3


In [6]:
# Split the data into a training and a validation set
# Shuffle with a fixed seed so that the same split is produced on every fresh run.
# NOTE: X and Y are overwritten with their shuffled versions, so re-running this
# cell without re-running the previous one shuffles the data a second time.
SPLIT_SEED = 42
assert len(X) == len(Y), "inputs and targets must be aligned"
idx = np.random.default_rng(SPLIT_SEED).permutation(len(X))
X = [X[i] for i in idx]
Y = [Y[i] for i in idx]

# Hold out the first fifth of the shuffled data for validation
n_valid = len(X) // 5
validX, trainX = X[:n_valid], X[n_valid:]
validY, trainY = Y[:n_valid], Y[n_valid:]

In [7]:
# Mini-batch size shared by both loaders
batch_size = 64

# Training split: int32 tensors wrapped in a dataset and served in shuffled mini-batches
train_ds = DataSet.TensorDataset(
    torch.from_numpy(np.asarray(trainX, dtype=int)).int(),
    torch.from_numpy(np.asarray(trainY, dtype=int)).int(),
)
train_loader = DataSet.DataLoader(train_ds, batch_size, shuffle=True, num_workers=1)

# Validation split: built the same way as the training split
valid_ds = DataSet.TensorDataset(
    torch.from_numpy(np.asarray(validX, dtype=int)).int(),
    torch.from_numpy(np.asarray(validY, dtype=int)).int(),
)
valid_loader = DataSet.DataLoader(valid_ds, batch_size, shuffle=True, num_workers=1)

In [8]:
### Encoder-decoder translation network
class Translator(nn.Module):
    """LSTM encoder-decoder that scores the next Chinese token.

    The encoder embeds English token indices and feeds them through an LSTM.
    Its final ``(h, c)`` state initialises the decoder LSTM, which reads
    embedded Chinese token indices. A linear layer maps the decoder output of
    the last time step to one score per Chinese vocabulary entry.

    Args:
        eng_vocab_size: Number of rows of the encoder embedding table.
        chs_vocab_size: Number of rows of the decoder embedding table and
            number of output scores.
        embedding_size: Width of both embedding tables.
        hidden_size: Hidden width of both LSTMs.
        num_layers: Number of stacked layers in each LSTM.
    """

    def __init__(self, eng_vocab_size, chs_vocab_size, embedding_size, hidden_size, num_layers=1):
        super(Translator, self).__init__()
        # batch_first=True makes the first tensor dimension the batch index
        self.encoder_embedding = nn.Embedding(eng_vocab_size, embedding_size)
        self.encoder_lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.decoder_embedding = nn.Embedding(chs_vocab_size, embedding_size)
        self.decoder_lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, chs_vocab_size)

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.eng_vocab_size = eng_vocab_size
        self.chs_vocab_size = chs_vocab_size

    def forward(self, chs, encoder_state):
        """Score the Chinese vocabulary given a decoder input and encoder state.

        Args:
            chs: Integer tensor of Chinese token indices, (batch, num_step).
            encoder_state: ``(h, c)`` tuple as returned by ``initHidden``.

        Returns:
            Tensor of shape (batch, chs_vocab_size) with unnormalised scores.
        """
        # Token indices -> dense vectors: (batch, num_step, embedding_size)
        x = self.decoder_embedding(chs)
        # Run the decoder, starting from the encoder's final state
        output, _ = self.decoder_lstm(x, encoder_state)
        # Keep only the last time step: (batch, hidden_size)
        output = output[:, -1, ...]
        # Project to vocabulary scores: (batch, chs_vocab_size)
        return self.fc(output)

    def initHidden(self, eng, batch_size):
        """Encode the English input and return the decoder's initial state.

        All tensors are created on the device of the embedded input, so the
        method works wherever the module and ``eng`` live (CPU or any GPU).

        Args:
            eng: Integer tensor of English token indices, (batch, num_step).
            batch_size: Number of rows in ``eng``.

        Returns:
            ``(h, c)`` tuple, each of shape (num_layers, batch_size, hidden_size).
        """
        x = self.encoder_embedding(eng)
        # Zero initial hidden and cell state, matching x in dtype and device
        h1 = x.new_zeros(self.num_layers, batch_size, self.hidden_size)
        c1 = x.new_zeros(self.num_layers, batch_size, self.hidden_size)
        # The encoder's final state conditions the decoder LSTM
        _, encoder_state = self.encoder_lstm(x, (h1, c1))
        return encoder_state

In [9]:
# Hyper-parameters
lr = 1e-2
epochs = 200
# Build the translation model and move it to the GPU in one step
translator = Translator(
    eng_vocab_size=eng_vocab_size,
    chs_vocab_size=chs_vocab_size,
    embedding_size=64,
    hidden_size=128,
).cuda()
# Cross-entropy loss over the Chinese vocabulary scores
criterion = nn.CrossEntropyLoss()
# Plain SGD over all model parameters
optimizer = torch.optim.SGD(params=translator.parameters(), lr=lr)
# Show the layer structure
print(translator)

Translator(
  (encoder_embedding): Embedding(7814, 64)
  (encoder_lstm): LSTM(64, 128, batch_first=True)
  (decoder_embedding): Embedding(13683, 64)
  (decoder_lstm): LSTM(64, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=13683, bias=True)
)


In [10]:
### Classification accuracy of a batch
def accuracy(pre, label):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        pre: Score tensor with one row per sample and one column per class.
        label: Tensor with the true class index of each sample.

    Returns:
        0-dim float tensor holding the share of correct predictions.
    """
    # Column index of the largest score in each row
    predicted = torch.max(pre.detach(), dim=1).indices
    # Number of rows where the prediction matches the label
    n_correct = (predicted == label.detach()).sum()
    return (n_correct / len(label)).float()

In [11]:
### Split a combined input batch into its Chinese and English parts
def split_chs_eng(x, max_len=40):
    """Separate the Chinese and English columns of a combined batch.

    The last ``max_len`` columns of a row are the English indices and all
    columns before them are the Chinese indices. For the 79-column rows built
    in this notebook that is 39 Chinese and 40 English columns. Slicing from
    the right keeps the two parts from overlapping; a fixed ``0:40`` slice
    would pull the first English token into the Chinese part.

    Args:
        x: 2-D integer tensor (or array-like) of shape (batch, width).
        max_len: Number of trailing columns that belong to the English part.

    Returns:
        ``(chs, eng)`` int32 tensors of shape (batch, width - max_len) and
        (batch, max_len). They are moved to the GPU when CUDA is available.

    Raises:
        ValueError: If ``x`` is not 2-D or ``max_len`` is not positive.
    """
    x = torch.as_tensor(x)
    if x.dim() != 2:
        raise ValueError(f"expected a 2-D batch, got {x.dim()} dimension(s)")
    if max_len < 1:
        raise ValueError(f"max_len must be positive, got {max_len}")

    # Slice the tensor directly instead of round-tripping through Python lists
    chs = x[:, :-max_len].int()
    eng = x[:, -max_len:].int()
    if torch.cuda.is_available():
        chs, eng = chs.cuda(), eng.cuda()
    return chs, eng

In [12]:
### Print one line of training progress
def print_log(epoch, train_time, train_loss, train_acc, val_loss, val_acc, epochs=10):
    """Print the metrics of one finished epoch on a single line.

    Args:
        epoch: 1-based number of the finished epoch.
        train_time: Training time of the epoch in seconds.
        train_loss: Mean training loss.
        train_acc: Mean training accuracy.
        val_loss: Mean validation loss.
        val_acc: Mean validation accuracy.
        epochs: Total number of epochs, shown as the denominator.
    """
    fields = (
        f"Epoch [{epoch}/{epochs}]",
        f"time: {train_time:.2f}s",
        f"loss: {train_loss:.4f}",
        f"acc: {train_acc:.4f}",
        f"val_loss: {val_loss:.4f}",
        f"val_acc: {val_acc:.4f}",
    )
    print(", ".join(fields))

In [13]:
### Validation pass
def validate(model, val_loader):
    """Run the model once over the validation data and average the metrics.

    Relies on the notebook-level ``criterion``, ``split_chs_eng`` and
    ``accuracy``. The model is switched to eval mode and left in it.

    Args:
        model: Network exposing ``initHidden(eng, batch_size)`` and
            ``__call__(chs, encoder_state)``.
        val_loader: Iterable of ``(inputs, targets)`` batches with a length.

    Returns:
        ``(val_loss, val_acc)``: the per-batch loss and accuracy, each
        averaged over the number of batches.

    Raises:
        ZeroDivisionError: If ``val_loader`` yields no batches.
    """
    val_loss = 0
    val_acc = 0
    model.eval()
    # Nothing is optimised here, so skip building autograd graphs.
    with torch.no_grad():
        for data in val_loader:
            x, y = data[0].cuda(), data[1].cuda()
            chs, eng = split_chs_eng(x)
            encoder_state = model.initHidden(eng, len(data[0]))
            outputs = model(chs, encoder_state)
            # CrossEntropyLoss needs int64 class indices
            y = y.long()
            loss = criterion(outputs, y)
            val_loss += loss.data.cpu().numpy()
            val_acc += accuracy(outputs, y)
    # Mean loss over batches
    val_loss /= len(val_loader)
    # Mean accuracy over batches
    val_acc /= len(val_loader)
    return val_loss, val_acc

In [14]:
### Training loop
def train(model, optimizer, train_loader, val_loader, epochs=50):
    """Train ``model`` and validate it after every epoch.

    Relies on the notebook-level ``criterion``, ``split_chs_eng``,
    ``accuracy``, ``validate`` and ``print_log``. One progress line is
    printed per epoch.

    Args:
        model: Network exposing ``initHidden(eng, batch_size)`` and
            ``__call__(chs, encoder_state)``.
        optimizer: Optimiser bound to the parameters of ``model``.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of validation batches, passed to ``validate``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, train_accs, val_losses, val_accs)``: four lists with
        one entry per epoch.

    Raises:
        ZeroDivisionError: If ``train_loader`` yields no batches.
    """
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    for epoch in range(epochs):
        train_loss = 0
        train_acc = 0
        # Start of the epoch, for timing
        start = time.time()
        # validate() leaves the model in eval mode, so switch back once per epoch
        model.train()
        for data in train_loader:
            # data is an (inputs, targets) pair holding one mini-batch
            x, y = data[0].cuda(), data[1].cuda()
            chs, eng = split_chs_eng(x)
            encoder_state = model.initHidden(eng, len(data[0]))
            optimizer.zero_grad()
            outputs = model(chs, encoder_state)
            # CrossEntropyLoss needs int64 class indices
            y = y.long()
            loss = criterion(outputs, y)
            train_loss += loss.data.cpu().numpy()
            train_acc += accuracy(outputs, y)
            loss.backward()
            optimizer.step()

        # Training time of this epoch; validation is not included
        train_time = time.time() - start
        # Mean loss and accuracy over batches
        train_loss /= len(train_loader)
        train_acc /= len(train_loader)
        # Loss and accuracy on the validation set
        val_loss, val_acc = validate(model, val_loader)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        print_log(epoch + 1, train_time, train_loss, train_acc, val_loss, val_acc, epochs=epochs)

    return train_losses, train_accs, val_losses, val_accs

In [15]:
# Train the model; history holds the per-epoch loss and accuracy lists
history = train(
    model=translator,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=valid_loader,
    epochs=epochs,
)

Epoch [1/200], time: 2.63s, loss: 2.9832, acc: 0.8131, val_loss: 0.8309, val_acc: 0.8563
Epoch [2/200], time: 2.65s, loss: 0.7939, acc: 0.8412, val_loss: 0.6455, val_acc: 0.8560
Epoch [3/200], time: 2.63s, loss: 0.6858, acc: 0.8412, val_loss: 0.5743, val_acc: 0.8560
Epoch [4/200], time: 2.63s, loss: 0.6040, acc: 0.8424, val_loss: 0.5106, val_acc: 0.8564
Epoch [5/200], time: 2.73s, loss: 0.5474, acc: 0.8424, val_loss: 0.4717, val_acc: 0.8561
Epoch [6/200], time: 2.78s, loss: 0.5105, acc: 0.8520, val_loss: 0.4438, val_acc: 0.8743
Epoch [7/200], time: 2.69s, loss: 0.4852, acc: 0.8722, val_loss: 0.4209, val_acc: 0.8912
Epoch [8/200], time: 2.66s, loss: 0.4573, acc: 0.8840, val_loss: 0.3996, val_acc: 0.9003
Epoch [9/200], time: 2.74s, loss: 0.4372, acc: 0.8961, val_loss: 0.3853, val_acc: 0.9117
Epoch [10/200], time: 2.62s, loss: 0.4210, acc: 0.8994, val_loss: 0.3711, val_acc: 0.9153
Epoch [11/200], time: 2.64s, loss: 0.4069, acc: 0.9026, val_loss: 0.3608, val_acc: 0.9170
Epoch [12/200], tim

In [None]:
### Translation demo
max_len = 40
test = 'what is it'
test = test.split(' ')

# Map the words to indices, failing with a readable message on unknown words
unknown = [t for t in test if t not in eng2idx]
if unknown:
    raise KeyError(f"words missing from the English vocabulary: {unknown}")
eng = [eng2idx[t] for t in test]

# Create the inputs on whatever device the model lives on
device = next(translator.parameters()).device
eng = eng + [0]*(max_len - len(eng))
eng = torch.tensor([eng], dtype=torch.int32, device=device)

# Decoder input buffer, initially filled with the start marker.
# NOTE(review): generate_XY pads the decoder input with 0 during training, while
# this buffer is filled with 'B' -- confirm which padding is intended here.
predict = np.full(max_len - 1, chs2idx['B'], dtype=int)
chs = ''

# Reverse vocabulary, built once instead of scanning the dict for every token
idx2chs = {v: k for k, v in chs2idx.items()}

translator.eval()
with torch.no_grad():
    # The source sentence does not change, so encode it a single time
    encoder_state = translator.initHidden(eng, 1)
    for i in range(max_len - 2):
        pre = torch.tensor(np.array([predict], dtype=int), dtype=torch.int32, device=device)
        output = translator(pre, encoder_state)
        # Index of the highest-scoring token, as a plain Python int so that it
        # can be stored in the NumPy buffer and used as a dict key
        index = int(torch.argmax(output).item())
        predict[i+1] = index

        # Index 0 is the padding value and ends the translation
        if index == 0:
            break

        # Append the token that belongs to this index
        current_word = idx2chs[index]
        chs = chs + current_word

print(chs)