In [None]:
import json
import jieba
import pandas
import numpy as np
import torch
import tqdm
import matplotlib.pyplot as plt
import random

# Seed every random number source so that repeated runs behave the same.
myseed = 12345
# NOTE(review): `random` is seeded with 0 while everything else uses myseed —
# confirm that this difference is intentional.
random.seed(0)
np.random.seed(myseed)
torch.manual_seed(myseed)
torch.random.manual_seed(myseed)
torch.cuda.manual_seed_all(myseed)

# cuDNN settings: disable the benchmark mode and request deterministic behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Anomaly detection raises an error when NaN appears, which helps to locate the
# offending code. This call switches it on globally (forward pass included).
torch.autograd.set_detect_anomaly(True)
# To enable detection only while differentiating, wrap the backward pass instead:
# with torch.autograd.detect_anomaly():
#     loss.backward()

torch.multiprocessing.set_sharing_strategy('file_system')

In [None]:
# Load the dataset.
# The encoding is given explicitly so that the files are decoded the same way
# on every platform instead of depending on the locale's default encoding
# (the stop-word list in this notebook is already read as UTF-8).
with open("./dataset/HPD/cn_train_set.json", 'r', encoding='utf-8') as file:
    train_file = json.load(file)
with open("./dataset/HPD/cn_test_set.json", 'r', encoding='utf-8') as file:
    test_file = json.load(file)

In [None]:
# Load the stop-word list: one entry per line, with the newline character removed.
with open('dataset/停用词表.txt', encoding='utf-8') as f:
    stop_words = {raw_line.replace("\n", "") for raw_line in f}

In [None]:
# Tokenise the dialogues.
# Every entry of "对话历史" is split once on the first full-width colon and only
# the text after it is tokenised (presumably the part before the colon is the
# speaker name — verify against the dataset). Splitting once keeps colons that
# occur inside the utterance, and a line without any colon is kept whole
# instead of raising IndexError.
train_words = []
for segment in train_file:
    for line in train_file[segment]["对话历史"]:
        line = line.split("：", 1)[-1]
        tokens = jieba.cut(line, cut_all=False)
        # Remove stop words
        line_words = [token for token in tokens if token not in stop_words]
        train_words.append(line_words)

# Same processing for the test set.
test_words = []
for segment in test_file:
    for line in test_file[segment]["对话历史"]:
        line = line.split("：", 1)[-1]
        tokens = jieba.cut(line, cut_all=False)
        # Remove stop words
        line_words = [token for token in tokens if token not in stop_words]
        test_words.append(line_words)

In [None]:
# Build the vocabulary
class Vocab:
    """Frequency-filtered vocabulary built from a tokenised corpus.

    Args:
        words: list of token lists (one list per line). ``None`` is treated as
            an empty corpus. The input is not modified.
        min_freq: tokens occurring ``min_freq`` times or fewer are replaced by
            ``"<unk>"``.

    Attributes:
        words: copy of the corpus with rare tokens replaced by ``"<unk>"``.
        words_num: list of ``(word, count)`` pairs sorted by descending count.
            ``"<unk>"`` is always present and stored with count 0.
        idx_word: despite its name, maps word -> index, where the index is the
            word's position in ``words_num``.
        len: number of vocabulary entries, ``"<unk>"`` included.
    """

    def __init__(self, words, min_freq = 0) -> None:
        # A missing corpus behaves like an empty one, so that every attribute
        # is still defined and the accessors below keep working.
        if words is None:
            words = []

        # Count how often each word occurs (dict keeps first-seen order).
        counts = {}
        for line in words:
            for word in line:
                counts[word] = counts.get(word, 0) + 1

        # Drop words that occur min_freq times or fewer.
        kept = {word: count for word, count in counts.items() if count > min_freq}

        # Build a new corpus with rare words replaced, leaving the caller's
        # lists untouched.
        self.words = [
            [word if word in kept else "<unk>" for word in line]
            for line in words
        ]
        kept["<unk>"] = 0
        self.len = len(kept)

        # Sort by descending count (stable, so ties keep first-seen order).
        self.words_num = sorted(kept.items(), key=lambda d: d[1], reverse=True)

        # word -> index, following the sorted order
        self.idx_word = {word: i for i, (word, _) in enumerate(self.words_num)}

    def __word__(self, index):
        """Return the ``(word, count)`` pair stored at position ``index``."""
        return self.words_num[index]

    def __words__(self):
        """Return the corpus with rare words replaced by ``"<unk>"``."""
        return self.words

    def __words_num__(self):
        """Return the list of ``(word, count)`` pairs, most frequent first."""
        return self.words_num

    def __index__(self, str):
        """Return the index of word ``str``; unknown words map to ``"<unk>"``."""
        return self.idx_word.get(str, self.idx_word["<unk>"])

    def __len__(self):
        """Return the vocabulary size, ``"<unk>"`` included."""
        return self.len

# Build one vocabulary per split; words seen 5 times or fewer become "<unk>".
# NOTE(review): the test split gets its own vocabulary, so the same word may map
# to different indices in train and test — confirm that this is intended.
train_table = Vocab(train_words, min_freq=5)
test_table = Vocab(test_words, min_freq=5)

In [None]:
# Split the dataset
def _sample_window(vtab, line, start_index, num_steps, pad_index=0):
    """Encode up to `num_steps` tokens of `line` from `start_index`, right-padded to `num_steps`."""
    window = [vtab.__index__(token)
              for token in line[start_index:start_index + num_steps]]
    window.extend([pad_index] * (num_steps - len(window)))
    return window


def seq_data_iter_random(vtab, corpus, batch_size, num_steps, pad_index=0):
    """Sample fixed-length index windows from every line of `corpus`.

    For each line, `batch_size` windows of `num_steps` token indices are drawn,
    each starting at a random offset in ``[0, num_steps)``. Windows that run
    past the end of the line are padded with `pad_index`.

    Lines at odd positions of `corpus` go to the first returned list (x),
    lines at even positions to the second (y).

    Args:
        vtab: object whose ``__index__(token)`` returns the token's index.
        corpus: list of token lists.
        batch_size: number of windows sampled per line.
        num_steps: length of every window.
        pad_index: value used for padding. The default 0 keeps the previous
            behaviour; note that the Vocab class in this notebook also assigns
            index 0 to its first (most frequent) entry.

    Returns:
        ``(batch_x, batch_y)``; every element is a list of `batch_size`
        windows. With an odd number of lines `batch_y` has one more element
        than `batch_x`.
    """
    batch_x, batch_y = [], []
    for i, line in enumerate(corpus):
        samples = []
        for _ in range(batch_size):
            # Exactly one random draw per sampled window.
            start_index = np.random.randint(num_steps)
            samples.append(
                _sample_window(vtab, line, start_index, num_steps, pad_index))
        if i % 2:
            batch_x.append(samples)
        else:
            batch_y.append(samples)
    return batch_x, batch_y

# Sample 2 windows of 5 tokens from every line of each split.
train_batch_x, train_batch_y = seq_data_iter_random(
    train_table, train_table.__words__(), batch_size=2, num_steps=5)
test_batch_x, test_batch_y = seq_data_iter_random(
    test_table, test_table.__words__(), batch_size=2, num_steps=5)

In [None]:
# Use the GPU
def try_gpu(i=0):  #@save
    """Return the device for gpu(i) if it exists, otherwise the cpu device."""
    gpu_exists = torch.cuda.device_count() >= i + 1
    return torch.device(f'cuda:{i}') if gpu_exists else torch.device('cpu')

def try_all_gpus():  #@save
    """Return all available GPU devices, or [cpu device] when there is no GPU."""
    gpu_total = torch.cuda.device_count()
    if not gpu_total:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{idx}') for idx in range(gpu_total)]

In [None]:
# Convert the sampled batches to tensors.
# The inputs are data, not parameters, so they do not request gradients:
# that would only make autograd track and store gradients for the data itself.
device = try_gpu()  # resolve the target device once
train_x = torch.tensor(data=train_batch_x, device=device, dtype=torch.float)
train_y = torch.tensor(data=train_batch_y, device=device, dtype=torch.float)
test_x = torch.tensor(data=test_batch_x, device=device, dtype=torch.float)
test_y = torch.tensor(data=test_batch_y, device=device, dtype=torch.float)

In [None]:
class RNNModel(torch.nn.Module):
    """Recurrent neural network model: a recurrent layer followed by a linear
    output layer over the vocabulary.

    Defined in :numref:`sec_rnn-concise`

    Args:
        rnn_layer: recurrent layer; its ``hidden_size``, ``num_layers`` and
            ``bidirectional`` attributes are read.
        vocab_size: number of classes for the one-hot input and the output.
    """
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        # A bidirectional RNN has two directions and concatenates both hidden
        # states, so the linear layer must accept twice as many features.
        self.num_directions = 2 if self.rnn.bidirectional else 1
        self.linear = torch.nn.Linear(
            self.num_hiddens * self.num_directions, self.vocab_size)

    def forward(self, inputs, state):
        """Run one forward pass.

        `inputs` is transposed, cast to long and one-hot encoded before being
        fed to the recurrent layer together with `state`.

        Returns:
            ``(output, state)`` where `output` has one row per element of the
            flattened leading dimensions of the recurrent output and
            `vocab_size` columns.
        """
        X = torch.nn.functional.one_hot(inputs.T.long(), self.vocab_size)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        # The fully connected layer first reshapes Y to
        # (num_steps * batch_size, hidden features);
        # its output has shape (num_steps * batch_size, vocab_size).
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state

    def begin_state(self, device, batch_size=1):
        """Return an all-zero initial hidden state on `device`.

        A single tensor is returned unless the recurrent layer is an
        ``nn.LSTM``, in which case a tuple of two tensors is returned.
        """
        shape = (self.num_directions * self.rnn.num_layers,
                 batch_size, self.num_hiddens)
        if not isinstance(self.rnn, torch.nn.LSTM):
            # nn.GRU takes a tensor as hidden state
            return torch.zeros(shape, device=device)
        # nn.LSTM takes a tuple as hidden state
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

In [None]:
vocab_size = train_table.__len__()
print(vocab_size)
# NOTE(review): `GRU` is not defined in the cells visible here — confirm where
# it comes from and what its four positional arguments mean.
model = GRU(vocab_size, 5, 256, 10)
model = model.to(device=try_gpu())

# Optimiser and loss function
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.MSELoss()

epochs = 1000

# Per-epoch loss history, consumed by the learning-curve plot.
Loss_data = {"train": [], "dev": []}

train_len, test_len = len(train_x), len(test_x)

# Training: one full-batch optimisation step per epoch, followed by an
# evaluation pass on the test tensors.
for i in tqdm.tqdm(range(epochs)):
    model.train()
    running_train_loss = 0.0
    y_hat = model(train_x)
    loss = criterion(y_hat, train_y)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    running_train_loss += loss.item()
    Loss_data["train"].append(loss.item())

    # Training loss
    epoch_train_loss = running_train_loss / train_len
    perplexity_train = np.exp(running_train_loss / train_len)

    model.eval()
    # Test loss
    running_test_loss = 0.0
    with torch.no_grad():
        outputs = model(test_x)
        outputs = outputs.to(torch.float)
        # 2 and 5 mirror the batch_size / num_steps used when sampling batches.
        pred = outputs.reshape(test_x.shape[0], 2, 5)
        loss = criterion(pred, test_y)
        running_test_loss += loss.item()
        # Record exactly one dev value per epoch, on the same scale as the
        # train curve, so both curves share the x axis.
        Loss_data["dev"].append(loss.item())
    epoch_test_loss = running_test_loss / test_len
    perplexity_test = np.exp(running_test_loss / test_len)

In [None]:
def plot_learning_curve(loss_record, title=''):
    """Plot the 'train' and 'dev' loss sequences of `loss_record` on one figure.

    Args:
        loss_record: mapping with the keys 'train' and 'dev', each holding a
            sequence of loss values.
        title: text appended to the figure title.
    """
    train_curve = loss_record['train']
    dev_curve = loss_record["dev"]
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(range(len(train_curve)), train_curve, c='tab:red', label='train')
    ax.plot(range(len(dev_curve)), dev_curve, c='tab:cyan', label='dev')
    # Pin the lower y limit to zero; the upper limit stays automatic.
    ax.set_ylim(0.0)
    ax.set_xlabel('Training steps')
    ax.set_ylabel('MSE loss')
    ax.set_title('Learning curve of {}'.format(title))
    ax.legend()
    plt.show()

# Draw the recorded train/dev loss history.
plot_learning_curve(Loss_data, title="Loss")

In [None]:
# Report the perplexity values computed in the last training epoch.
perplexity_report = (
    ("perplexity train: ", perplexity_train),
    ("perplexity test: ", perplexity_test),
)
for label, value in perplexity_report:
    print(label, value)