In [43]:
import json
import jieba
import pandas
import numpy as np
import torch
import tqdm

In [2]:
# 加载数据集
with open("./dataset/HPD/cn_train_set.json", 'r') as file:
    train_file = json.load(file)
with open("./dataset/HPD/cn_test_set.json", 'r') as file:
    test_file = json.load(file)

In [3]:
# 加载停用词表
with open('dataset/停用词表.txt', encoding='utf-8') as f:
    con = f.readlines()
    stop_words = set()
    for i in con:
        i = i.replace("\n", "")
        stop_words.add(i)

In [4]:
# 对对话词元化
i = 0
train_words = {}
for segment in train_file:
    for line in train_file[segment]["对话历史"]:
        tokens = jieba.cut(line, cut_all=False)
        for token in tokens:
            # 去停用词
            if token not in stop_words and len(token) > 1:
                if token in train_words:
                    train_words[token][0] += 1
                else:
                    train_words[token] = [1, i]
        i += 1
i = 0
test_words = {}
for segment in test_file:
    for line in test_file[segment]["对话历史"]:
        tokens = jieba.cut(line, cut_all=False)
        for token in tokens:
            # 去停用词
            if token not in stop_words and len(token) > 1:
                if token in test_words:
                    test_words[token][0] += 1
                else:
                    test_words[token] = [1, i]
        i += 1

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 2.043 seconds.
Prefix dict has been built successfully.


In [5]:
train_words.values()

dict_values([[103, 0], [13, 0], [1, 0], [1, 0], [5, 0], [1, 0], [4967, 1], [7, 1], [3, 1], [5, 1], [1, 1], [57, 4], [1, 4], [1, 4], [8, 4], [6, 4], [5, 4], [1, 4], [5, 4], [2, 4], [9, 4], [56, 5], [13, 5], [32, 5], [1, 5], [1, 5], [25, 5], [2, 9], [1, 9], [24, 10], [2, 10], [1, 10], [4, 11], [27, 11], [401, 11], [3, 12], [14, 13], [38, 13], [12, 14], [109, 15], [6, 15], [11, 17], [2, 20], [4, 20], [5, 20], [2, 20], [2, 21], [7, 21], [2, 22], [70, 22], [5, 22], [20, 22], [8, 22], [171, 22], [64, 22], [5, 22], [18, 22], [7, 22], [155, 24], [502, 24], [14, 24], [20, 24], [8, 24], [389, 24], [158, 24], [1, 24], [2, 24], [40, 24], [74, 26], [69, 26], [2, 27], [1, 29], [5, 29], [1, 29], [10, 29], [2, 29], [1, 31], [3, 31], [19, 31], [9, 31], [19, 31], [18, 31], [7, 34], [1, 34], [17, 34], [3, 34], [87, 35], [52, 35], [33, 35], [47, 36], [1, 37], [25, 37], [90, 37], [1, 37], [28, 37], [215, 37], [5, 37], [31, 37], [2, 37], [7, 37], [20, 37], [2, 37], [28, 39], [66, 39], [7, 41], [2, 43], [9, 

In [6]:
train_df = pandas.DataFrame(data = train_words.values(), columns=['Num', 'Line'])
train_df["Word"] = train_words.keys()
test_df = pandas.DataFrame(data = test_words.values(), columns=['Num', 'Line'])
test_df["Word"] = test_words.keys()

In [7]:
train_df.count()

Num     11471
Line    11471
Word    11471
dtype: int64

In [8]:
test_df.count()

Num     2996
Line    2996
Word    2996
dtype: int64

In [14]:
# 构建词表
class Vocab:
    def __init__(self, tokens=None, min_freq = 0) -> None:
        if tokens is None:
            return
        # 计算频率
        freq = (tokens["Num"] / tokens["Num"].count()) * 100
        tokens["Freq"] = freq
        print(tokens["Freq"].describe())
        
        if not min_freq:
            min_freq = tokens["Freq"].min()

        # 删除频率太低的词
        tokens.drop(tokens[(tokens.Freq <= min_freq)].index, inplace=True)
        self.tokens = tokens
        self.tokens_sort = self.tokens.sort_values(by='Freq', ascending=False)

    def __len__(self):
        return self.tokens.count()
    
    def __getitem__(self, start, end = -1, flag = 1):
        if (start < 0 or start > len(self.tokens)) and \
        (end < 0 or end > len(self.tokens)):
            return
        if flag == 1:
            if end == -1:
                return self.tokens.iloc[start:]
            elif not end:
                return self.tokens.iloc[start]
            return self.tokens.iloc[start: end]
        else:
            if end == -1:
                return self.tokens_sort.iloc[start:]
            elif not end:
                return self.tokens_sort.iloc[start]
            return self.tokens_sort.iloc[start: end]
    
    @property
    def index(self):
        return self.tokens.index.tolist()
    
    @property
    def unk(self):
        return 0
    
    @property
    def token_freqs(self):
        return self.tokens["Freq"]

train_table = Vocab(train_df)
test_table= Vocab(test_df)

count    5257.000000
mean        0.210877
std         1.490448
min         0.038045
25%         0.038045
50%         0.057067
75%         0.133156
max        94.483546
Name: Freq, dtype: float64
count    999.000000
mean       0.599198
std        2.179485
min        0.200200
25%        0.200200
50%        0.300300
75%        0.500501
max       60.160160
Name: Freq, dtype: float64


In [10]:
# 输出频率前100的词
print(train_table.__getitem__(0, 100, 0))

       Num  Line Word       Freq
6     4967     1   哈利  43.300497
631   1567   207   赫敏  13.660535
524   1331   169  罗恩说  11.603173
137    773    49   布利   6.738732
59     502    24   海格   4.376253
...    ...   ...  ...        ...
1579    76   581   杀死   0.662540
497     75   147   珀西   0.653823
68      74    26   一只   0.645105
1257    73   457   晚上   0.636387
1571    72   579  办公室   0.627670

[100 rows x 4 columns]


In [11]:
# 输出频率前100的词
print(test_table.__getitem__(0, 100, 0))

      Num  Line Word       Freq
22    601     5   哈利  20.060080
474   207   107  罗恩说   6.909212
419   180    98   赫敏   6.008011
144    87    58   布利   2.903872
90     73    41   海格   2.436582
...   ...   ...  ...        ...
161    10    63   这件   0.333778
12     10     2   讨厌   0.333778
1343   10   407  斯莱特   0.333778
548    10   133   游走   0.333778
1542   10   498   金妮   0.333778

[100 rows x 4 columns]


In [35]:
# 划分数据集
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """使用随机抽样生成一个小批量子序列"""
    # 从随机偏移量开始对序列进行分区，随机范围包括num_steps-1
    corpus = corpus[np.random.randint(0, num_steps - 1):]
    # 减去1，是因为我们需要考虑标签
    num_subseqs = (len(corpus) - 1) // num_steps
    # 长度为num_steps的子序列的起始索引
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # 在随机抽样的迭代过程中，
    # 来自两个相邻的、随机的、小批量中的子序列不一定在原始序列上相邻
    np.random.shuffle(initial_indices)

    def data(pos):
        # 返回从pos位置开始的长度为num_steps的序列
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # 在这里，initial_indices包含子序列的随机起始索引
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield np.array(X), np.array(Y)
    return X, Y

X:  [[3720 3743 3745]
 [1152 1155 1157]] 
Y: [[3743 3745 3749]
 [1155 1157 1158]]
X:  [[1215 1216 1217]
 [7939 7940 7951]] 
Y: [[1216 1217 1219]
 [7940 7951 7952]]
X:  [[ 936  937  938]
 [9368 9393 9405]] 
Y: [[ 937  938  940]
 [9393 9405 9417]]
X:  [[5050 5051 5052]
 [ 238  239  241]] 
Y: [[5051 5052 5053]
 [ 239  241  243]]
X:  [[ 988  989  991]
 [2657 2659 2662]] 
Y: [[ 989  991  992]
 [2659 2662 2663]]
X:  [[1488 1489 1490]
 [2161 2163 2164]] 
Y: [[1489 1490 1491]
 [2163 2164 2165]]
X:  [[4062 4063 4066]
 [2888 2889 2891]] 
Y: [[4063 4066 4067]
 [2889 2891 2897]]
X:  [[  55   56   57]
 [5640 5641 5644]] 
Y: [[  56   57   58]
 [5641 5644 5648]]
X:  [[2194 2195 2197]
 [5925 5934 5935]] 
Y: [[2195 2197 2198]
 [5934 5935 5939]]
X:  [[ 319  320  322]
 [4721 4725 4737]] 
Y: [[ 320  322  324]
 [4725 4737 4738]]
X:  [[ 215  216  217]
 [4836 4837 4840]] 
Y: [[ 216  217  221]
 [4837 4840 4842]]
X:  [[5281 5291 5297]
 [5511 5515 5516]] 
Y: [[5291 5297 5298]
 [5515 5516 5519]]
X:  [[8035 8038 

In [56]:
# 独热编码
train_x = torch.tensor(requires_grad=True)
train_y = torch.tensor( requires_grad=True)

def build_set(train_x, train_y, len):
    for X, Y in seq_data_iter_random(train_table.index, batch_size=2, num_steps=3):
        x, y = torch.zeros(len), torch.zeros(len)
        x[X] = 1
        y[Y] = 1
        train_x.add(x)
        train_y.add(y)
        yield train_x, train_y
        
build_set(train_x, train_y, train_table.__len__())

RuntimeError: Only Tensors of floating point and complex dtype can require gradients

In [36]:
# 使用 GPU
def try_gpu(i=0):  #@save
    """如果存在，则返回gpu(i)，否则返回cpu()"""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')
def try_all_gpus():  #@save
    """返回所有可用的GPU，如果没有GPU，则返回[cpu(),]"""
    devices = [torch.device(f'cuda:{i}')
             for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

In [39]:
# 无隐状态的神经网络
class no_hiden_net(torch.nn.Module):
    def __init__(self) :
        super(no_hiden_net, self).__init__()
        self.hiden = torch.nn.Linear(3, 8)
        self.output = torch.nn.Linear(8, 1)
        
    def forward(self, x) :
        x = self.hiden(x)
        x = self.output(x)
        return x

In [54]:
# 训练
model = no_hiden_net()
model = model.to(device=try_gpu())

# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

model.train()
epochs= 5

Loss_data = {
    "train": [],
    "dev": []
}

for epoch in tqdm.tqdm(range(epochs)):
    Loss = 0
    for batch_x, batch_y in seq_data_iter_random(train_table.index, batch_size=2, num_steps=3):
        x = torch.tensor(batch_x.astype("double"), device=try_gpu(), requires_grad=True, dtype=torch.float)
        y = torch.tensor(batch_y.astype("double"), device=try_gpu(), requires_grad=True, dtype=torch.float)
        prediction = model(x)
        loss = criterion(prediction, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        Loss_data["train"].append(float(loss))
        Loss_data["dev"].append(0)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:01<?, ?it/s]


RuntimeError: 0D or 1D target tensor expected, multi-target not supported