In [85]:
import json
import jieba
import pandas
import numpy as np

In [86]:
# Load the training and test splits.
# The encoding is passed explicitly so the Chinese text decodes the same way on
# every platform instead of depending on the locale default (JSON interchange
# text is UTF-8).
with open("./dataset/HPD/cn_train_set.json", 'r', encoding='utf-8') as file:
    train_file = json.load(file)
with open("./dataset/HPD/cn_test_set.json", 'r', encoding='utf-8') as file:
    test_file = json.load(file)

In [87]:
# Load the stop-word list: every line of the file, with its newline removed,
# becomes one entry of the set.
with open('dataset/停用词表.txt', encoding='utf-8') as stop_file:
    stop_words = {raw_line.replace("\n", "") for raw_line in stop_file}

In [88]:
# Tokenize the dialogues
def count_dialogue_tokens(dataset, stop_words):
    """Count the words that appear in the dialogue history of a data set.

    Every line found under the "对话历史" key of each entry is segmented with
    jieba (precise mode). Tokens that are stop words or at most one character
    long are skipped.

    Args:
        dataset: Mapping whose values each hold a "对话历史" iterable of lines.
        stop_words: Collection of tokens to ignore.

    Returns:
        dict mapping each kept token to ``[count, first_line]``, where
        ``count`` is the number of occurrences and ``first_line`` is the
        running index (over all lines of the data set) of the line in which
        the token first appeared.
    """
    word_stats = {}
    line_index = 0
    for segment in dataset:
        for line in dataset[segment]["对话历史"]:
            for token in jieba.cut(line, cut_all=False):
                # Drop stop words and single-character tokens.
                if token in stop_words or len(token) <= 1:
                    continue
                if token in word_stats:
                    word_stats[token][0] += 1
                else:
                    word_stats[token] = [1, line_index]
            line_index += 1
    return word_stats


train_words = count_dialogue_tokens(train_file, stop_words)
test_words = count_dialogue_tokens(test_file, stop_words)

In [89]:
# Inspect the [occurrence count, index of the line where the word first appeared]
# pairs collected for the training split.
train_words.values()

dict_values([[103, 0], [13, 0], [1, 0], [1, 0], [5, 0], [1, 0], [4967, 1], [7, 1], [3, 1], [5, 1], [1, 1], [57, 4], [1, 4], [1, 4], [8, 4], [6, 4], [5, 4], [1, 4], [5, 4], [2, 4], [9, 4], [56, 5], [13, 5], [32, 5], [1, 5], [1, 5], [25, 5], [2, 9], [1, 9], [24, 10], [2, 10], [1, 10], [4, 11], [27, 11], [401, 11], [3, 12], [14, 13], [38, 13], [12, 14], [109, 15], [6, 15], [11, 17], [2, 20], [4, 20], [5, 20], [2, 20], [2, 21], [7, 21], [2, 22], [70, 22], [5, 22], [20, 22], [8, 22], [171, 22], [64, 22], [5, 22], [18, 22], [7, 22], [155, 24], [502, 24], [14, 24], [20, 24], [8, 24], [389, 24], [158, 24], [1, 24], [2, 24], [40, 24], [74, 26], [69, 26], [2, 27], [1, 29], [5, 29], [1, 29], [10, 29], [2, 29], [1, 31], [3, 31], [19, 31], [9, 31], [19, 31], [18, 31], [7, 34], [1, 34], [17, 34], [3, 34], [87, 35], [52, 35], [33, 35], [47, 36], [1, 37], [25, 37], [90, 37], [1, 37], [28, 37], [215, 37], [5, 37], [31, 37], [2, 37], [7, 37], [20, 37], [2, 37], [28, 39], [66, 39], [7, 41], [2, 43], [9, 

In [90]:
# Turn each {word: [count, first_line]} mapping into a table with the columns
# Num (count), Line (first line index) and Word.
train_df = pandas.DataFrame(
    list(train_words.values()), columns=['Num', 'Line']
).assign(Word=list(train_words))
test_df = pandas.DataFrame(
    list(test_words.values()), columns=['Num', 'Line']
).assign(Word=list(test_words))

In [91]:
# Number of non-null entries in each column of the training word table.
train_df.count()

Num     11471
Line    11471
Word    11471
dtype: int64

In [92]:
# Number of non-null entries in each column of the test word table.
test_df.count()

Num     2996
Line    2996
Word    2996
dtype: int64

In [93]:
# Build the vocabulary
class Vocab:
    """Word table with a derived ``Freq`` column and low-frequency words removed.

    Attributes:
        tokens: DataFrame of the kept words, in their original row order.
        tokens_sort: The same rows sorted by ``Freq`` in descending order.
    """

    def __init__(self, tokens=None, min_freq = 0) -> None:
        """Compute ``Freq`` for every word and drop the low-frequency ones.

        Args:
            tokens: DataFrame with at least a ``Num`` column (occurrence
                count per word). It is copied, so the caller's frame is left
                untouched and re-running the constructor gives the same
                result. ``None`` builds an empty vocabulary.
            min_freq: Rows whose ``Freq`` is less than or equal to this value
                are dropped. A falsy value (the default) uses the smallest
                ``Freq`` in the table, i.e. the lowest tier is removed.
        """
        if tokens is None:
            # Keep the attributes defined so len() and the accessors still work.
            empty = pandas.DataFrame(columns=["Num", "Freq"])
            self.tokens = empty
            self.tokens_sort = empty.copy()
            return

        # Work on a private copy: adding a column / dropping rows on the
        # caller's frame made repeated construction non-idempotent.
        tokens = tokens.copy()

        # NOTE(review): the denominator is the number of rows (distinct words),
        # not the total number of occurrences, so Freq is not a normalized
        # share. Kept as is to preserve existing results; confirm intent.
        tokens["Freq"] = (tokens["Num"] / tokens["Num"].count()) * 100
        print(tokens["Freq"].describe())

        if not min_freq:
            min_freq = tokens["Freq"].min()

        # Remove the words whose frequency is too low.
        self.tokens = tokens.drop(tokens[tokens["Freq"] <= min_freq].index)
        self.tokens_sort = self.tokens.sort_values(by='Freq', ascending=False)

    def __len__(self):
        """Return the number of words kept in the vocabulary as an int."""
        # DataFrame.count() returns a per-column Series, which len() rejects.
        return len(self.tokens)

    def __getitem__(self, start, end = -1, flag = 1):
        """Return rows of the vocabulary selected by position.

        Args:
            start: Position of the first row.
            end: ``-1`` (default) selects every row from ``start`` onwards,
                ``0`` selects the single row at ``start``, any other value
                selects the half-open range ``start:end``.
            flag: ``1`` reads from ``tokens`` (original order); any other
                value reads from ``tokens_sort`` (descending ``Freq``).

        Returns:
            A DataFrame (range) or Series (single row), or ``None`` when both
            ``start`` and ``end`` fall outside ``[0, len(tokens)]``.
        """
        size = len(self.tokens)
        # NOTE(review): the check only rejects the request when BOTH bounds are
        # out of range (and the default end=-1 always counts as out of range).
        # Preserved as is; confirm whether a stricter check is wanted.
        if (start < 0 or start > size) and (end < 0 or end > size):
            return

        frame = self.tokens if flag == 1 else self.tokens_sort
        if end == -1:
            return frame.iloc[start:]
        if not end:
            return frame.iloc[start]
        return frame.iloc[start: end]

    @property
    def unk(self):
        """Index used for unknown tokens (always 0)."""
        return 0

    @property
    def token_freqs(self):
        """The ``Freq`` column of the kept words."""
        return self.tokens["Freq"]

# Build one vocabulary per split; each constructor call prints a describe()
# summary of the computed Freq column.
train_table = Vocab(train_df)
test_table= Vocab(test_df)

count    11471.000000
mean         0.049012
std          0.464451
min          0.008718
25%          0.008718
50%          0.008718
75%          0.026153
max         43.300497
Name: Freq, dtype: float64
count    2996.000000
mean        0.088870
std         0.426788
min         0.033378
25%         0.033378
50%         0.033378
75%         0.066756
max        20.060080
Name: Freq, dtype: float64


In [97]:
# Print the 100 highest-Freq words of the training vocabulary
# (flag=0 reads from the view sorted by Freq in descending order).
print(train_table.__getitem__(0, 100, 0))

       Num  Line Word       Freq
6     4967     1   哈利  43.300497
631   1567   207   赫敏  13.660535
524   1331   169  罗恩说  11.603173
137    773    49   布利   6.738732
59     502    24   海格   4.376253
...    ...   ...  ...        ...
1579    76   581   杀死   0.662540
497     75   147   珀西   0.653823
68      74    26   一只   0.645105
1257    73   457   晚上   0.636387
1571    72   579  办公室   0.627670

[100 rows x 4 columns]


In [96]:
# Print the 100 highest-Freq words of the test vocabulary
# (flag=0 reads from the view sorted by Freq in descending order).
print(test_table.__getitem__(0, 100, 0))

      Num  Line Word       Freq
22    601     5   哈利  20.060080
474   207   107  罗恩说   6.909212
419   180    98   赫敏   6.008011
144    87    58   布利   2.903872
90     73    41   海格   2.436582
...   ...   ...  ...        ...
161    10    63   这件   0.333778
12     10     2   讨厌   0.333778
1343   10   407  斯莱特   0.333778
548    10   133   游走   0.333778
1542   10   498   金妮   0.333778

[100 rows x 4 columns]


In [98]:
# Split the corpus into mini-batches
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    """Yield mini-batches of subsequences drawn by random sampling.

    Args:
        corpus: Sequence supporting ``len()`` and slice indexing.
        batch_size: Number of subsequences per mini-batch.
        num_steps: Length of every subsequence.

    Yields:
        Tuples ``(X, Y)`` of numpy arrays built from ``batch_size``
        subsequences each; ``Y`` holds the same windows as ``X`` shifted one
        position to the right (the labels).
    """
    # Start partitioning from a random offset in 0..num_steps-1.
    # np.random.randint excludes its upper bound, so the bound is num_steps;
    # passing num_steps - 1 skipped the last offset and failed for num_steps == 1.
    corpus = corpus[np.random.randint(0, num_steps):]
    # Subtract 1 because the last element can only serve as a label.
    num_subseqs = (len(corpus) - 1) // num_steps
    # Start indices of the subsequences of length num_steps.
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # With random sampling, subsequences from two adjacent mini-batches are
    # not necessarily adjacent in the original sequence.
    np.random.shuffle(initial_indices)

    def data(pos):
        # Return the window of length num_steps that starts at pos.
        return corpus[pos: pos + num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size * num_batches, batch_size):
        # initial_indices holds the shuffled start indices of the subsequences.
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield np.array(X), np.array(Y)
    

# Arguments follow the (corpus, batch_size, num_steps) signature; the original
# call passed 64 as the corpus and the vocabulary as the batch size.
# NOTE(review): the generator is created but never consumed here, and
# Vocab.__getitem__ does not accept slice objects, so iterating it will likely
# need a plain token sequence as corpus. Confirm what should be passed.
seq_data_iter_random(corpus=train_table, batch_size=64, num_steps=5)

<generator object seq_data_iter_random at 0x7f348d245bc0>