In [None]:
import json
import jieba
import pandas

In [None]:
# Load the train and test datasets.
# The encoding is passed explicitly: without it, open() decodes with the
# platform's default encoding, which is not UTF-8 on every system.
# Assumes the JSON files are UTF-8 encoded (the standard JSON encoding).
with open("./dataset/HPD/cn_train_set.json", 'r', encoding='utf-8') as file:
    train_file = json.load(file)
with open("./dataset/HPD/cn_test_set.json", 'r', encoding='utf-8') as file:
    test_file = json.load(file)

In [None]:
# Load the stop-word list: one entry per line of the file, with newline
# characters removed, collected into a set for fast membership tests.
with open('dataset/停用词表.txt', encoding='utf-8') as f:
    stop_words = {entry.replace("\n", "") for entry in f.readlines()}

In [None]:
# Tokenize the dialogues and count how often each word occurs.
def count_dialogue_words(dataset, stop_words):
    """Count the jieba tokens found in every dialogue line of ``dataset``.

    Args:
        dataset: Mapping whose values each provide a "对话历史" entry that
            iterates over dialogue lines (each line is passed to
            ``jieba.cut``).
        stop_words: Container of tokens that must not be counted.

    Returns:
        A plain dict mapping each kept token to its number of occurrences,
        with keys in order of first appearance.
    """
    counts = {}
    for key in dataset:
        for line in dataset[key]["对话历史"]:
            for token in jieba.cut(line, cut_all=False):
                # Skip stop words and single-character tokens.
                if token not in stop_words and len(token) > 1:
                    counts[token] = counts.get(token, 0) + 1
    return counts


train_words = count_dialogue_words(train_file, stop_words)
test_words = count_dialogue_words(test_file, stop_words)

In [None]:
# Turn each word-count dict into a two-column table:
# "Word" holds the dict keys and "Num" holds the matching values.
train_df = pandas.DataFrame(
    {"Word": list(train_words.keys()), "Num": list(train_words.values())}
)
test_df = pandas.DataFrame(
    {"Word": list(test_words.keys()), "Num": list(test_words.values())}
)

In [None]:
# Display the number of non-NA entries in each column of train_df.
train_df.count()

In [None]:
# Display the number of non-NA entries in each column of test_df.
test_df.count()

In [None]:
# Build the vocabulary
class Vocab:
    """Frequency-sorted vocabulary built from a word-count table.

    The rows that survive filtering are stored in ``tokens_sort``, a
    DataFrame sorted by descending "Freq" with a fresh 0..n-1 index.
    """

    def __init__(self, tokens=None, min_freq=0) -> None:
        """Compute frequencies, drop the rarest rows and sort the rest.

        Args:
            tokens: DataFrame with a numeric "Num" column holding one count
                per row. It is not modified. ``None`` builds an empty
                vocabulary.
            min_freq: Rows whose "Freq" is less than or equal to this value
                are dropped. A falsy value (the default ``0``) uses the
                smallest "Freq" of the table as the threshold, so the
                least frequent rows are removed.
        """
        if tokens is None:
            # Keep the instance usable: len() is 0 and lookups return None.
            self.tokens_sort = pandas.DataFrame(columns=["Word", "Num", "Freq"])
            return

        # Work on a copy so the caller's DataFrame is left untouched and
        # building the vocabulary twice from the same table gives the same
        # result.
        tokens = tokens.copy()

        # Compute the frequency as count / number of rows * 100.
        # NOTE(review): the denominator is the number of rows, not the sum
        # of all counts, so this is not a share of all occurrences -
        # confirm this is intended.
        tokens["Freq"] = (tokens["Num"] / tokens["Num"].count()) * 100
        print(tokens["Freq"].describe())

        if not min_freq:
            min_freq = tokens["Freq"].min()

        # Drop the rows whose frequency is too low (Freq <= min_freq).
        kept = tokens[~(tokens["Freq"] <= min_freq)]
        # Sort by frequency, highest first, and renumber the rows.
        self.tokens_sort = kept.sort_values(by='Freq', ascending=False).reset_index(drop=True)

    def __len__(self):
        """Return the number of rows kept in the vocabulary."""
        return len(self.tokens_sort)

    def __getitem__(self, index):
        """Return the row at position ``index``, or None when out of range.

        Negative positions are treated as out of range.
        """
        if index < 0 or index >= len(self.tokens_sort):
            return None
        return self.tokens_sort.iloc[index]

    def __iter__(self):
        """Yield the vocabulary rows from most to least frequent."""
        # An explicit iterator is required: __getitem__ returns None instead
        # of raising IndexError past the end, so the implicit
        # sequence-iteration protocol would never terminate.
        for position in range(len(self.tokens_sort)):
            yield self.tokens_sort.iloc[position]

    @property
    def unk(self):
        """Index used for the unknown token (always 0)."""
        return 0

    @property
    def token_freqs(self):
        """The "Freq" column of the sorted vocabulary table."""
        return self.tokens_sort["Freq"]

# Build one Vocab per word-count table, using the default min_freq.
train_set = Vocab(train_df)
test_set= Vocab(test_df)