In [4]:
import numpy as np
import pandas as pd
import scipy.sparse
from bisect import bisect_left
import matplotlib.pyplot as plt
import openpyxl

In [24]:
class PoemDatabase():
    def cut(string, word_set, max_length):
        # 对string根据word_set分词，最长词语长度为max_length
        cut_list = []

        while string != "":
            for length in range(max_length, 0, -1):
                word = string[: length]
                if word in word_set or length == 1:
                    cut_list.append(word)
                    string = string[length: ]
                    break

        return cut_list
    
    def __init__(self):
        # 初始化PoemDatabase对象
        
        # 读取wordlist，求最大词长
        self.word_list_df = pd.read_excel("wordlist_v2.xlsx")
        self.word_list = self.word_list_df["word"].tolist()
        self.word_list.sort()  # WARNING! 这里排序后与原词表顺序不符！以后统一使用word_list
        self.max_length = (self.word_list_df["word"].apply(lambda x: len(x))).max()
        self.word_number = len(self.word_list)
        
        # 读取所有诗歌，并分词
        self.table = pd.read_excel("poem_v2.xlsx")
        word_set = set(self.word_list)  # hash acceleration
        self.table['words'] = self.table['content'].apply(lambda x: ' '.join(PoemDatabase.cut(x, word_set, self.max_length)))
        
        # 建立poemlist
        self.poem_list = list(self.table["Poem_id"].apply("unique"))
        self.poem_list.sort()
        self.poem_number = len(self.poem_list)
        
        # 根据诗人检索诗歌编号
        self.author_table = self.table[self.table["line_number"] == -1]
        self.author_table.loc[:, "content"] = self.author_table["content"].apply(lambda x: x.strip('$'))
        self.author_table = self.author_table[["Poem_id", "content"]]
        self.author_table = self.author_table.groupby("content")["Poem_id"].agg(list)
        
        # 计算倒排索引（根据单词检索诗歌编号）、低频词编号表、tf-idf矩阵、单词相似矩阵
        self.invertidx = None
        self.drop_indexs = None
        self.tfidf_sparse = None
        self.sim_sparse = None
        self.cal_tfidf()
        self.cal_sim_sparse()
        
    
    def cal_tfidf(self):
        """
        计算文档的倒排索引、低频词表、tf-idf值
        """
        # 计算term-document矩阵的tf-idf值
        split_words = self.table['words'].str.split(' ', expand=True).stack().rename('word').reset_index()
        new_data = pd.merge(self.table['Poem_id'], split_words, left_index=True, right_on='level_0')
        
        # 首先求单词的倒排索引
        self.invertidx = new_data.groupby("word")["Poem_id"].agg("unique")
        self.invertidx = pd.DataFrame(self.invertidx)
        self.invertidx = self.invertidx.reindex(self.word_list)  # 清理掉不在词表中的词，并重新排序
        isnull = self.invertidx["Poem_id"].isnull()
        self.invertidx.loc[isnull, 'Poem_id'] = [ [[]] * isnull.sum() ]   # 填充从未出现的词
        self.invertidx = self.invertidx["Poem_id"]
        
        # 由倒排索引可求单词的tf_total, idf值，索引都是单词的编号
        self.tf_total_df = self.invertidx.apply(lambda x: len(x))
        self.tf_total_df = self.tf_total_df.reset_index(drop=True)
        self.drop_indexs = self.tf_total_df[self.tf_total_df < 10].index
        idf_df = self.tf_total_df.apply(lambda x: np.log((self.poem_number + 1) / (x + 1)))  # smoothed
        self.idf_df = idf_df
        
        # 求单词的tf值，存储三元组(word, Poem_id, frequency)
        tf_df = new_data[["Poem_id", "word"]].groupby(["Poem_id", "word"]).size().reset_index()
        tf_df = tf_df.rename(columns = {0: 'frequency', "word":"word", "Poem_id":"Poem_id"})
        tf_df = tf_df[tf_df["word"].isin(self.word_list)]
        tf_df["word"] = tf_df["word"].apply(lambda x: bisect_left(self.word_list, x))  # 二分查找单词的编号
        tf_df["Poem_id"] = tf_df["Poem_id"].apply(lambda x: bisect_left(self.poem_list, x))  # 二分查找诗歌的编号
        self.tf_df = tf_df
        
        # 用系数矩阵计算tf-idf值，每个行向量可以用作词向量
        idf_sparse = scipy.sparse.coo_matrix(idf_df)
        tf_sparse = scipy.sparse.coo_matrix((tf_df["frequency"], (tf_df["word"], tf_df["Poem_id"])), shape=(self.word_number, self.poem_number))
        self.tfidf_sparse = tf_sparse.multiply(idf_sparse.T)
        
        return
    
    
    def cal_sim_sparse(self):
        """
        求词的相似矩阵。
        SVD可以做LSA增强robustness，但是不用sparse就太慢了，不值得
        """
        self.sim_sparse = self.tfidf_sparse.dot(self.tfidf_sparse.T)
        self.tfidf_norm_reciprocal = np.reciprocal(np.sqrt(self.tfidf_sparse.power(2).sum(axis=1)))
        self.sim_sparse = self.sim_sparse.multiply(self.tfidf_norm_reciprocal)  # 有0报warning没关系，稀疏的不会算
        self.sim_sparse = self.sim_sparse.multiply(self.tfidf_norm_reciprocal.T)

    
    def find_simword(self, keyword):
        """
        查找关键词的近义词，返回近义词及其相似度的dataframe
        """
        sim_df = pd.DataFrame(columns=["similarity"])
        if keyword not in self.word_list:
            return sim_df           
        
        # 选取近义词
        key_index = bisect_left(self.word_list, keyword)
        simvec_sparse = self.sim_sparse.getcol(key_index)
        sim_df = pd.DataFrame(simvec_sparse.toarray()).rename(columns={0: "similarity"})
        if key_index in self.drop_indexs:
            sim_df = sim_df.loc[[key_index]]
            return sim_df
        
        # 做近义词筛选，drop掉总词频不足10的
        sim_df = sim_df.drop(self.drop_indexs)
        sim_df = sim_df.sort_values(by=["similarity"], ascending=False)
        
        # 根据evaluate_similarity，取相似度大于0.1的前三个单词（也包括它自己）
        sim_df = sim_df[sim_df["similarity"] >= 0.1]
        if len(sim_df.index) > 3:
            sim_df = sim_df.iloc[:3]
        
        return sim_df
    
    
    def query_poem(self, author="", keyword=[]):
        """
        根据诗人和给定的关键词，查找包含这些内容的诗歌，并返回关键词及其相似词（图省事了……）
        其中，author为诗人姓名，keyword为关键词
        如果不指定author，则返回所有包含keyword及其近义词的诗歌；否则只返回该author的诗
        返回诗歌的顺序：根据诗歌包含的keyword及其近义词的相似度加权降序排列
        """
        
        poem_indexs = pd.DataFrame(index=self.poem_list, columns=["power"])
        
        if author != "" and author in self.author_table.index:
            author_list = pd.DataFrame(self.author_table.loc[author]).set_index(0)
            author_list["power"] = 0
            author_list = author_list.reindex(self.poem_list)
            poem_indexs["power"] = author_list["power"]
        else:
            poem_indexs["power"] = poem_indexs["power"].fillna(0)        
        
        simword = []
        if keyword != []:
            key_indexs = pd.concat([self.find_simword(x) for x in keyword])
            simword = [self.word_list[idx] for idx in key_indexs.index]
            for idx in key_indexs.index:
                word_indexs = pd.DataFrame(self.invertidx.loc[self.word_list[idx]]).set_index(0)
                word_indexs["power"] = key_indexs.loc[idx, "similarity"]
                word_indexs = word_indexs.reindex(self.poem_list).fillna(0)
                poem_indexs["power"] = poem_indexs["power"] + word_indexs["power"]
        
        poem_indexs.dropna() # 根据诗人drop
        poem_indexs = poem_indexs[poem_indexs.power > 0]  # 根据相似度drop      
        
        res_table = self.table[self.table.Poem_id.isin(list(poem_indexs.index))][["Poem_id", "content"]]
        res_table["content"] = res_table["content"].apply(lambda x: x.strip("#$"))  # 去掉没用的符号
        res_table = res_table.groupby("Poem_id").agg(list)  # 存成列表，打印时还得处理一下子
        res_table["power"] = poem_indexs["power"]
        res_table = res_table.sort_values("power", ascending=False)
        
        return res_table, simword
    
    
    def evaluate_similarity(self):
        # 确定近义词的阈值的方案：
        sim_df = pd.DataFrame(self.sim_sparse.toarray())
        sim_df = sim_df.drop(self.drop_indexs)
        sim_df = sim_df.T.drop(self.drop_indexs)
        available_word_number = len(sim_df.index)  # 被统计的单词数
        
        print("只考虑相似度在(0, 1)的高频词对，即那些可能成为近义词的词")
        sim_df = sim_df.applymap(lambda x: 0 if x > 0.9999 else x)
        sim_data = np.array(sim_df)
        sim_data = sim_data[sim_data > 0]
        sim_data = sim_data.flatten()
        available_pair_number = len(sim_data)
        
        print("先看一下整体情况")
        plt.hist(sim_data, bins=100, density=True, stacked=True)
        plt.xlabel("similarity")
        plt.ylabel("frequency")
        plt.title("statistics on words' similarity")
        plt.show()
        
        print("绝大部分词的相似度小于0.2。暂时忽略相似度大于0.2的词对是合理的")
        n, bins, _ = plt.hist(sim_data, bins=100, range=(0, 0.2), density=True, cumulative=True)
        plt.xlabel("similarity")
        plt.ylabel("frequency")
        plt.title("statistics on words' similarity (< 0.2, cumulative)")
        plt.show()
        
        print("取相似度阈值为0.1，则两个词的相似度超过了99%的相似词对，我们认为这样的词对是近义词")
        print("取如此高的阈值，主要是为了选取适当数目的近义词，尽量控制在个位数范围内")
        print("接下来核实匹配比例，并对近义词匹配数量的期望进行统计")
        threshold_data = sim_data[sim_data > 0.1]
        threshold_pair_number = len(threshold_data)
        print(f"Similar word-pair proportion: {threshold_pair_number / available_pair_number}")
        print(f"Expected similar word number: {threshold_pair_number / available_word_number} ")
        
        print("平均每个词会匹配到3个左右近义词，是一个理想的比例")
        print("对于那些近义词数量较多的，我们最终只取相似度最高的前三个；对于近义词数量较少的，我们就不取了")
        

    def cal_relative_sparse(self):
        """
        计算篇章的相似度，用于HITS算法
        """
        self.relative_sparse = self.tfidf_sparse.T.dot(self.tfidf_sparse)
        self.tfidf_T_norm_reciprocal = np.reciprocal(np.sqrt(self.tfidf_sparse.T.power(2).sum(axis=1)))
        self.relative_sparse = self.relative_sparse.multiply(self.tfidf_T_norm_reciprocal)  # 有0报warning没关系，稀疏的不会算
        self.relative_sparse = self.relative_sparse.multiply(self.tfidf_T_norm_reciprocal.T)
        
    
    def store_tfidf(self):
        scipy.sparse.save_npz('tfidf_sparse.npz', self.tfidf_sparse)
        # self.tfidf_sparse = sparse.load_npz('tfidf_sparse.npz')读取
        # excel和csv炸内存了

In [25]:
# debugging
poem_db = PoemDatabase()
poem_db.store_tfidf()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
  self.tfidf_norm_reciprocal = np.reciprocal(np.sqrt(self.tfidf_sparse.power(2).sum(axis=1)))


In [29]:
poem_db.query_poem(author="", keyword=["龜魚", "月"])

(                                                   content     power
 Poem_id                                                             
 22530    [八月十五夜臥疾, 熊孺登, 一年只有今宵月, 盡上江樓獨病眠, 寂寞竹窗閒不閉, 夜深斜影...  1.315837
 22312      [八月望夕雨, 徐凝, 今年八月十五夜, 寒雨蕭蕭不可聞, 如鏈如霜在何處, 吳山越水萬重雲]  1.315837
 22318      [八月十五夜, 徐凝, 皎皎秋空八月圓, 常娥端正桂枝鮮, 一年無似如今夜, 十二峰前看不眠]  1.315837
 16278                [春江曲, 王涯, 颻漾越江春, 相將採白蘋, 歸時不覺夜, 出浦月隨人]  1.174230
 20389       [秋房夜, 白居易, 雲露青天月漏光, 中庭立久卻歸房, 水簾席冷未能臥, 挑盡殘燈秋夜長]  1.174230
 ...                                                    ...       ...
 18994    [桐孫詩  并序 (此后元和十年詔召入京 及通州司馬以後詩), ％元和五年。予貶劇江陵。三月...  0.141607
 31466    [小遊仙詩九十八首 2, 曹唐, 上元元日豁明堂, 五帝望空拜玉皇, 萬樹琪花千圃藥, 心知...  0.141607
 31465    [小遊仙詩九十八首 1, 曹唐, 玉簫金瑟發商聲, 桑葉枯乾海水清, 淨掃蓬萊山下路( 一作...  0.141607
 19863    [送元八歸鳳翔, 白居易, 莫道岐州三日程, 其如風雪一身行, 與君況是經年別, 暫到城來又出城]  0.141607
 31543    [小游仙詩九十八首 80, 曹唐, 玉洞長春風景鮮, 丈人私宴就芝田, 笙歌暫向花間盡, 便...  0.141607
 
 [922 rows x 2 columns],
 ['龜魚', '月', '夜', '八'])