In [1]:
import pandas as pd
import numpy as np
import datetime
import math
import jieba
import jieba.posseg as psg
from gensim import corpora, models
from jieba import analyse
import functools


In [2]:
# Load the corpus: fields are separated by the \x01 control character and the
# file has no header row, so the three columns are named explicitly.
all_docs_df = pd.read_csv('../../data/chusai/all_docs.txt', sep='\001', header=None)
all_docs_df.columns = ['id', 'title', 'text']
# Empty fields are parsed as NaN. Replace them with '' before casting, otherwise
# astype(str) turns them into the literal word "nan", which would later be
# tokenised and could be picked as a keyword.
all_docs_df['title'] = all_docs_df['title'].fillna('').astype(str)
all_docs_df['text'] = all_docs_df['text'].fillna('').astype(str)
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
all_docs_df.info()
print(all_docs_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108295 entries, 0 to 108294
Data columns (total 3 columns):
id       108295 non-null object
title    108295 non-null object
text     108295 non-null object
dtypes: object(3)
memory usage: 2.5+ MB
None
        id                title  \
0  D000001   林志颖老婆深夜敷面膜，睫毛太长好吓人   
1  D000002   小s夸杨幂身材好，杨幂回复太精彩了！   
2  D000003    年轻时的她风华绝代，现却无人送祝福   
3  D000004   林心如屡曝霍建华私生活被怼蹭老公人气   
4  D000005  曾是TVB颜值担当，近照曝光发现真老了   

                                                text  
0  早年林志颖带kimi上《爸爸去哪儿》的时候，当时遮遮掩掩的林志颖老婆低调探班，总让人觉得格外...  
1  翩若惊鸿，婉若游龙。曹植形容洛神的这两句，实在太抽象，以至于始终寻不到承受对象。直到在《大军...  
2  上个世纪香港影视界涌现出了不少高颜值女星，在《大话西游之月光宝盒》中饰演春三十娘和蜘蛛精的蓝...  
3  霍建华林心如1905电影网讯近日，林心如在接受采访时爆料称老公霍建华会主动向女儿索吻，笑称他...  
4  不知道有多少人是看TVB剧集长大的，小时候我每一天晚上都会守着电视看TVB剧集的。可以说对于...  


In [3]:
# Stop-word list loader
def get_stopword_list(stop_word_path='../stopword.txt', encoding=None):
    """Read a stop-word file and return its entries as a list of strings.

    The file is expected to hold one stop word per line; the trailing newline
    of each line is removed and nothing else is stripped.

    Args:
        stop_word_path: Path of the stop-word file. Defaults to the location
            the notebook has always used.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the previous version relied on;
            pass e.g. ``'utf-8'`` to make the decoding explicit.

    Returns:
        list[str]: The stop words in file order.
    """
    # The context manager guarantees the handle is closed even if decoding fails.
    with open(stop_word_path, encoding=encoding) as stop_word_file:
        return [line.replace('\n', '') for line in stop_word_file]

# Segmentation helper built on the jieba API
def seg_to_list(sentence, pos=False):
    """Segment ``sentence`` with jieba.

    Args:
        sentence: Text to segment.
        pos: When falsy, ``jieba.cut`` is used; when truthy,
            ``jieba.posseg.cut`` is used so part-of-speech information is
            available to the caller.

    Returns:
        Whatever the selected jieba ``cut`` function returns for ``sentence``.
    """
    # Pick the segmenter first, then make a single call.
    segmenter = psg.cut if pos else jieba.cut
    return segmenter(sentence)

# Remove noise words
def word_filter(seg_list, stopword_list, pos=False):
    """Keep only the informative tokens of a segmented sentence.

    A token is kept when it is longer than one character and is not a stop
    word. With ``pos=True`` it must additionally carry a part-of-speech flag
    starting with ``'n'``.

    Args:
        seg_list: Iterable of tokens. Plain strings when ``pos`` is falsy;
            objects exposing ``.word`` and ``.flag`` when ``pos`` is truthy.
        stopword_list: Collection of stop words (list, set, ...).
        pos: Whether to apply the part-of-speech filter.

    Returns:
        list[str]: The surviving words, in their original order (duplicates kept).
    """
    # Membership in a list is a linear scan per token; a set makes it O(1).
    # An existing set is reused as-is so callers can avoid the conversion.
    if isinstance(stopword_list, (set, frozenset)):
        stopwords = stopword_list
    else:
        stopwords = set(stopword_list)

    filter_list = []
    for seg in seg_list:
        if pos:
            word = seg.word
            # Only flags starting with 'n' pass the part-of-speech filter.
            if not seg.flag.startswith('n'):
                continue
        else:
            # No POS filtering requested: every token is a candidate.
            word = seg
        # Drop stop words and tokens shorter than two characters.
        if len(word) > 1 and word not in stopwords:
            filter_list.append(word)

    return filter_list

def jieba_word_deal(sentence, stopword_list, pos=False):
    """Segment ``sentence`` and return only its non-noise words.

    This is the composition of ``seg_to_list`` (segmentation) and
    ``word_filter`` (stop-word / length / optional part-of-speech filtering);
    ``pos`` is forwarded unchanged to both.

    Returns:
        The list produced by ``word_filter``.
    """
    return word_filter(seg_to_list(sentence, pos), stopword_list, pos)

# Load the stop words once; every column below is tokenised against the same list.
stopword_list = get_stopword_list()


def _tokenize_plain(raw_text):
    """Tokenise ``raw_text`` without part-of-speech filtering."""
    return jieba_word_deal(raw_text, stopword_list, False)


# Token lists for the title, for the body, and for "title。body" as one text.
all_docs_df['title_list'] = all_docs_df['title'].map(_tokenize_plain)
all_docs_df['text_list'] = all_docs_df['text'].map(_tokenize_plain)
all_docs_df['title_text'] = all_docs_df['title'] + '。' + all_docs_df['text']
all_docs_df['title_text_list'] = all_docs_df['title_text'].map(_tokenize_plain)
print(all_docs_df.head())


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.152 seconds.
Prefix dict has been built succesfully.


        id                title  \
0  D000001   林志颖老婆深夜敷面膜，睫毛太长好吓人   
1  D000002   小s夸杨幂身材好，杨幂回复太精彩了！   
2  D000003    年轻时的她风华绝代，现却无人送祝福   
3  D000004   林心如屡曝霍建华私生活被怼蹭老公人气   
4  D000005  曾是TVB颜值担当，近照曝光发现真老了   

                                                text  \
0  早年林志颖带kimi上《爸爸去哪儿》的时候，当时遮遮掩掩的林志颖老婆低调探班，总让人觉得格外...   
1  翩若惊鸿，婉若游龙。曹植形容洛神的这两句，实在太抽象，以至于始终寻不到承受对象。直到在《大军...   
2  上个世纪香港影视界涌现出了不少高颜值女星，在《大话西游之月光宝盒》中饰演春三十娘和蜘蛛精的蓝...   
3  霍建华林心如1905电影网讯近日，林心如在接受采访时爆料称老公霍建华会主动向女儿索吻，笑称他...   
4  不知道有多少人是看TVB剧集长大的，小时候我每一天晚上都会守着电视看TVB剧集的。可以说对于...   

                      title_list  \
0  [林志颖, 老婆, 深夜, 面膜, 睫毛, 太长, 吓人]   
1          [夸杨, 身材, 杨幂, 回复, 太精彩]   
2            [年轻, 风华绝代, 无人, 送祝福]   
3     [林心如, 曝霍, 建华, 私生活, 老公, 人气]   
4     [TVB, 颜值, 担当, 照曝光, 发现, 真老]   

                                           text_list  \
0  [早年, 林志颖, kimi, 爸爸, 当时, 遮遮掩掩, 林志颖, 老婆, 低调, 探班,...   
1  [翩若惊鸿, 婉若游龙, 曹植, 形容, 洛神, 两句, 实在, 抽象, 始终, 不到, 承...   
2  [上个世纪, 香港, 影视界, 涌现出, 不少, 高颜值, 女星, 大话西游, 月光宝盒, ...   
3  [建华, 林心如

In [4]:
# Comparator used to order (word, score) pairs when picking the top-K keywords
def cmp(e1, e2):
    """Three-way comparison of two ``(word, score)`` pairs.

    The scores (index 1) decide the order. When they are equal, the tie is
    broken by comparing the two concatenations ``e1[0] + e2[0]`` and
    ``e2[0] + e1[0]``.

    Returns:
        A positive value if ``e1`` sorts after ``e2``, a negative value if it
        sorts before, and 0 if neither. When the scores differ, the value is
        whatever ``np.sign`` returns for their difference.
    """
    score_order = np.sign(e1[1] - e2[1])
    if score_order != 0:
        return score_order

    # Equal scores: fall back to the concatenation comparison.
    forward = e1[0] + e2[0]
    backward = e2[0] + e1[0]
    if forward > backward:
        return 1
    return 0 if forward == backward else -1



In [None]:
# Topic model
class TopicModel(object):
    """Keyword extraction by comparing word and document topic distributions.

    A gensim dictionary, TF-IDF model and topic model (LSI or LDA) are fitted
    on the tokenised corpus at construction time. The topic vector of every
    vocabulary word is cached in ``wordtopic_dic``; ``get_simword`` then ranks
    the words of a document by the cosine similarity between their topic
    vector and the topic vector of the document.
    """

    # Four arguments: the pre-processed data set, the number of keywords to
    # return, the concrete model ('LSI', anything else selects LDA) and the
    # number of topics.
    def __init__(self, doc_list, keyword_num, model='LSI', num_topics=10):
        # Use the gensim API to turn the texts into vectors.
        # First build the word space.
        self.dictionary = corpora.Dictionary(doc_list)
        # Vectorise every document with the bag-of-words model.
        corpus = [self.dictionary.doc2bow(doc) for doc in doc_list]
        # Re-weight every word by TF-IDF to obtain the weighted representation.
        self.tfidf_model = models.TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus]

        self.keyword_num = keyword_num
        self.num_topics = num_topics
        # Select which topic model to train.
        if model == 'LSI':
            self.model = self.train_lsi()
        else:
            self.model = self.train_lda()

        # Pre-compute the topic vector of every word in the corpus.
        word_dic = self.word_dictionary(doc_list)
        self.wordtopic_dic = self.get_wordtopic(word_dic)

    def train_lsi(self):
        """Train and return an LSI model on the TF-IDF corpus."""
        lsi = models.LsiModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=self.num_topics)
        return lsi

    def train_lda(self):
        """Train and return an LDA model on the TF-IDF corpus."""
        lda = models.LdaModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=self.num_topics)
        return lda

    def get_wordtopic(self, word_dic):
        """Map every word of ``word_dic`` to the topic vector of the one-word document ``[word]``."""
        wordtopic_dic = {}

        for word in word_dic:
            single_list = [word]
            wordcorpus = self.tfidf_model[self.dictionary.doc2bow(single_list)]
            wordtopic = self.model[wordcorpus]
            wordtopic_dic[word] = wordtopic
        return wordtopic_dic

    @staticmethod
    def _cosine_similarity(topics_a, topics_b):
        """Cosine similarity of two sparse topic vectors.

        Args:
            topics_a, topics_b: Iterables of ``(topic_id, weight)`` pairs.
                Topics missing from a vector are treated as weight 0, so the
                two vectors do not need the same length or ordering.

        Returns:
            float: ``dot(a, b) / (|a| * |b|)``, or 0.0 when either vector has
            zero norm (including the empty vector).
        """
        weights_a = dict(topics_a)
        weights_b = dict(topics_b)
        # Align by topic id: pairing the lists by position would be wrong as
        # soon as one of them omits a topic.
        dot = sum(weight * weights_b[topic]
                  for topic, weight in weights_a.items() if topic in weights_b)
        norm_a = sum(weight * weight for weight in weights_a.values())
        norm_b = sum(weight * weight for weight in weights_b.values())
        denominator = math.sqrt(norm_a * norm_b)
        return dot / denominator if denominator != 0.0 else 0.0

    # Compute the similarity between each word's topic distribution and the
    # document's, and keep the keyword_num most similar words as keywords.
    def get_simword(self, word_list):
        """Return the top ``keyword_num`` keywords of a tokenised document.

        Args:
            word_list: The document as a list of words.

        Returns:
            dict: ``{word: similarity}`` for the best-scoring words, inserted
            in descending rank order (as defined by the module-level ``cmp``
            comparator). Words without a cached topic vector are ignored.
        """
        sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
        senttopic = self.model[sentcorpus]

        # Score only the distinct words of this document. Looking them up in
        # the cache avoids scanning the whole vocabulary for every document;
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        sim_dic = {}
        for word in dict.fromkeys(word_list):
            wordtopic = self.wordtopic_dic.get(word)
            if wordtopic is None:
                continue
            sim_dic[word] = self._cosine_similarity(wordtopic, senttopic)

        ranked = sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        return {word: float(sim) for word, sim in ranked[:self.keyword_num]}

    # Word-space construction and vectorisation, i.e. the generic way of doing
    # it when the gensim API is not used.
    def word_dictionary(self, doc_list):
        """Return the distinct words of ``doc_list`` as a list (order unspecified)."""
        vocabulary = set()
        for doc in doc_list:
            vocabulary.update(doc)

        return list(vocabulary)

    def doc2bowvec(self, word_list):
        """Return a 0/1 presence vector of ``word_list`` over the dictionary.

        Position ``i`` of the result corresponds to the i-th token of
        ``self.dictionary.token2id`` when sorted by token id.
        """
        # Iterating a gensim Dictionary yields integer token ids, not tokens,
        # so comparing those against words can never match. Walk the
        # token -> id mapping instead.
        present = set(word_list)
        tokens_by_id = sorted(self.dictionary.token2id.items(), key=lambda item: item[1])
        vec_list = [1 if token in present else 0 for token, _ in tokens_by_id]
        return vec_list


In [None]:
# Fit the LSI variant on the combined title+text tokens (5 keywords per
# document), then extract the keyword dict of every document.
topic_lsi_model = TopicModel(all_docs_df['title_text_list'], 5, 'LSI')
all_docs_df['result_lsi_dict'] = all_docs_df['title_text_list'].map(topic_lsi_model.get_simword)
print(all_docs_df.head())



In [None]:
# Same pipeline with the LDA variant: fit on the combined title+text tokens
# (5 keywords per document) and extract the keyword dict of every document.
topic_lda_model = TopicModel(all_docs_df['title_text_list'], 5, 'LDA')
all_docs_df['result_lda_dict'] = all_docs_df['title_text_list'].map(topic_lda_model.get_simword)
print(all_docs_df.head())



In [None]:
# Read sample.csv and attach the document columns (including the keyword
# results) to each of its rows by id; the left join keeps every sample row.
sample_df = pd.read_csv('../../result/chusai/sample.csv', encoding='ISO-8859-1')
print(sample_df.shape[0])
sample_df = sample_df.merge(all_docs_df, how='left', on='id')
print(sample_df.head())


In [None]:
def get_top_n_word(result_dict, n):
    """Return the n-th keyword (1-based) of a keyword result dict.

    Args:
        result_dict: Dict whose keys are keywords in rank order (insertion
            order). Anything that is not a dict, e.g. the NaN a left merge
            leaves for ids without a matching document, is treated as
            "no keywords".
        n: 1-based rank of the keyword to return.

    Returns:
        str: The n-th key, or ``''`` when there is no such keyword.
    """
    # Guard against missing results and against n < 1, which would otherwise
    # index from the end of the list (n=0 -> keys[-1]).
    if not isinstance(result_dict, dict) or n < 1:
        return ''
    keys = list(result_dict)
    return keys[n - 1] if n <= len(keys) else ''
    
# Export the prediction results
def exportResult(df, fileName, model):
    """Write the two best keywords of every row to a CSV file.

    The keyword dicts are read from ``result_lsi_dict`` when ``model`` is
    ``'LSI'`` and from ``result_lda_dict`` otherwise. ``df`` is modified in
    place: its ``label1`` and ``label2`` columns are (re)assigned. The first
    rows of ``df`` are printed, then the columns ``id``, ``label1`` and
    ``label2`` are written, with a header and without the index, to
    ``../../result/chusai/<fileName>.csv``.
    """
    if model == 'LSI':
        keyword_dicts = df['result_lsi_dict']
    else:
        keyword_dicts = df['result_lda_dict']

    df['label1'] = keyword_dicts.map(lambda keywords: get_top_n_word(keywords, 1))
    df['label2'] = keyword_dicts.map(lambda keywords: get_top_n_word(keywords, 2))
    print(df.head())

    submission = df[['id', 'label1', 'label2']]
    submission.to_csv('../../result/chusai/%s.csv' % fileName, header=True, index=False)
    
# Export one result file per model, LSI first and then LDA.
for export_name, export_model in (('topic_lsi_baseline_9_3_4', 'LSI'),
                                  ('topic_lda_baseline_9_3_5', 'LDA')):
    exportResult(sample_df, export_name, export_model)
