In [1]:
# -*- coding: utf-8 -*-
import os
import jieba
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import logging
logging.basicConfig(level=logging.INFO)

# 对每段截取的不少于500字的文档进行处理，具有分隔句子与分词的函数
class Paragraph:
    def __init__(self, txtname='', content='', sentences=[], words=''):
        self.fromtxt = txtname
        self.content = content
        self.sentences = sentences
        self.words = words
        global punctuation
        self.punctuation = punctuation
        global stopwords
        self.stopwords = stopwords
    # 依据标点符号分隔句子
    def sepSentences(self):
        line = ''
        sentences = []
        for w in self.content:
            if w in self.punctuation and line != '\n':
                if line.strip() != '':
                    sentences.append(line.strip())
                    line = ''
            elif w not in self.punctuation:
                line += w
        self.sentences = sentences
    
    # 使用停用词分词
    def sepWords(self):
        words = []

        for i in range(len(self.sentences)):
            words.extend([x for x in jieba.cut(
                self.sentences[i]) if x not in self.stopwords])

        reswords = ' '.join(words)
        self.words = reswords

    def processData(self):
        self.sepSentences()
        self.sepWords()

# 总共三部小说，对每部小说，取不少于100字的段落100段，保存到DataExcel中的data.xlsx文件
def txt_convert_2_excel(file_path, data_path, K=3):
    files = os.listdir(file_path)    
    txt = []
    txtname = []
    n = 80 # 取不少于200字的段落80段
 
    for file in files:
        filename = os.path.join(file_path, file)
        with open(filename, 'r', encoding='utf-8') as f:
            full_txt = f.readlines()
            lenth_lines = len(full_txt)
            i = 2
            for j in range(n):
                txt_j = ''
                while(len(txt_j) <200):
                    txt_j += full_txt[i]
                    i += 1
                txt.append(txt_j)
                txtname.append(file.split('.')[0])
                #i += int(lenth_lines / (3 * n))

    dic = {'Content': txt, 'Txtname': txtname}
    df = pd.DataFrame(dic)
    out_path = data_path+'\\data.xlsx'
    df.to_excel(out_path, index=False)

    return out_path


def segment(path):
    data_list = []
    data_all = pd.read_excel(path)
    for i in range(len(data_all['Content'])):
        d = Paragraph()
        d.content = data_all['Content'][i]
        d.fromtxt = data_all['Txtname'][i]
        d.processData()
        data_list.append(d.words)
    return data_list


def read_punctuation_list(path):
    punctuation = [line.strip()
                   for line in open(path, encoding='UTF-8').readlines()]
    punctuation.extend(['\n', '\u3000', '\u0020', '\u00A0'])
    return punctuation


def read_stopwords_list(path):
    stopwords = [line.strip()
                 for line in open(path, encoding='UTF-8').readlines()]
    return stopwords


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [2]:
file_dir_path = '.\\Corpus'
data_dir_path = '.\\DataExcel'
stopwords_path = '.\\StopWord\\cn_stopwords.txt'
punctuation_path = '.\\StopWord\\cn_punctuation.txt'
user_dict_path = '.\\StopWord\\新闻词库.txt'

# 读取文件
global stopwords
stopwords = read_stopwords_list(stopwords_path)
global punctuation
punctuation = read_punctuation_list(punctuation_path)
# 在结巴用户字典中添加三体词库
jieba.load_userdict(user_dict_path) 
print('\u3000', '\u0020', '\u00A0')
print([w for w in jieba.cut('中国共产党第二十次全国代表大会')])

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LEGION~1\AppData\Local\Temp\jieba.cache
DEBUG:jieba:Loading model from cache C:\Users\LEGION~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.756 seconds.
DEBUG:jieba:Loading model cost 0.756 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


　    
['中国共产党', '第二十次', '全国代表大会']


In [3]:
excel_path = txt_convert_2_excel(file_dir_path, data_dir_path)
# 得到分词好的训练预料
corpus = segment(excel_path)
print(corpus[-1])

两岸同胞 血脉相连 血浓于水 一家人 始终 尊重 关爱 造福 台湾同胞 继续 致力于 促进 两岸经济 文化 交流合作 深化 两岸 领域 融合 发展 完善 增进 台湾同胞 福祉 制度 政策 推动 两岸 共同 弘扬 中华文化 促进 两岸同胞 心灵 契合 台湾 中国 台湾 解决 台湾问题 中国 事 中国 人来 决定 坚持 最大 诚意 最大 努力争取 和平统一 前景 决不 承诺 放弃 使用 武力 保留 采取 必要措施 选项 外部 势力 干涉 极少数 台独 分裂 分子 分裂 活动 绝非 广大 台湾同胞 国家统一 民族复兴 历史 车轮 滚滚向前 祖国 完全 统一 一定 实现 一定 能够 实现


In [4]:
# LDA
logging.info('Training count vectorizer...')
cntVector = CountVectorizer(max_features=1000, max_df=0.2)
cntTf = cntVector.fit_transform(corpus)

print(cntVector.vocabulary_)
print(sorted(cntVector.vocabulary_.items(), key=lambda item:item[1])[-100:])

print('\n词袋矩阵的尺寸：',cntTf.shape)

INFO:root:Training count vectorizer...


{'同志': 324, '现在': 725, '代表': 102, '大会': 390, '中国共产党': 38, '进入': 938, '召开': 309, '十分': 281, '重要': 954, '主题': 63, '邓小平理论': 947, '三个代表': 11, '思想': 497, '科学发展观': 796, '指导': 540, '解放思想': 895, '改革开放': 565, '凝聚': 228, '力量': 259, '坚定不移': 357, '中国特色社会主义道路': 50, '前进': 257, '共同': 198, '艰苦奋斗': 874, '团结': 335, '带领': 448, '全国各族人民': 173, '中国': 37, '日益': 611, '走向': 919, '中华民族伟大复兴': 35, '展现出': 435, '前景': 256, '党和人民': 161, '创造': 245, '历史': 290, '倍加': 149, '确立': 766, '理想信念': 729, '坚定': 356, '责任': 908, '清醒': 699, '当前': 485, '继续': 844, '发生': 305, '深刻': 694, '变化': 307, '面临': 976, '风险': 986, '挑战': 541, '前所未有': 255, '全党': 169, '一定': 3, '牢记': 710, '工作': 438, '完成': 406, '时代': 613, '赋予': 917, '艰巨': 873, '任务': 107, '过去': 927, '五年': 72, '十年': 283, '基本': 369, '总结': 502, '十七大': 279, '经受': 832, '困难': 337, '考验': 858, '夺取': 395, '全面建设小康社会': 186, '胜利': 862, '推进改革': 550, '开放': 473, '社会主义现代化建设': 778, '宏伟目标': 408, '作出': 128, '贯彻': 911, '精神': 825, '中央': 51, '改进': 562, '新形势下': 600, '党的建设': 165, '制定': 249, '规划': 889, '文化': 58

In [5]:
logging.info('Training LDA model...')
lda = LatentDirichletAllocation(n_components=10,  max_iter=1000, random_state=2022)
docres = lda.fit_transform(cntTf)

# 查看LDA结果
feature_names = cntVector.get_feature_names()
print_top_words(lda, feature_names, 10)

INFO:root:Training LDA model...


Topic #0:
健康 社会 农村 农民 工作 人才 问题 实施 强化 引导
Topic #1:
中国 人类 重要 国家安全 社会主义核心价值观 马克思主义 思想 中国共产党 全国各族人民 文化自信
Topic #2:
保证 依法 发挥 重要 宪法 支持 法治 依法治国 工作 人大
Topic #3:
历史 实践 时代 新时代 中国 全党 中国特色社会主义 理论 伟大 始终
Topic #4:
文化 香港 实施 安全 澳门 科技 坚定 创新 一国两制 战略
Topic #5:
就业 教育 分配 解决 问题 鼓励 基本 机制 持续 居民收入
Topic #6:
基本 取得 世界 建成 水平 基础 强国 得到 大幅 生活
Topic #7:
保护 构建 优化 实施 生态 环境 建立 资源 绿色 节约
Topic #8:
建立 保障 基层 开放 统筹 优化 社会 创新 区域 基本
Topic #9:
问题 五年 不少 存在 面临 共同 一系列 工作 取得 解决




In [6]:
excel_path_18 = txt_convert_2_excel('.\\18', '.\\DataExcel_18')
corpus_18 = segment(excel_path_18)
print(corpus_18[-1])

三 积极 发展 党内民主 增强 党 创造 活力 党内民主 党 生命 坚持民主集中制 健全 党内民主制度体系 党内民主 带动 人民民主 保障 党员主体地位 健全 党员 民主权利 保障制度 开展 批评和自我批评 营造 党内民主 平等 同志 关系 民主讨论 政治氛围 民主监督的制度环境 落实 党员 知情权 参与权 选举权 监督权 完善 党的代表大会制度 提高 工人 农民 代表 比例 落实 完善 党 代表大会代表任期制 试行 乡镇党代会年会制 深化 县 市 区 党代会 常任制 试点 实行 党代会代表提案制 完善 党内选举制度 规范 差额提名 差额选举 形成 充分体现 选举人 意志 程序 环境 强化 全委会 决策 监督 作用 完善 常委会 议事规则 决策程序 完善 地方党委 讨论 决定 重大 问题 任用重要干部票决制 扩大 党内基层民主 完善 党员 定期 评议 基层党组织领导班子 制度 推行 党员 旁听 基层 党委 会议 党代会 代表 列席 同级 党委 会议 做法 增强 党内生活 原则性 透明度


In [7]:
# LDA
logging.info('Testing_18 count vectorizer...')
cntTf_18 = cntVector.transform(corpus_18)

INFO:root:Testing_18 count vectorizer...


In [8]:
import numpy as np	
# 构建文档-词频矩阵	
lda_output = lda.transform(cntTf_18)
# 列名	
topicnames = ["Topic" + str(i) 	
              for i in range(lda.n_components)]	
# 行索引名	
docnames = ["Doc" + str(i) 	
            for i in range(len(corpus_18))]	
# 转化为pd.DataFrame	
df_document_topic = pd.DataFrame(np.round(lda_output, 2), 	
                                 columns=topicnames,
                                 index=docnames)	
# Get dominant topic for each document	
dominant_topic = np.argmax(df_document_topic.values, axis=1)	
df_document_topic['dominant_topic'] = dominant_topic	
	
def make_bold(val):	
    weight = 700 if val > .5 else 400	
    return 'font-weight: {weight}'.format(weight=weight)	
# Apply Style	
df_document_topics = df_document_topic.style.applymap(make_bold)	
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,9
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,6
Doc3,0.0,0.0,0.0,0.0,0.0,0.57,0.41,0.0,0.0,0.0,5
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.97,0.0,0.0,0.0,6
Doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.0,0.53,0.0,8
Doc6,0.0,0.0,0.0,0.0,0.39,0.0,0.0,0.0,0.0,0.58,9
Doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,9
Doc8,0.0,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.46,3
Doc9,0.0,0.1,0.0,0.0,0.0,0.1,0.47,0.0,0.0,0.32,6


In [9]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")	
df_topic_distribution.columns = ['Topic Num', 'Num Documents']	
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,12
1,2,9
2,4,9
3,9,9
4,6,8
5,7,8
6,0,7
7,5,7
8,8,7
9,1,4


In [10]:
excel_path_19 = txt_convert_2_excel('.\\19', '.\\DataExcel_19')
# 得到分词好的训练预料
corpus_19 = segment(excel_path_19)
print(corpus_19[-1])
# LDA
cntTf_19 = cntVector.transform(corpus_19)

军队 准备 打仗 工作 必须 坚持战斗力标准 打仗 打胜仗 聚焦 扎实 做好 战略 方向 军事斗争准备 统筹 推进 传统安全 领域 新型 安全 领域 军事斗争准备 发展 新型 作战 力量 保障 力量 开展 实战 化 军事训练 加强 军事力量 运用 加快 军事 智能化 发展 提高 网络信息 体系 联合作战 能力 全域 作战 能力 有效 塑造 态势 管控 危机 遏制 战争 打赢 战争 坚持 富国和强军相统一 强化 统一 领导 顶层设计 改革创新 重大项目 落实 深化 国防科技工业 改革 形成 军民融合深度发展 格局 构建 一体化 国家战略体系和能力 完善 国防动员体系 建设 强大 稳固 现代 边海 空防 组建 退役军人 管理 保障 机构 维护 军人 军属 合法权益 军人 成为 全 社会 尊崇 职业 深化 武警部队 改革 建设现代化武装警察部队


In [11]:
# 构建文档-词频矩阵	
lda_output = lda.transform(cntTf_19)
# 列名	
topicnames = ["Topic" + str(i) 	
              for i in range(lda.n_components)]	
# 行索引名	
docnames = ["Doc" + str(i) 	
            for i in range(len(corpus_19))]	
# 转化为pd.DataFrame	
df_document_topic = pd.DataFrame(np.round(lda_output, 2), 	
                                 columns=topicnames,
                                 index=docnames)	
# Get dominant topic for each document	
dominant_topic = np.argmax(df_document_topic.values, axis=1)	
df_document_topic['dominant_topic'] = dominant_topic	
	
def make_bold(val):	
    weight = 700 if val > .5 else 400	
    return 'font-weight: {weight}'.format(weight=weight)	
# Apply Style	
df_document_topics = df_document_topic.style.applymap(make_bold)	
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.26,1
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,9
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,6
Doc3,0.0,0.0,0.32,0.0,0.0,0.0,0.4,0.26,0.0,0.0,6
Doc4,0.0,0.38,0.0,0.29,0.0,0.0,0.3,0.0,0.0,0.0,1
Doc5,0.23,0.0,0.0,0.0,0.07,0.0,0.51,0.16,0.0,0.0,6
Doc6,0.5,0.0,0.0,0.0,0.0,0.0,0.47,0.0,0.0,0.0,0
Doc7,0.0,0.0,0.18,0.43,0.0,0.0,0.37,0.0,0.0,0.0,3
Doc8,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.0,0.0,0.63,9
Doc9,0.0,0.0,0.0,0.98,0.0,0.0,0.0,0.0,0.0,0.0,3


In [12]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")	
df_topic_distribution.columns = ['Topic Num', 'Num Documents']	
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,17
1,1,11
2,0,9
3,6,9
4,4,8
5,9,8
6,7,7
7,2,6
8,8,3
9,5,2


In [13]:
excel_path_20 = txt_convert_2_excel('.\\20', '.\\DataExcel_20')
# 得到分词好的训练预料
corpus_20 = segment(excel_path_20)
print(corpus_20[-1])
# LDA
cntTf_20 = cntVector.transform(corpus_20)

两岸同胞 血脉相连 血浓于水 一家人 始终 尊重 关爱 造福 台湾同胞 继续 致力于 促进 两岸经济 文化 交流合作 深化 两岸 领域 融合 发展 完善 增进 台湾同胞 福祉 制度 政策 推动 两岸 共同 弘扬 中华文化 促进 两岸同胞 心灵 契合 台湾 中国 台湾 解决 台湾问题 中国 事 中国 人来 决定 坚持 最大 诚意 最大 努力争取 和平统一 前景 决不 承诺 放弃 使用 武力 保留 采取 必要措施 选项 外部 势力 干涉 极少数 台独 分裂 分子 分裂 活动 绝非 广大 台湾同胞 国家统一 民族复兴 历史 车轮 滚滚向前 祖国 完全 统一 一定 实现 一定 能够 实现


In [14]:
# 构建文档-词频矩阵	
lda_output = lda.transform(cntTf_20)
# 列名	
topicnames = ["Topic" + str(i) 	
              for i in range(lda.n_components)]	
# 行索引名	
docnames = ["Doc" + str(i) 	
            for i in range(len(corpus_20))]	
# 转化为pd.DataFrame	
df_document_topic = pd.DataFrame(np.round(lda_output, 2), 	
                                 columns=topicnames,
                                 index=docnames)	
# Get dominant topic for each document	
dominant_topic = np.argmax(df_document_topic.values, axis=1)	
df_document_topic['dominant_topic'] = dominant_topic	
	
def make_bold(val):	
    weight = 700 if val > .5 else 400	
    return 'font-weight: {weight}'.format(weight=weight)	
# Apply Style	
df_document_topics = df_document_topic.style.applymap(make_bold)	
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Doc1,0.0,0.0,0.0,0.46,0.0,0.0,0.0,0.0,0.0,0.53,9
Doc2,0.0,0.0,0.0,0.44,0.26,0.04,0.07,0.0,0.0,0.19,3
Doc3,0.0,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.47,3
Doc4,0.0,0.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,9
Doc5,0.0,0.0,0.07,0.77,0.0,0.15,0.0,0.0,0.0,0.0,3
Doc6,0.0,0.0,0.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc7,0.41,0.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.0,6
Doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.66,0.0,0.0,7
Doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.0,0.0,0.0,6


In [15]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")	
df_topic_distribution.columns = ['Topic Num', 'Num Documents']	
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,3,15
1,1,12
2,4,10
3,0,8
4,6,8
5,7,7
6,9,7
7,8,6
8,2,5
9,5,2


In [16]:
import pyLDAvis
import pyLDAvis.sklearn

In [17]:
pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda,cntTf,cntVector)
pyLDAvis.save_html(pic,'1.html')
pyLDAvis.show(pic)

  and should_run_async(code)
INFO:numexpr.utils:Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


OSError: [Errno 22] Invalid argument: 'https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v1.0.0.css'