# 数据预处理

In [1]:
import logging
logging.basicConfig(#filename='mylog.log', 
                    level=logging.INFO, 
                    format='%(asctime)s %(name)s.%(funcName)s +%(lineno)s: %(levelname)-8s [%(process)d] %(message)s'
                   )

In [2]:
%%time
import pandas as pd
import jieba_fast as jieba
import re
from tqdm import tqdm
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

from langconv import *

df = pd.read_csv('../../../datasets/simplifyweibo_4_moods.csv')
reviews = df.review.to_list()

stopwords = [line.strip() for line in open('../../../datasets/stopwords.txt',encoding='UTF-8').readlines()]


def data_process(texts, stopwords):
    clean_texts = []
    for line in tqdm(texts):
        # 繁体字处理，不然会有冗余，没有转换之前Dictionary(305734 unique tokens: ['五角星', '件', '伦伦', '余', '各種']...)
        # 处理后的结果 Dictionary(287951 unique tokens: ['五角星', '件', '伦伦', '余', '周小伦']...)
        line = Converter('zh-hans').convert(line)
        line = re.sub(r'[^\u4e00-\u9fa5]+','',str(line)).strip()
        line = jieba.cut(line)
        line = [word for word in line if word not in stopwords and word]
        if line is not None:
            clean_texts.append(line)
    return clean_texts
clean_texts = data_process(reviews[:30000], stopwords)

id2word = corpora.Dictionary(clean_texts)
corpus = [id2word.doc2bow(line) for line in clean_texts]

  0%|          | 0/30000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
2020-10-16 12:59:34,013 jieba_fast.initialize +117: DEBUG    [4348] Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/j3/469syj552jx4gt9n73lycfh00000gn/T/jieba.cache
2020-10-16 12:59:34,730 jieba_fast.initialize +151: DEBUG    [4348] Dumping model to file cache /var/folders/j3/469syj552jx4gt9n73lycfh00000gn/T/jieba.cache
Loading model cost 0.766 seconds.
2020-10-16 12:59:34,791 jieba_fast.initialize +169: DEBUG    [4348] Loading model cost 0.766 seconds.
Prefix dict has been built succesfully.
2020-10-16 12:59:34,793 jieba_fast.initialize +170: DEBUG    [4348] Prefix dict has been built succesfully.
 41%|████      | 12359/30000 [00:14<00:20, 844.23it/s]


KeyboardInterrupt: 

In [3]:
import pickle
pickle.dump(clean_texts, open('clean_texts.pkl','wb'))
pickle.dump(id2word, open('id2word.pkl','wb'))
pickle.dump(corpus, open('corpus.pkl','wb'))

NameError: name 'clean_texts' is not defined

In [4]:
clean_texts = pickle.load(open('clean_texts.pkl','rb'))
id2word = pickle.load(open('id2word.pkl','rb'))
corpus = pickle.load(open('corpus.pkl','rb'))

In [None]:
coherence_values = []
model_list = []

for num_topics in tqdm(range(20, 50, 3)):
    model = models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=clean_texts, dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())


In [None]:
limit=50; start=20; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [5]:
%%time
# for 10 already konwn topic
lda_model = models.LdaModel(corpus, id2word=id2word, num_topics=30, alpha='auto', eval_every=5, per_word_topics=)

SyntaxError: invalid syntax (<unknown>, line 2)

In [6]:
simple_lda = models.LdaMulticore(corpus=corpus, 
                                 id2word=id2word, 
                                 num_topics=20, 
                                 eval_every=5, 
                                 per_word_topics=True, )
#                                  minimum_phi_value=0.001, 
#                                  minimum_probability=0.0001)

2020-10-16 13:01:33,057 gensim.models.ldamodel.init_dir_prior +557: INFO     [4348] using symmetric alpha at 0.05
2020-10-16 13:01:33,058 gensim.models.ldamodel.init_dir_prior +557: INFO     [4348] using symmetric eta at 0.05
2020-10-16 13:01:33,097 gensim.models.ldamodel.__init__ +481: INFO     [4348] using serial LDA version on this node
2020-10-16 13:01:33,620 gensim.models.ldamulticore.update +243: INFO     [4348] running online LDA training, 20 topics, 1 passes over the supplied corpus of 361744 documents, updating every 22000 documents, evaluating every ~110000 documents, iterating 50x with a convergence threshold of 0.001000
2020-10-16 13:01:33,624 gensim.models.ldamulticore.update +279: INFO     [4348] training LDA model using 11 processes
2020-10-16 13:01:33,730 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/361744, outstanding queue size 1
2020-10-16 13:01:33,990 gensim.models.ldamulticore.update +297: INF

2020-10-16 13:01:40,436 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #9 (0.050): 0.007*"说" + 0.006*"喜欢" + 0.004*"中" + 0.004*"老师" + 0.003*"想" + 0.003*"第名" + 0.003*"笑" + 0.003*"太" + 0.003*"爱" + 0.003*"支持"
2020-10-16 13:01:40,689 gensim.models.ldamodel.do_mstep +1049: INFO     [4348] topic diff=19.059536, rho=1.000000
2020-10-16 13:01:40,706 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #37 = documents up to #76000/361744, outstanding queue size 27
2020-10-16 13:01:43,153 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #38 = documents up to #78000/361744, outstanding queue size 18
2020-10-16 13:01:43,157 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #39 = documents up to #80000/361744, outstanding queue size 19
2020-10-16 13:01:43,161 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #40 = documents

2020-10-16 13:01:51,670 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #66 = documents up to #134000/361744, outstanding queue size 26
2020-10-16 13:01:52,007 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #67 = documents up to #136000/361744, outstanding queue size 26
2020-10-16 13:01:52,226 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #68 = documents up to #138000/361744, outstanding queue size 26
2020-10-16 13:01:53,094 gensim.models.ldamodel.blend +230: INFO     [4348] merging changes from 22000 documents into a model of 361744 documents
2020-10-16 13:01:53,601 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #9 (0.050): 0.008*"说" + 0.006*"喜欢" + 0.005*"中" + 0.004*"老师" + 0.004*"想" + 0.003*"爱" + 0.003*"围脖" + 0.003*"天蝎座" + 0.003*"支持" + 0.003*"水瓶座"
2020-10-16 13:01:53,607 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #4 (

2020-10-16 13:02:05,863 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #88 = documents up to #178000/361744, outstanding queue size 8
2020-10-16 13:02:05,867 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #89 = documents up to #180000/361744, outstanding queue size 9
2020-10-16 13:02:05,871 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #90 = documents up to #182000/361744, outstanding queue size 10
2020-10-16 13:02:05,878 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #91 = documents up to #184000/361744, outstanding queue size 11
2020-10-16 13:02:05,881 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #92 = documents up to #186000/361744, outstanding queue size 12
2020-10-16 13:02:05,885 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatche

2020-10-16 13:02:13,671 gensim.models.ldamodel.blend +230: INFO     [4348] merging changes from 22000 documents into a model of 361744 documents
2020-10-16 13:02:14,086 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #0 (0.050): 0.009*"说" + 0.009*"可爱" + 0.005*"做" + 0.005*"回复" + 0.005*"中国" + 0.005*"太" + 0.004*"想" + 0.004*"中" + 0.003*"美国" + 0.003*"真的"
2020-10-16 13:02:14,090 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #18 (0.050): 0.009*"中国" + 0.008*"说" + 0.005*"加油" + 0.004*"回复" + 0.003*"喜欢" + 0.003*"中" + 0.003*"笑" + 0.003*"太" + 0.003*"粉丝" + 0.002*"谢谢"
2020-10-16 13:02:14,095 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #11 (0.050): 0.009*"说" + 0.007*"粉丝" + 0.006*"请" + 0.005*"吃" + 0.004*"回复" + 0.004*"做" + 0.004*"中" + 0.004*"小时" + 0.004*"朋友" + 0.003*"快乐"
2020-10-16 13:02:14,099 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #19 (0.050): 0.011*"幸福" + 0.008*"说" + 0.006*"做" + 0.006*"吃" + 0.005*"希望" + 0.005*"想" + 0.004*"一

2020-10-16 13:02:23,120 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #149 = documents up to #300000/361744, outstanding queue size 22
2020-10-16 13:02:23,125 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #150 = documents up to #302000/361744, outstanding queue size 23
2020-10-16 13:02:23,129 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #151 = documents up to #304000/361744, outstanding queue size 24
2020-10-16 13:02:23,133 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #152 = documents up to #306000/361744, outstanding queue size 25
2020-10-16 13:02:23,159 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #153 = documents up to #308000/361744, outstanding queue size 26
2020-10-16 13:02:23,691 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, di

2020-10-16 13:02:32,801 gensim.models.ldamulticore.update +297: INFO     [4348] PROGRESS: pass 0, dispatched chunk #180 = documents up to #361744/361744, outstanding queue size 26
2020-10-16 13:02:33,340 gensim.models.ldamodel.blend +230: INFO     [4348] merging changes from 22000 documents into a model of 361744 documents
2020-10-16 13:02:33,734 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #17 (0.050): 0.007*"回复" + 0.006*"说" + 0.005*"朋友" + 0.004*"做" + 0.004*"吃" + 0.003*"想" + 0.003*"喜欢" + 0.003*"太" + 0.003*"老师" + 0.003*"谢谢"
2020-10-16 13:02:33,738 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #15 (0.050): 0.012*"真的" + 0.009*"说" + 0.005*"回复" + 0.005*"太" + 0.004*"想" + 0.004*"做" + 0.004*"找" + 0.003*"微博" + 0.003*"中国" + 0.003*"中"
2020-10-16 13:02:33,742 gensim.models.ldamodel.show_topics +1171: INFO     [4348] topic #11 (0.050): 0.009*"说" + 0.006*"请" + 0.006*"粉丝" + 0.006*"吃" + 0.006*"小时" + 0.004*"做" + 0.004*"中" + 0.004*"回复" + 0.004*"朋友" + 0.003*"感动"
2020

In [7]:
pickle.dump(simple_lda, open('lad_model.ckpt', 'wb'))

In [4]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', simple_lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=simple_lda, texts=clean_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.65376748751011

Coherence Score:  0.2244054289533366


In [None]:
# 主题的词频分布
import numpy as np
import json

np.sum(lda_model.get_topics()[0])

In [None]:
# 文章的主题分布
lda_model.get_document_topics()

In [18]:
line= '金牛座人和巨蟹座人一样，都属于思想悲观的类型。看问题总喜欢抓着不好的一面，然后扩大解读其中的含义。如此一来，便导致了金牛座人常常被负面情绪支配着，随时随地都在抱怨着、各种唠叨、各种牢骚。自以为发泄坏情绪可以减轻压力，但他们从没想过，自己的压力倒是减轻了，却给别人徒增了不少烦恼！'
line = Converter('zh-hans').convert(line)
line = re.sub(r'[^\u4e00-\u9fa5]+','',str(line)).strip()
line = jieba.cut(line)
line = [word for word in line if word not in stopwords if word]

In [None]:
line

In [None]:
[{'name': i} for i in line]

In [None]:
{ 'source': '金牛座', 'target': '巨蟹座', 'value': 5 },

In [31]:
topic_dis

[(0, 0.2612767),
 (1, 0.11689473),
 (2, 0.0015701243),
 (3, 0.0015701243),
 (4, 0.0015701243),
 (5, 0.0015701243),
 (6, 0.0015701243),
 (7, 0.0015701243),
 (8, 0.0015701243),
 (9, 0.0015701243),
 (10, 0.0015701243),
 (11, 0.31491882),
 (12, 0.0015701243),
 (13, 0.0015701243),
 (14, 0.0015701243),
 (15, 0.0015701243),
 (16, 0.2817878),
 (17, 0.0015701243),
 (18, 0.0015701243),
 (19, 0.0015701243)]

In [30]:
bows = id2word.doc2bow(line)
topic_dis = simple_lda.get_document_topics(bows,minimum_probability=0, minimum_phi_value=0)

In [33]:
[{'source': '原文', 'target': str(i[0]), 'value': i[1]} for i in topic_dis]

[{'source': '原文', 'target': '0', 'value': 0.2612767},
 {'source': '原文', 'target': '1', 'value': 0.11689473},
 {'source': '原文', 'target': '2', 'value': 0.0015701243},
 {'source': '原文', 'target': '3', 'value': 0.0015701243},
 {'source': '原文', 'target': '4', 'value': 0.0015701243},
 {'source': '原文', 'target': '5', 'value': 0.0015701243},
 {'source': '原文', 'target': '6', 'value': 0.0015701243},
 {'source': '原文', 'target': '7', 'value': 0.0015701243},
 {'source': '原文', 'target': '8', 'value': 0.0015701243},
 {'source': '原文', 'target': '9', 'value': 0.0015701243},
 {'source': '原文', 'target': '10', 'value': 0.0015701243},
 {'source': '原文', 'target': '11', 'value': 0.31491882},
 {'source': '原文', 'target': '12', 'value': 0.0015701243},
 {'source': '原文', 'target': '13', 'value': 0.0015701243},
 {'source': '原文', 'target': '14', 'value': 0.0015701243},
 {'source': '原文', 'target': '15', 'value': 0.0015701243},
 {'source': '原文', 'target': '16', 'value': 0.2817878},
 {'source': '原文', 'target': '17', 

In [6]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=simple_lda, corpus=corpus, texts=clean_texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

30000it [04:24, 113.24it/s]


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,14.0,0.9471,"说, 想, 喜欢, 做, 可爱, 爱, 女, 微博, 男, 吃","[啊呀呀, 要死, 穿, 外套, 件, 余, 周小伦, 喜欢, 五角星, 项链, 露, 胸,..."
1,1,12.0,0.9547,"爱, 哈哈哈, 啊啊啊, 说, 吃, 中, 幸福, 想, 回复, 喜欢","[大姚, 通知, 收到, 姚, 外星人, 入侵, 地球, 摧毁, 网络, 地球, 完蛋, 恐..."
2,2,7.0,0.8096,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于","[风格, 喜欢, 喜欢, 哪张]"
3,3,7.0,0.966,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于","[试试, 去死皮, 面膜, 燕麦片, 加水, 中, 浸泡, 小时, 木瓜, 牛奶, 搅拌, ..."
4,4,17.0,0.9547,"说, 想, 太, 回复, 做, 微博, 转, 快乐, 爱, 照片","[张老师, 谢谢, 侬, 信任, 粉丝, 无所谓, 重在, 质地, 近日, 发现, 现象, ..."
5,5,7.0,0.975,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于","[第二条, 吸引力, 美国, 相亲, 节目, 中国, 几大, 美国, 同一个, 单身汉, 单..."
6,6,12.0,0.7847,"爱, 哈哈哈, 啊啊啊, 说, 吃, 中, 幸福, 想, 回复, 喜欢","[喜欢, 苹果, 功能强大, 时尚手机, 功能, 多沃, 爱, 平谷, 第二轮, 活动, 时..."
7,7,16.0,0.9499,"说, 回复, 粉丝, 中, 做, 喜欢, 可爱, 真的, 想, 中国","[回覆, 幸福, 今晚, 本地人, 说, 老板, 一对, 跑, 一对, 幸福, 幸福, 经营..."
8,8,14.0,0.9735,"说, 想, 喜欢, 做, 可爱, 爱, 女, 微博, 男, 吃","[书读, 未必, 好事, 教条主义, 书呆子, 笑, 死, 一对, 新生代, 对联, 愿读,..."
9,9,8.0,0.7618,"想, 回复, 中国, 喜欢, 说, 朋友, 女人, 中, 笑, 世界","[一对, 孪生, 兄弟]"


In [7]:
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.9862,"第名, 说, 哈哈哈, 吃, 回复, 真的, 太, 想, 买, 爱","[内容, 值钱, 代表, 杂志, 值钱, 编辑, 作用, 越来越, 新浪, 微博, 编辑, ..."
1,1.0,0.9889,"说, 做, 回复, 男人, 分享, 关注, 加油, 想, 太, 中","[任老, 缺好博, 回复, 资格, 联合早报, 网讯, 香港, 明报, 报道, 微软, 创办..."
2,2.0,0.9876,"爱, 回复, 说, 中, 吃, 太, 女人, 做, 快乐, 中国","[封面, 气场, 范儿, 十足, 海报, 气质, 优雅, 女人味, 气质, 漂亮, 女人, ..."
3,3.0,0.986,"说, 分享, 图片, 买, 喜欢, 回复, 笑, 快乐, 活动, 孩子","[笨, 穿, 马甲, 胳膊, 冻掉, 小乐, 想个, 微薄, 名, 一会, 偷偷, 告诉, ..."
4,4.0,0.9856,"说, 哈哈哈, 做, 可爱, 回复, 喜欢, 吃, 想, 太, 微博","[请, 脖友, 提意见, 杭州, 历史, 建筑, 保护, 情况, 很糟, 深层次, 远未, ..."


In [8]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,14.0,"说, 想, 喜欢, 做, 可爱, 爱, 女, 微博, 男, 吃",1950.0,0.0650
1.0,12.0,"爱, 哈哈哈, 啊啊啊, 说, 吃, 中, 幸福, 想, 回复, 喜欢",1433.0,0.0478
2.0,7.0,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于",1652.0,0.0551
3.0,7.0,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于",1499.0,0.0500
4.0,17.0,"说, 想, 太, 回复, 做, 微博, 转, 快乐, 爱, 照片",1605.0,0.0535
...,...,...,...,...
29995.0,9.0,"说, 回复, 太, 微博, 想, 粉丝, 朋友, 真的, 请, 中国",,
29996.0,6.0,"回复, 笑, 爱, 喜欢, 岁, 中国, 真, 谢谢, 里, 死",,
29997.0,10.0,"想, 回复, 中国, 请, 说, 微博, 爱, 哈哈哈, 关注, 转",,
29998.0,4.0,"说, 哈哈哈, 做, 可爱, 回复, 喜欢, 吃, 想, 太, 微博",,


In [9]:
df_dominant_topics[:20]

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,14.0,"说, 想, 喜欢, 做, 可爱, 爱, 女, 微博, 男, 吃",1950.0,0.065
1.0,12.0,"爱, 哈哈哈, 啊啊啊, 说, 吃, 中, 幸福, 想, 回复, 喜欢",1433.0,0.0478
2.0,7.0,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于",1652.0,0.0551
3.0,7.0,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于",1499.0,0.05
4.0,17.0,"说, 想, 太, 回复, 做, 微博, 转, 快乐, 爱, 照片",1605.0,0.0535
5.0,7.0,"喜欢, 回复, 可爱, 爱, 吃, 做, 里, 哈哈哈, 说, 终于",1370.0,0.0457
6.0,12.0,"爱, 哈哈哈, 啊啊啊, 说, 吃, 中, 幸福, 想, 回复, 喜欢",1481.0,0.0494
7.0,16.0,"说, 回复, 粉丝, 中, 做, 喜欢, 可爱, 真的, 想, 中国",1418.0,0.0473
8.0,14.0,"说, 想, 喜欢, 做, 可爱, 爱, 女, 微博, 男, 吃",1588.0,0.0529
9.0,8.0,"想, 回复, 中国, 喜欢, 说, 朋友, 女人, 中, 笑, 世界",1584.0,0.0528


In [None]:
{i[0]: i[1] for i in lda_model.show_topic(0,20)}

In [None]:
%%time
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10)

In [None]:
lda.show_topics()

In [None]:
%%time
lda_m = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20)

In [24]:
import pyLDAvis.gensim

In [25]:
pyLDAvis.enable_notebook()

In [26]:
pyLDAvis.display(pyLDAvis.gensim.prepare(simple_lda, corpus, id2word))
#书籍、平板、手机、水果、洗发水、热水器、蒙牛、衣服、计算机、酒店

In [None]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
coherence_values = []
model_list = []
for num_topics in range(2, 40, 3):
    model = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=clean_texts, dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

In [None]:
len(coherence_values)

In [None]:
mallet_path = '/Users/vito/Desktop/satk/datasets/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word)

In [None]:
'/var/folders/j3/469syj552jx4gt9n73lycfh00000gn/T/'

In [None]:
import matplotlib.pyplot as plt
limit=40; start=2; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [8]:
import json

In [9]:
json.dumps([{'a':1}, {'b':2}])

'[{"a": 1}, {"b": 2}]'