In [1]:
%%time
import pandas as pd
import jieba_fast as jieba
import re
from tqdm import tqdm
from gensim import corpora, models
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

from langconv import *

df = pd.read_csv('../../../datasets/simplifyweibo_4_moods.csv')
reviews = df.review.to_list()

stopwords = [line.strip() for line in open('../../../datasets/stopwords_sp.txt',encoding='UTF-8').readlines()]


def data_process(texts, stopwords):
    clean_texts = []
    for line in tqdm(texts):
        # 繁体字处理，不然会有冗余，没有转换之前Dictionary(305734 unique tokens: ['五角星', '件', '伦伦', '余', '各種']...)
        # 处理后的结果 Dictionary(287951 unique tokens: ['五角星', '件', '伦伦', '余', '周小伦']...)
        line = Converter('zh-hans').convert(line)
        line = re.sub(r'[^\u4e00-\u9fa5]+','',str(line)).strip()
        line = jieba.cut(line)
        line = [word for word in line if word not in stopwords and word]
        if line is not None:
            clean_texts.append(line)
    return clean_texts
clean_texts = data_process(reviews[:30000], stopwords)
# clean_texts = data_process(reviews[:30000], stopwords)
id2word = corpora.Dictionary(clean_texts)
corpus = [id2word.doc2bow(line) for line in clean_texts]

  0%|          | 0/30000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/j3/469syj552jx4gt9n73lycfh00000gn/T/jieba.cache
Loading model cost 0.552 seconds.
Prefix dict has been built succesfully.
100%|██████████| 30000/30000 [00:32<00:00, 922.84it/s] 


CPU times: user 35.7 s, sys: 1.04 s, total: 36.8 s
Wall time: 36.1 s


In [2]:
stopwords[:10]

['喜欢', '说', '想', '哈哈哈', '回复', '太', '!', '"', '#', '$']

In [10]:
# simple_lda = models.LdaMulticore(corpus=corpus, 
#                                  id2word=id2word, 
#                                  num_topics=20, 
#                                  eval_every=5, 
#                                  per_word_topics=True, )
import gensim
simple_lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [12]:
import pickle
pickle.dump(simple_lda, open('lad_model_v2.ckpt', 'wb'))
pickle.dump(clean_texts, open('clean_texts_v2.pkl','wb'))
pickle.dump(id2word, open('id2word_v2.pkl','wb'))
pickle.dump(corpus, open('corpus_v2.pkl','wb'))

In [11]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', simple_lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=simple_lda, texts=clean_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -25.40771917309052

Coherence Score:  0.334087759862334


In [5]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=simple_lda, corpus=corpus, texts=clean_texts)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

30000it [04:26, 112.49it/s]


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,5.0,0.944,"中国, 爱, 快乐, 微博, 做, 真的, 开心, 幸福, 围脖, 图片","[啊呀呀, 要死, 穿, 外套, 件, 余, 周小伦, 五角星, 项链, 露, 胸, 衣服,..."
1,1,17.0,0.9524,"男人, 中, 转, 朋友, 关注, 中国, 爱, 围脖, 图片, 里","[大姚, 通知, 收到, 姚, 外星人, 入侵, 地球, 摧毁, 网络, 地球, 完蛋, 恐..."
2,2,14.0,0.6821,"做, 分享, 图片, 中国, 爱, 幸福, 真的, 快乐, 可爱, 真","[风格, 哪张]"
3,3,4.0,0.9659,"请, 中, 吃, 微博, 粉丝, 爱, 里, 支持, 点, 真的","[试试, 去死皮, 面膜, 燕麦片, 加水, 中, 浸泡, 小时, 木瓜, 牛奶, 搅拌, ..."
4,4,10.0,0.9547,"中国, 爱, 做, 不错, 第名, 可爱, 真, 中, 微博, 关注","[张老师, 谢谢, 侬, 信任, 粉丝, 无所谓, 重在, 质地, 近日, 发现, 现象, ..."
5,5,4.0,0.9749,"请, 中, 吃, 微博, 粉丝, 爱, 里, 支持, 点, 真的","[第二条, 吸引力, 美国, 相亲, 节目, 中国, 几大, 美国, 同一个, 单身汉, 单..."
6,6,13.0,0.966,"谢谢, 爱, 做, 里, 微博, 时间, 朋友, 请, 送, 评论","[苹果, 功能强大, 时尚手机, 功能, 多沃, 爱, 平谷, 第二轮, 活动, 时前, 关..."
7,7,5.0,0.9472,"中国, 爱, 快乐, 微博, 做, 真的, 开心, 幸福, 围脖, 图片","[回覆, 幸福, 今晚, 本地人, 老板, 一对, 跑, 一对, 幸福, 幸福, 经营者, ..."
8,8,9.0,0.9735,"中国, 老师, 关注, 粉丝, 中, 支持, 囧, 朋友, 爱情, 第名","[书读, 未必, 好事, 教条主义, 书呆子, 笑, 死, 一对, 新生代, 对联, 愿读,..."
9,9,5.0,0.762,"中国, 爱, 快乐, 微博, 做, 真的, 开心, 幸福, 围脖, 图片","[一对, 孪生, 兄弟]"


In [6]:
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head(20)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.986,"可爱, 关注, 活动, 微博, 爱, 中国, 吃, 新浪, 支持, 加油","[女儿, 回家, 早上, 送, 希望, 心情, 广州, 小孩, 幼儿园, 小学, 家门口, ..."
1,1.0,0.9881,"岁, 做, 日本, 中国, 幸福, 朋友, 找, 请, 吃, 爱","[仿佛, 曾于, 某猫, 日志, 中读, 彼岸花, 语, 白色, 彼岸花, 又称, 曼陀罗,..."
2,2.0,0.9865,"中, 爱, 可爱, 笑, 明天, 做, 谢谢, 吃, 粉丝, 活动","[煲仔饭, 暖, 粒粒, 正, 冬天, 最爱食, 煲仔饭, 万福, 路边, 方位, 加窝, ..."
3,3.0,0.9876,"男人, 里, 女人, 希望, 微博, 中, 走, 开心, 支持, 期待","[谢谢, 开心, 没理, 倒, 说好, 联法, 气坏, 莪, 子, 法地, 编一联, 小丫头..."
4,4.0,0.9875,"请, 中, 吃, 微博, 粉丝, 爱, 里, 支持, 点, 真的","[牛奶, 造假, 学术, 成果, 造假, 科技, 大奖, 造假, 顶峰, 某年, 三鹿, 集..."
5,5.0,0.9881,"中国, 爱, 快乐, 微博, 做, 真的, 开心, 幸福, 围脖, 图片","[婉约, 萧瑟, 秋风落叶, 衰草, 红, 蓼, 散入, 云中, 萍语, 天涯, 君同, 红..."
6,6.0,0.9878,"朋友, 明天, 中, 做, 听, 日本, 吃, 好看, 中国, 走","[喜剧, 官员, 在位, 鼻孔朝天, 牛到, 无法形容, 牛, 帽子, 摘下, 当作, 小偷..."
7,7.0,0.9875,"做, 中, 爱, 加油, 分享, 感谢, 吃, 女人, 希望, 可爱","[爱情, 最深, 绝境, 里, 遇见, 美丽, 惊喜, 这话, 莫名, 血液, 沸腾, 极致..."
8,8.0,0.9891,"吃, 爱, 笑, 做, 死, 世界, 请, 中, 真, 可爱","[谢, 谢, 放松, 心情, 睡觉, 养生, 第一, 要素, 下决心, 消除, 影响, 睡眠..."
9,9.0,0.9868,"中国, 老师, 关注, 粉丝, 中, 支持, 囧, 朋友, 爱情, 第名","[郑, 老师, 真, 童话, 大师, 只能, 童话, 里, 现实, 真, 带来, 危害, 远..."


In [7]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,5.0,"中国, 爱, 快乐, 微博, 做, 真的, 开心, 幸福, 围脖, 图片",1898.0,0.0633
1.0,17.0,"男人, 中, 转, 朋友, 关注, 中国, 爱, 围脖, 图片, 里",1424.0,0.0475
2.0,14.0,"做, 分享, 图片, 中国, 爱, 幸福, 真的, 快乐, 可爱, 真",1585.0,0.0528
3.0,4.0,"请, 中, 吃, 微博, 粉丝, 爱, 里, 支持, 点, 真的",1320.0,0.0440
4.0,10.0,"中国, 爱, 做, 不错, 第名, 可爱, 真, 中, 微博, 关注",1510.0,0.0503
...,...,...,...,...
29995.0,2.0,"中, 爱, 可爱, 笑, 明天, 做, 谢谢, 吃, 粉丝, 活动",,
29996.0,4.0,"请, 中, 吃, 微博, 粉丝, 爱, 里, 支持, 点, 真的",,
29997.0,12.0,"做, 感谢, 支持, 中, 中国, 分享, 北京, 图片, 请, 第名",,
29998.0,1.0,"岁, 做, 日本, 中国, 幸福, 朋友, 找, 请, 吃, 爱",,


In [8]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.display(pyLDAvis.gensim.prepare(simple_lda, corpus, id2word))