In [22]:
# reference: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
import six.moves.cPickle as pickle
from gensim import corpora, models
import gensim
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt


In [18]:
# loading dataset (token type)
with open('paras/tokens.pkl', 'rb') as pf:
	tokens = pickle.load(pf)

# Create Dictionary
id2word = corpora.Dictionary(tokens)

# Term Document Frequency
corpus = [id2word.doc2bow(tok) for tok in tokens]

print(corpus[:1]) #(字,次數) 有次數的字才會顯示出來。
print(id2word[1], id2word[3])
#print(len(id2word)) >> 總共其實有4千多個字

[[(0, 10), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 2), (8, 1), (9, 1), (10, 3), (11, 1), (12, 3), (13, 7), (14, 1), (15, 1), (16, 3), (17, 1), (18, 4), (19, 3), (20, 6), (21, 3), (22, 1), (23, 1), (24, 6), (25, 1), (26, 1), (27, 4), (28, 5), (29, 2), (30, 2), (31, 2), (32, 1), (33, 1), (34, 1), (35, 2), (36, 3), (37, 1), (38, 2), (39, 1), (40, 1), (41, 12), (42, 4), (43, 1), (44, 3), (45, 9), (46, 1), (47, 26), (48, 1), (49, 5), (50, 2), (51, 1), (52, 4), (53, 2), (54, 1), (55, 1), (56, 1), (57, 2), (58, 6), (59, 2), (60, 3), (61, 1), (62, 3), (63, 5), (64, 1), (65, 2), (66, 1), (67, 1), (68, 3), (69, 1), (70, 2), (71, 21), (72, 1), (73, 2), (74, 1), (75, 1), (76, 3), (77, 3), (78, 1), (79, 1), (80, 4), (81, 2), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 7), (88, 5), (89, 4), (90, 1), (91, 5), (92, 3), (93, 1), (94, 1), (95, 3), (96, 1), (97, 1), (98, 2), (99, 2), (100, 1), (101, 3), (102, 5), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110

In [75]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [76]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics(num_words=5))
doc_lda = lda_model[corpus]

[(0,
  '0.022*"data" + 0.018*"" + 0.013*"paragraph" + 0.011*"use" + 0.010*"mine"'),
 (1,
  '0.029*"data" + 0.018*"" + 0.016*"mine" + 0.012*"paragraph" + '
  '0.011*"algorithm"'),
 (2, '0.123*"lt" + 0.123*"gt" + 0.058*"true" + 0.032*"mrow" + 0.027*"mi"'),
 (3,
  '0.022*"diseas" + 0.018*"treatment" + 0.017*"" + 0.015*"data" + '
  '0.010*"recommend"'),
 (4,
  '0.019*"predict" + 0.016*"model" + 0.015*"mine" + 0.015*"featur" + 0.014*""'),
 (5, '0.018*"time" + 0.018*"gt" + 0.017*"lt" + 0.016*"seri" + 0.016*""'),
 (6, '0.026*"data" + 0.021*"tax" + 0.017*"time" + 0.016*"mine" + 0.016*""'),
 (7,
  '0.019*"data" + 0.016*"" + 0.014*"use" + 0.010*"mine" + 0.010*"paragraph"')]


In [77]:
# Compute Perplexity (用來評估模型的好壞，值越低越好)
print('Perplexity: ', lda_model.log_perplexity(corpus))

Perplexity:  -6.825110336955748


In [78]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [79]:
# 找出每份文件的topic分布
for idx,doc in enumerate(corpus):
    print('doc {}: {}'.format(idx ,lda_model.get_document_topics(doc))) #每個文件顯示的topic機率加起來為1

doc 0: [(0, 0.99964994)]
doc 1: [(1, 0.10849218), (4, 0.09038466), (5, 0.6829762), (7, 0.11803151)]
doc 2: [(3, 0.999753)]
doc 3: [(0, 0.99976873)]
doc 4: [(7, 0.99966997)]
doc 5: [(2, 0.3407134), (5, 0.65910226)]
doc 6: [(6, 0.99972945)]
doc 7: [(2, 0.9997787)]
doc 8: [(6, 0.9994591)]
doc 9: [(1, 0.9997317)]
doc 10: [(0, 0.1899217), (5, 0.80992955)]
doc 11: [(2, 0.71097505), (4, 0.12660521), (7, 0.16222346)]
doc 12: [(7, 0.9996475)]
doc 13: [(7, 0.9996786)]
doc 14: [(1, 0.9997603)]
doc 15: [(4, 0.9994857)]
doc 16: [(7, 0.99973017)]
doc 17: [(7, 0.9997099)]
doc 18: [(7, 0.9996754)]
doc 19: [(7, 0.9998123)]
doc 20: [(3, 0.9991551)]
doc 21: [(7, 0.9996043)]
doc 22: [(0, 0.9981585)]
doc 23: [(7, 0.9998086)]
doc 24: [(2, 0.9999344)]
doc 25: [(1, 0.9997354)]
doc 26: [(0, 0.9998055)]
doc 27: [(4, 0.99980617)]
doc 28: [(4, 0.99975497)]
doc 29: [(7, 0.9994398)]
doc 30: [(0, 0.78047514), (3, 0.21908604)]
doc 31: [(7, 0.99969786)]
doc 32: [(7, 0.9997206)]
doc 33: [(0, 0.9996153)]
doc 34: [(0, 0.