* LDA introduction with a simple example

In [2]:
from gensim import corpora, models, similarities
from pprint import pprint
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [4]:
f = open('Data-and-Output/22.LDA_test.txt')
stop_list = set('for a of the and to in'.split())
print("Stop List", stop_list)
# texts = [line.strip().split() for line in f]
# print(texts)

# put words not in the stop list in word list for each line
texts = [[word for word in line.strip().lower().split() if word not in stop_list] for line in f]
print ('Text = ')
pprint(texts)

Stop List {'and', 'to', 'of', 'the', 'for', 'in', 'a'}
Text = 
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


In [7]:
dictionary = corpora.Dictionary(texts)
V = len(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus_tfidf = models.TfidfModel(corpus)[corpus]

print ('TF-IDF:')
for c in corpus_tfidf:
    print (c)


TF-IDF:
[(0, 0.4301019571350565), (1, 0.4301019571350565), (2, 0.4301019571350565), (3, 0.2944198962221451), (4, 0.2944198962221451), (5, 0.2944198962221451), (6, 0.4301019571350565)]
[(3, 0.3726494271826947), (7, 0.5443832091958983), (8, 0.27219160459794917), (9, 0.3726494271826947), (10, 0.3726494271826947), (11, 0.27219160459794917), (12, 0.3726494271826947)]
[(5, 0.438482464916089), (8, 0.32027755044706185), (11, 0.32027755044706185), (13, 0.438482464916089), (14, 0.6405551008941237)]
[(4, 0.3449874408519962), (8, 0.5039733231394895), (13, 0.3449874408519962), (15, 0.5039733231394895), (16, 0.5039733231394895)]
[(9, 0.30055933182961736), (10, 0.30055933182961736), (11, 0.21953536176370683), (17, 0.43907072352741366), (18, 0.43907072352741366), (19, 0.43907072352741366), (20, 0.43907072352741366)]
[(21, 0.48507125007266594), (22, 0.48507125007266594), (23, 0.48507125007266594), (24, 0.48507125007266594), (25, 0.24253562503633297)]
[(25, 0.31622776601683794), (26, 0.31622776601683794

In [8]:
# LSI
print ('\nLSI Model:')
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [a for a in lsi[corpus_tfidf]]
pprint(topic_result)


LSI Model:
[[(0, 0.34057117986841845), (1, -0.20602251622679627)],
 [(0, 0.69330400021715777), (1, 0.0072327583903876308)],
 [(0, 0.59026076703897246), (1, -0.35260469490855778)],
 [(0, 0.5214901821825132), (1, -0.33887976154055321)],
 [(0, 0.39533193176354536), (1, -0.059192853366601052)],
 [(0, 0.036353173528493515), (1, 0.18146550208818935)],
 [(0, 0.1470901232877902), (1, 0.49432948127822413)],
 [(0, 0.21407117317565358), (1, 0.640645666445395)],
 [(0, 0.40066568318170881), (1, 0.64131082990940003)]]


In [9]:

print ('LSI Topics:')
pprint(lsi.print_topics(num_topics=2, num_words=5))

LSI Topics:
[(0,
  '0.400*"system" + 0.318*"survey" + 0.290*"user" + 0.274*"eps" + '
  '0.236*"management"'),
 (1,
  '0.421*"minors" + 0.420*"graph" + 0.293*"survey" + 0.239*"trees" + '
  '0.226*"intersection"')]


In [10]:
similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])   # similarities.Similarity()
print ('Similarity:')
pprint(list(similarity))

Similarity:
[array([ 1.        ,  0.85017949,  0.99998462,  0.99948108,  0.92283762,
       -0.33944285, -0.25207737, -0.21974573,  0.01438824], dtype=float32),
 array([ 0.85017949,  1.        ,  0.85309052,  0.83277911,  0.98737705,
        0.20664607,  0.29518002,  0.32680073,  0.53867108], dtype=float32),
 array([ 0.99998462,  0.85309052,  1.        ,  0.99928677,  0.92496276,
       -0.33421329, -0.24669875, -0.21432398,  0.01994151], dtype=float32),
 array([ 0.99948108,  0.83277911,  0.99928677,  1.        ,  0.90995121,
       -0.3695657 , -0.28311783, -0.25105581, -0.0178274 ], dtype=float32),
 array([ 0.92283762,  0.98737705,  0.92496276,  0.90995121,  1.        ,
        0.04906874,  0.14012395,  0.1729846 ,  0.39842743], dtype=float32),
 array([-0.33944285,  0.20664607, -0.33421329, -0.3695657 ,  0.04906874,
        1.        ,  0.99581701,  0.99222624,  0.93564534], dtype=float32),
 array([-0.25207737,  0.29518002, -0.24669875, -0.28311783,  0.14012395,
        0.99581701,  

In [12]:
print ('\nLDA Model:')
num_topics = 2
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                    alpha='auto', eta='auto', minimum_probability=0.001)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print ('Document-Topic:\n')
pprint(doc_topic)


LDA Model:
Document-Topic:

[[(0, 0.30895863934234108), (1, 0.69104136065765898)],
 [(0, 0.29290268764942679), (1, 0.70709731235057327)],
 [(0, 0.68281219746677557), (1, 0.31718780253322443)],
 [(0, 0.39309811890975499), (1, 0.60690188109024501)],
 [(0, 0.71874716393273841), (1, 0.28125283606726148)],
 [(0, 0.4700878573572746), (1, 0.52991214264272546)],
 [(0, 0.2607668293175166), (1, 0.7392331706824834)],
 [(0, 0.34356255351665133), (1, 0.65643744648334867)],
 [(0, 0.30128658180272488), (1, 0.69871341819727517)]]


In [13]:
for doc_topic in lda.get_document_topics(corpus_tfidf):
    print (doc_topic)
for topic_id in range(num_topics):
    print ('Topic', topic_id)
    # pprint(lda.get_topic_terms(topicid=topic_id))
    pprint(lda.show_topic(topic_id))
similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
print ('Similarity:')
pprint(list(similarity))

[(0, 0.30887851082408768), (1, 0.69112148917591232)]
[(0, 0.29298891316066378), (1, 0.70701108683933622)]
[(0, 0.68283931229294526), (1, 0.31716068770705474)]
[(0, 0.39308287121750096), (1, 0.60691712878249915)]
[(0, 0.71873407905906206), (1, 0.28126592094093789)]
[(0, 0.47140919419008703), (1, 0.52859080580991291)]
[(0, 0.26077543407291986), (1, 0.73922456592708019)]
[(0, 0.34364866576534603), (1, 0.65635133423465397)]
[(0, 0.30128599418534885), (1, 0.69871400581465104)]
Topic 0
[('system', 0.038959625985374939),
 ('management', 0.037822747437386645),
 ('eps', 0.036913584081606714),
 ('user', 0.036870163016899091),
 ('interface', 0.036187892029062009),
 ('minors', 0.034296477939576762),
 ('response', 0.031935855826525486),
 ('graph', 0.031481777666608889),
 ('time', 0.03142137430744333),
 ('measurement', 0.030726341900593736)]
Topic 1
[('survey', 0.043780206189694602),
 ('graph', 0.041371621807866799),
 ('system', 0.03961375301285943),
 ('minors', 0.036632477687497784),
 ('trees', 0.0

In [15]:
hda = models.HdpModel(corpus_tfidf, id2word=dictionary)
topic_result = [a for a in hda[corpus_tfidf]]
print ('\n\nUSE WITH CARE--\nHDA Model:')
pprint(topic_result)
print ('HDA Topics:')
print (hda.print_topics(num_topics=2, num_words=5))



USE WITH CARE--
HDA Model:
[[(0, 0.14124363539490895),
  (1, 0.70279978617999272),
  (2, 0.039419969747645986),
  (3, 0.029721359766011611),
  (4, 0.022184556885793311),
  (5, 0.016642918105538022),
  (6, 0.012320848040085111)],
 [(0, 0.78993202493667791),
  (1, 0.052977525671265929),
  (2, 0.039759636712338293),
  (3, 0.029926095205808211),
  (4, 0.022335156778908911),
  (5, 0.016755956463171469),
  (6, 0.012404504831033751)],
 [(0, 0.084416228707431484),
  (1, 0.73726637508697324),
  (2, 0.045299775868464197),
  (3, 0.03395068625525359),
  (4, 0.025317037604487242),
  (5, 0.018991230257945153),
  (6, 0.014059272799183287),
  (7, 0.010491734391670452)],
 [(0, 0.76443375145749992),
  (1, 0.060021069544757215),
  (2, 0.044377394905282253),
  (3, 0.033455097028795525),
  (4, 0.024972136911451569),
  (5, 0.018731286850493412),
  (6, 0.013866863813666443),
  (7, 0.010348149146191095)],
 [(0, 0.78605560407410668),
  (1, 0.056260380266803148),
  (2, 0.040165549659906541),
  (3, 0.030054919

In [16]:
import numpy as np
from gensim import corpora, models, similarities
from pprint import pprint
import time
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def load_stopword():
    f_stop = open('Data-and-Output/22.stopword.txt')
    sw = [line.strip() for line in f_stop]
    f_stop.close()
    return sw

In [18]:
print ('初始化停止词列表 --')
t_start = time.time()
stop_words = load_stopword()

print ('开始读入语料数据 -- ')
f = open('Data-and-Output/22.news.dat')    #22.LDA_test.txt
texts = [[word for word in line.strip().lower().split() if word not in stop_words] for line in f]
# texts = [line.strip().split() for line in f]
print ('读入语料数据完成，用时%.3f秒' % (time.time() - t_start))
f.close()
M = len(texts)
print ('文本数目：%d个' % M)
# pprint(texts)


print ('正在建立词典 --')
dictionary = corpora.Dictionary(texts)
V = len(dictionary)
print ('正在计算文本向量 --')
corpus = [dictionary.doc2bow(text) for text in texts]
print ('正在计算文档TF-IDF --')
t_start = time.time()
corpus_tfidf = models.TfidfModel(corpus)[corpus]
print ('建立文档TF-IDF完成，用时%.3f秒' % (time.time() - t_start))
print ('LDA模型拟合推断 --')
num_topics = 30
t_start = time.time()
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                        alpha=0.01, eta=0.01, minimum_probability=0.001,
                        update_every = 1, chunksize = 100, passes = 1)
print ('LDA模型完成，训练时间为\t%.3f秒' % (time.time() - t_start))

初始化停止词列表 --


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 0: invalid start byte

In [None]:
# # 所有文档的主题
# doc_topic = [a for a in lda[corpus_tfidf]]
# print 'Document-Topic:\n'
# pprint(doc_topic)

# 随机打印某10个文档的主题
 num_show_topic = 10  # 每个文档显示前几个主题
print ('10个文档的主题分布：')
doc_topics = lda.get_document_topics(corpus_tfidf)  # 所有文档的主题分布
idx = np.arange(M)
np.random.shuffle(idx)
idx = idx[:10]
for i in idx:
    topic = np.array(doc_topics[i])
    topic_distribute = np.array(topic[:, 1])
    # print topic_distribute
    topic_idx = topic_distribute.argsort()[:-num_show_topic-1:-1]
    print ('第%d个文档的前%d个主题：' % (i, num_show_topic)), topic_idx
    print topic_distribute[topic_idx]
num_show_term = 7   # 每个主题显示几个词
print ('每个主题的词分布：')
for topic_id in range(num_topics):
    print ('主题#%d：\t' % topic_id)
    term_distribute_all = lda.get_topic_terms(topicid=topic_id)
    term_distribute = term_distribute_all[:num_show_term]
    term_distribute = np.array(term_distribute)
    term_id = term_distribute[:, 0].astype(np.int)
    print ('词：\t',)
    for t in term_id:
        print (dictionary.id2token[t])
    print
    # print '\n概率：\t', term_distribute[:, 1]


In [44]:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import lda
import lda.datasets 
from pprint import pprint



In [45]:


if __name__ == "__main__":
    # document-term matrix
    X = lda.datasets.load_reuters()
    print("type(X): {}".format(type(X)))
    print("shape: {}\n".format(X.shape))
    print(X[:10, :10])

    # the vocab
    vocab = lda.datasets.load_reuters_vocab()
    print("type(vocab): {}".format(type(vocab)))
    print("len(vocab): {}\n".format(len(vocab)))
    print(vocab[:10])

    # titles for each story
    titles = lda.datasets.load_reuters_titles()
    print("type(titles): {}".format(type(titles)))
    print("len(titles): {}\n".format(len(titles)))
    pprint(titles[:10])

    print ('LDA start ----')
    topic_num = 20
    model = lda.LDA(n_topics=topic_num, n_iter=500, random_state=1)
    model.fit(X)

    # topic-word
    topic_word = model.topic_word_
    print("type(topic_word): {}".format(type(topic_word)))
    print("shape: {}".format(topic_word.shape))
    print(vocab[:5])
    print(topic_word[:, :5])

    # Print Topic distribution
    n = 7
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n + 1):-1]
        print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

    # Document - topic
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    for i in range(10):
        topic_most_pr = doc_topic[i].argmax()
        print(u"文档: {} 主题: {} value: {}".format(i, topic_most_pr, doc_topic[i][topic_most_pr]))

    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Topic - word
    plt.figure(figsize=(8, 9))
    # f, ax = plt.subplots(5, 1, sharex=True)
    for i, k in enumerate([0, 5, 9, 14, 19]):
        ax = plt.subplot(5, 1, i+1)
        ax.plot(topic_word[k, :], 'r-')
        ax.set_xlim(-50, 4350)   # [0,4258]
        ax.set_ylim(0, 0.08)
        ax.set_ylabel(u"概率")
        ax.set_title(u"主题 {}".format(k))
    plt.xlabel(u"词", fontsize=14)
    plt.tight_layout()
    plt.suptitle(u'主题的词分布', fontsize=18)
    plt.subplots_adjust(top=0.9)
    plt.show()

    # Document - Topic
    plt.figure(figsize=(8, 9))
    # f, ax= plt.subplots(5, 1, figsize=(8, 6), sharex=True)
    for i, k in enumerate([1, 3, 4, 8, 9]):
        ax = plt.subplot(5, 1, i+1)
        ax.stem(doc_topic[k, :], linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, topic_num+1)
        ax.set_ylim(0, 1)
        ax.set_ylabel(u"概率")
        ax.set_title(u"文档 {}".format(k))
    plt.xlabel(u"主题", fontsize=14)
    plt.suptitle(u'文档的主题分布', fontsize=18)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()


AttributeError: module 'lda' has no attribute 'datasets'