# A simple demonstration of how to use the LDA code

In [20]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML

from operator import itemgetter
import itertools
import sys
sys.path.append('/Users/joewandy/git/lda/code/')

from lda import VariationalLDA

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load a test corpus

Below we load a small text corpus for demonstration purposes.

In [4]:
def parse_bag_of_words_file(docword, vocab):
    """Read a bag-of-words corpus from a vocabulary file and a count file.

    Parameters
    ----------
    docword : str
        Path to the count file. Its first three lines are skipped as a
        header; every following non-blank line must hold three
        whitespace-separated integers: document id, 1-based word id, count.
        Lines belonging to the same document must be contiguous, because
        documents are formed by grouping consecutive equal document ids.
    vocab : str
        Path to the vocabulary file, one word per line. The word on the
        first line has word id 1 in ``docword``.

    Returns
    -------
    docs : list of list of str
        One list per document, in file order, with every word repeated
        ``count`` times.
    id_to_word : dict
        Maps the 0-based vocabulary line index to the word on that line.
    """
    with open(vocab, "r") as f:
        id_to_word = {i: line.strip() for i, line in enumerate(f)}

    with open(docword, "r") as f:
        # Drop the three header lines before parsing so they never have to
        # be valid integer rows.
        body = f.readlines()[3:]

    # Build a real list per row (not a lazy ``map`` object) so rows can be
    # indexed on Python 3 as well as Python 2; blank lines are ignored.
    rows = [[int(tok) for tok in line.split()] for line in body if line.strip()]

    docs = []
    for _, doc_rows in itertools.groupby(rows, key=lambda row: row[0]):
        doc = []
        for _, word_id, word_cnt in doc_rows:
            # Word ids in the file are 1-based; the vocabulary is 0-based.
            doc.extend([id_to_word[word_id - 1]] * word_cnt)
        docs.append(doc)
    return docs, id_to_word

# Parse the demo corpus: one word list per document, plus the id -> word map.
all_docs, id_to_word = parse_bag_of_words_file(
    "hdp/docword.kos.txt",
    "hdp/vocab.kos.txt",
)
# Number of distinct words that occur in at least one document.
vocab_size = len({word for doc in all_docs for word in doc})

In [5]:
# Quick sanity check of the parsed corpus. The single-argument print(...)
# form produces the same output on Python 2 and Python 3.
print('total docs = %d' % len(all_docs))

first_doc = all_docs[0]
print('first doc = %s' % first_doc)

total docs = 3430
first doc = ['action', 'action', 'added', 'administration', 'alliances', 'antiwar', 'approve', 'assault', 'attack', 'attitude', 'attitude', 'aug', 'battle', 'believes', 'broad', 'bush', 'bush', 'career', 'chairman', 'click', 'coalition', 'committee', 'committee', 'compared', 'compared', 'conclude', 'conclusion', 'conclusions', 'congressional', 'congressional', 'congressman', 'congressman', 'considered', 'constituents', 'constituents', 'countrys', 'damaging', 'damn', 'departure', 'destruction', 'district', 'district', 'doug', 'doug', 'dramatic', 'electorate', 'engaged', 'enlarge', 'fact', 'final', 'general', 'general', 'gooper', 'greater', 'hold', 'house', 'house', 'households', 'households', 'inadequate', 'incidentally', 'independents', 'intelligence', 'intelligence', 'intelligence', 'international', 'international', 'iraq', 'iraq', 'ive', 'john', 'justified', 'kerry', 'kerry', 'kerry', 'knowing', 'launch', 'lead', 'letter', 'lower', 'maintained', 'mass', 'member', 'm

We need to convert the set of documents (a corpus) into a format suitable for LDA.

A corpus is a dictionary. The keys are objects (I normally use strings) that identify the documents. The values are dictionaries again, where each key is a feature ID (these need to be consistent across documents — i.e. the same feature needs to have the same key in different documents) and the values are the counts (integers).

In [7]:
# Build the LDA input: {document id: {word: number of occurrences}}.
corpus = {}
for n, doc in enumerate(all_docs):

    # Document keys are 'doc_0', 'doc_1', ... in corpus order.
    doc_id = 'doc_%d' % n

    # Tally how many times each word occurs in this document.
    counts = {}
    for word in doc:
        counts[word] = counts.get(word, 0) + 1

    corpus[doc_id] = counts

In [10]:
# Show the word counts of one document; print(...) with a single argument
# behaves the same on Python 2 and Python 3.
print(corpus['doc_1'])

{'operations': 1, 'campaign': 1, 'wasnt': 1, 'request': 1, 'mission': 2, 'general': 1, 'unfit': 1, 'times': 1, 'border': 2, 'debate': 1, 'boat': 1, 'group': 2, 'aug': 1, 'truth': 1, 'enlarge': 1, 'responds': 1, 'click': 1, 'spokesman': 1, 'bush': 1, 'book': 1, 'damn': 1, 'special': 2, 'ambitious': 2, 'online': 1, 'crossed': 1, 'cspan': 1, 'john': 2, 'veterans': 2, 'christmas': 1, 'secretaries': 2, 'play': 1, 'war': 2, 'kerry': 6, 'forces': 1, 'watching': 2, 'moment': 1, 'operating': 1, 'vietnam': 2, 'members': 1, 'swift': 1, 'pool': 3, 'service': 1, 'calls': 1, 'occasions': 1, 'inside': 2, 'drop': 1, 'michael': 1, 'wrong': 1, 'volleyball': 2, 'playing': 1, 'statement': 2, 'command': 1, 'place': 1, 'kerrys': 1, 'occasion': 1, 'ill': 1}


## Run LDA

Now we can run LDA on the corpus. Here we set the number of topics to 10, alpha (the hyperparameter for the document-topic distribution) to 1.0, and eta (the hyperparameter for topics, i.e. pseudo word counts) to 0.1. If update_alpha is set to True, we update alpha from the data during inference.

In [13]:
# Per the notebook text: K is the number of topics, alpha the hyperparameter
# of the document-topic distribution, eta the hyperparameter for topics
# (pseudo word counts), and update_alpha=True updates alpha from the data
# during inference.
lda = VariationalLDA(
    corpus=corpus,
    K=10,
    alpha=1,
    eta=0.1,
    update_alpha=True,
)

Found 6906 unique words
Object created with 3430 documents


In [14]:
# Fit the model. The captured output shows an "Initialising" step followed by
# numbered iterations, each reporting a "change" value.
# NOTE(review): n_its is presumably the maximum number of iterations and
# initialise=True presumably (re)initialises the model first; confirm in lda.py.
lda.run_vb(n_its=100, initialise=True)

Initialising
Starting iterations
Iteration 0 (change = 10.0443510667)
Iteration 1 (change = 0.127660713973)
Iteration 2 (change = 0.144967090951)
Iteration 3 (change = 0.136674043735)
Iteration 4 (change = 0.124899876201)
Iteration 5 (change = 0.115018121512)
Iteration 6 (change = 0.107417402584)
Iteration 7 (change = 0.101556569565)
Iteration 8 (change = 0.09692231146)
Iteration 9 (change = 0.0931667601487)
Iteration 10 (change = 0.0900839439106)
Iteration 11 (change = 0.0875110977076)
Iteration 12 (change = 0.0853430011253)
Iteration 13 (change = 0.0834992145047)
Iteration 14 (change = 0.0819648160852)
Iteration 15 (change = 0.0806802551121)
Iteration 16 (change = 0.0796132715837)
Iteration 17 (change = 0.0787581699883)
Iteration 18 (change = 0.0781067877871)
Iteration 19 (change = 0.0776588978571)
Iteration 20 (change = 0.0774096824883)
Iteration 21 (change = 0.077375625563)
Iteration 22 (change = 0.0775859064118)
Iteration 23 (change = 0.0780940588162)
Iteration 24 (change = 0.0788

Look at the resulting topics

In [26]:
# Number of highest-probability words to list for every topic.
top_n = 10

for k in range(lda.K):

    # td is a dictionary, the key is a word, the value is the probability
    td = lda.get_topic_as_dict(k)

    # convert td into a collection of (word, prob) pairs
    items = td.items()

    # sort items by the probabilities, most probable first
    # itemgetter(1) refers to the 2nd field of each element, used for sorting
    sorted_items = sorted(items, key=itemgetter(1), reverse=True)

    # print the top words
    # alternatively we can also set a threshold t and only print words at probabilities > t
    # Each print receives one pre-formatted string so the output is the same
    # whether print is the Python 2 statement or the Python 3 function.
    print('Topic %d' % k)
    for word, prob in sorted_items[:top_n]:
        print(' - %s %s' % (word, prob))
    print('')

Topic 0
 - campaign 0.0141899924775
 - party 0.00993347951454
 - people 0.0081665226726
 - democratic 0.00757208231043
 - money 0.00728027009252
 - media 0.0070606786256
 - million 0.00702070992699
 - time 0.00689339214193
 - political 0.00565475604784
 - candidates 0.00483042530822

Topic 1
 - media 0.00921477406766
 - republican 0.00908980253863
 - campaign 0.00871748708523
 - nader 0.00817413774442
 - party 0.00796156750609
 - general 0.00786535687276
 - news 0.00784097335873
 - republicans 0.00751286743511
 - ballot 0.00711968189158
 - election 0.00622302489692

Topic 2
 - bush 0.0603931377622
 - kerry 0.0491488824776
 - poll 0.016196439607
 - general 0.0153852198664
 - percent 0.0117269736725
 - voters 0.0113829153162
 - president 0.0113410475929
 - bushs 0.00941891776388
 - polls 0.00902183891196
 - kerrys 0.00815442097599

Topic 3
 - people 0.00705243269259
 - years 0.00552496108838
 - war 0.00521619154586
 - american 0.00514834128773
 - bush 0.00468636073549
 - america 0.004684

Show the topic probabilities for a document

In [32]:
# eth is indexed as eth[document row, topic]; the captured output reports a
# shape of (number of documents, number of topics).
eth = lda.get_expect_theta()
print(eth.shape)
for doc in lda.corpus:

    print("Document: " + str(doc))
    doc_pos = lda.doc_index[doc]  # This is its row in eth
    for k in range(lda.K):
        # Only report topics with more than 1% weight in this document.
        if eth[doc_pos, k] > 0.01:
            print("Topic {} : {}".format(k, eth[doc_pos, k]))

    # Break the loop so as not to get all output
    break

(3430, 10)
Document: doc_875
Topic 0 : 0.582166113393
Topic 1 : 0.112190933781
Topic 2 : 0.303511344846
