In [1]:
import sys
sys.path.append('..')
import logging

import pyLDAvis.gensim as gensimvis
import pyLDAvis
from src.models.corpus import HackernewsCorpus
from gensim import utils, models
from gensim.models.wrappers import ldamallet
from src.data.load_data import get_hackernews_files, load_hackernews_dataframe

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

In [2]:
# Load one HackerNews DataFrame
files = get_hackernews_files()
df = load_hackernews_dataframe(files[-1])

In [3]:
# Focus on the stories
comments_df = df[df.type == "comment"]
comments_df = comments_df.dropna(subset=['text'])
comments = comments_df['text']

# Set up the streamed corpus
corpus = HackernewsCorpus(comments)

2017-12-27 10:17:26,001 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-12-27 10:17:28,836 : INFO : adding document #10000 to Dictionary(26163 unique tokens: ['bitch', 'showcases', 'reversible', 'dataloader', 'geekwire']...)
2017-12-27 10:17:31,794 : INFO : adding document #20000 to Dictionary(38044 unique tokens: ['bitch', 'showcases', 'reversible', 'dataloader', 'enduring']...)
2017-12-27 10:17:34,878 : INFO : adding document #30000 to Dictionary(46157 unique tokens: ['tiobe', 'dataloader', 'theyre', 'symbasync', 'kennedy']...)
2017-12-27 10:17:37,692 : INFO : adding document #40000 to Dictionary(52718 unique tokens: ['tiobe', 'dataloader', 'theyre', 'symbasync', 'kennedy']...)
2017-12-27 10:17:41,083 : INFO : adding document #50000 to Dictionary(58865 unique tokens: ['tiobe', 'dataloader', 'theyre', 'symbasync', 'kennedy']...)
2017-12-27 10:17:43,923 : INFO : adding document #60000 to Dictionary(64014 unique tokens: ['tiobe', 'dataloader', 'theyre', 'symbasync', 

In [4]:
# Train 10 LDA topics using MALLET
mallet_path = '/home/madness/Programs/mallet-2.0.6/bin/mallet'
model = models.wrappers.LdaMallet(mallet_path, corpus, num_topics=10, id2word=corpus.dictionary)

2017-12-27 10:18:23,059 : INFO : serializing temporary corpus to /tmp/b13b75_corpus.txt
2017-12-27 10:19:32,386 : INFO : converting temporary corpus to MALLET format with /home/madness/Programs/mallet-2.0.6/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/b13b75_corpus.txt --output /tmp/b13b75_corpus.mallet
2017-12-27 10:19:46,572 : INFO : training MALLET LDA with /home/madness/Programs/mallet-2.0.6/bin/mallet train-topics --input /tmp/b13b75_corpus.mallet --num-topics 10  --alpha 50 --optimize-interval 0 --num-threads 4 --output-state /tmp/b13b75_state.mallet.gz --output-doc-topics /tmp/b13b75_doctopics.txt --output-topic-keys /tmp/b13b75_topickeys.txt --num-iterations 1000 --inferencer-filename /tmp/b13b75_inferencer.mallet --doc-topics-threshold 0.0
2017-12-27 10:32:39,372 : INFO : loading assigned topics from /tmp/b13b75_state.mallet.gz


In [5]:
# Predict the topics of a document
doc = "As someone primarily interested in interpretation of deep models, I strongly resonate with this warning against anthropomorphization of neural networks. Deep learning isn't special; deep models tend to be more accurate than other methods, but fundamentally they aren't much closer to working like the human brain than e.g. gradient boosting models."
bow = corpus.dictionary.doc2bow(utils.simple_preprocess(doc))
print(model[bow])

2017-12-27 10:33:28,550 : INFO : serializing temporary corpus to /tmp/b13b75_corpus.txt
2017-12-27 10:33:28,564 : INFO : converting temporary corpus to MALLET format with /home/madness/Programs/mallet-2.0.6/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/b13b75_corpus.txt --output /tmp/b13b75_corpus.mallet.infer --use-pipe-from /tmp/b13b75_corpus.mallet
2017-12-27 10:33:37,731 : INFO : inferring topics with MALLET LDA '/home/madness/Programs/mallet-2.0.6/bin/mallet infer-topics --input /tmp/b13b75_corpus.mallet.infer --inferencer /tmp/b13b75_inferencer.mallet --output-doc-topics /tmp/b13b75_doctopics.txt.infer --num-iterations 100 --doc-topics-threshold 0.0'


[(8, 0.28205128205128205), (9, 0.11538461538461539), (0, 0.0868945868945869), (6, 0.08547008547008547), (5, 0.08262108262108261), (2, 0.07407407407407407), (7, 0.07122507122507123), (4, 0.06837606837606838), (3, 0.06837606837606838), (1, 0.06552706552706553)]


In [6]:
with open(model.ftopickeys()) as input:
    topic_keys_lines = input.read()
print(topic_keys_lines)

0	5	data don system service security access network internet run doesn server case account running information key services hardware set 
1	5	google open web app source support windows user linux facebook users game apple version apps site content ve file 
2	5	time ve don people years good things work day lot back ll find long thing didn feel life days 
3	5	cost high car low bitcoin power price space amount buy small costs big cars year large expensive higher long 
4	5	https href rel nofollow www http org news github en wiki wikipedia amp html id ycombinator item article blog 
5	5	quot gt don people point article comment doesn person read wrong question bad thing isn understand talking women kind 
6	5	work money company companies business job market make pay time product software good working tech experience team full jobs 
7	5	people gt world government don public care law state social country free china society legal tax countries poor system 
8	5	problem problems point human real th

In [41]:
corpus_list = [corpus.dictionary.doc2bow(text.split(' ')) for text in corpus.documents]

In [45]:
wrapped_model = ldamallet.malletmodel2ldamodel(model)

2017-12-27 11:07:32,831 : INFO : using symmetric eta at 3.0274590536162998e-05
2017-12-27 11:07:32,847 : INFO : using serial LDA version on this node


In [52]:
wrapped_model.print_topics()

2017-12-27 11:18:56,311 : INFO : topic #0 (5.000): 0.000*"washing" + 0.000*"interview" + 0.000*"explainer" + 0.000*"freebsd" + 0.000*"rounding" + 0.000*"inability" + 0.000*"fantastically" + 0.000*"priv" + 0.000*"ola" + 0.000*"prying"
2017-12-27 11:18:56,313 : INFO : topic #1 (5.000): 0.000*"infarction" + 0.000*"equations" + 0.000*"san" + 0.000*"daemon" + 0.000*"ostracise" + 0.000*"corporatism" + 0.000*"erodes" + 0.000*"anonymously" + 0.000*"ambulance" + 0.000*"prefecture"
2017-12-27 11:18:56,317 : INFO : topic #2 (5.000): 0.000*"floodgates" + 0.000*"monies" + 0.000*"algebras" + 0.000*"init" + 0.000*"hoodies" + 0.000*"laugh" + 0.000*"elect" + 0.000*"glyphs" + 0.000*"gracious" + 0.000*"infotainment"
2017-12-27 11:18:56,320 : INFO : topic #3 (5.000): 0.000*"profits" + 0.000*"contention" + 0.000*"anime" + 0.000*"interactivity" + 0.000*"skyscraper" + 0.000*"infamous" + 0.000*"between" + 0.000*"dancing" + 0.000*"swath" + 0.000*"everything"
2017-12-27 11:18:56,322 : INFO : topic #4 (5.000): 0

[(0,
  '0.000*"washing" + 0.000*"interview" + 0.000*"explainer" + 0.000*"freebsd" + 0.000*"rounding" + 0.000*"inability" + 0.000*"fantastically" + 0.000*"priv" + 0.000*"ola" + 0.000*"prying"'),
 (1,
  '0.000*"infarction" + 0.000*"equations" + 0.000*"san" + 0.000*"daemon" + 0.000*"ostracise" + 0.000*"corporatism" + 0.000*"erodes" + 0.000*"anonymously" + 0.000*"ambulance" + 0.000*"prefecture"'),
 (2,
  '0.000*"floodgates" + 0.000*"monies" + 0.000*"algebras" + 0.000*"init" + 0.000*"hoodies" + 0.000*"laugh" + 0.000*"elect" + 0.000*"glyphs" + 0.000*"gracious" + 0.000*"infotainment"'),
 (3,
  '0.000*"profits" + 0.000*"contention" + 0.000*"anime" + 0.000*"interactivity" + 0.000*"skyscraper" + 0.000*"infamous" + 0.000*"between" + 0.000*"dancing" + 0.000*"swath" + 0.000*"everything"'),
 (4,
  '0.000*"yuan" + 0.000*"vascular" + 0.000*"christianity" + 0.000*"cyclist" + 0.000*"disagreeable" + 0.000*"ltsb" + 0.000*"deflect" + 0.000*"dcom" + 0.000*"ear" + 0.000*"wt"'),
 (5,
  '0.000*"jest" + 0.000*"

In [46]:
vis_data = gensimvis.prepare(wrapped_model, corpus_list, corpus.dictionary)
pyLDAvis.display(vis_data)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


As one can tell from the output of both topic distributions, it seems like the wrapped models topics completely differ from the fitted MALLET model. This means that we have to directly visualize the topics of the MALLET model without wrapping it.