In [155]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# initialize tokenizer and stopwords
tokenizer = RegexpTokenizer(r'\w+')
en_stop = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
           'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot',
           'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each',
           'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd",
           "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
           "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
           'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
           'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
           "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
           'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
           'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
           'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",
           'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
           'yourselves', 'apos', 's', 'I', 'will', 'go', 'get', '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'", "...",
           "``", "&apos", "&apos;s", "&apos;&apos;"]

stop_chars = ['<', '>']

# get all lemmas between a <story>-</story>-pair:
stories = []
with open('2016-01.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            stories.append(story)
            story = []
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)



In [141]:
print(stories)

[['rocket'], ['faster', 'gyro', 'copter', 'powerful', 'pizza', 'rat', 'hit', 'harder', 'Holly', 'Holm', 'year', 'countless', 'victim', 'politics', 'sport', 'pop', 'culture', 'music', 'movie', 'take', 'league', 'Super', 'hero', 'actress', 'Melissa', 'Joan', 'Hart', 'CNN', 'quest', 'mean', 'business', 'Richard', 'Quest', 'model', 'reality', 'tv', 'star', 'Carmen', 'Carrera', 'actor', 'singer', 'Titus', 'Burgess', 'morning', 'radio', 'show', 'Bethany', 'Watson', 'CNN', 'Anchor', 'John', 'Berman', 'fusion', 'shut', 'Paul', 'F.', 'Tomkins', 'hln', 'morning', 'express', 'Robin', 'Meade', 'best', 'worst', '2015'], ['welcome', 'Tom', 'foreman', 'use', 'superhero', 'help', 'challenge', 'year', 'exasperating', 'exhilarating', 'comical', 'next', 'hour', 'start', 'best', 'worst', 'world', 'big', 'news'], ['year', 'first', 'think', 'lot', 'people', 'predict', 'lot', 'thing', 'wrong', 'can', 'believe', 'end', 'year', 'already', 'weird', 'weird', 'stuff', 'happen', 'kind', 'horrifying', 'term', 'head

In [156]:
# create dictionary and wordcounts corpus:
dictionary = corpora.Dictionary(stories)
#print(dictionary.token2id)
dictionary.save("wordcounts.dict")
print(len(dictionary))

# Bag-of-words representation of the stories.
corpus = [dictionary.doc2bow(story) for story in stories]
corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus))
print(len(stories))


113856
42262
42262


1525
14
14


In [157]:
# create tf.idf model:
tfidf_model = models.TfidfModel(corpus)
tfidf_model.save("tfidf_model")
tfidf_corpus = tfidf_model[corpus]
tfidf_corpus
corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)


In [159]:
# create topic models:
# LSI
%time lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=100) # initialize an LSI transformation
lsi_corpus = lsi_model[tfidf_corpus]
print(lsi_model.print_topics(10, 10))


CPU times: user 1min 10s, sys: 6.19 s, total: 1min 17s
Wall time: 59.1 s
[(0, '0.139*"think" + 0.128*"say" + 0.120*"Cruz" + 0.116*"people" + 0.113*"Iowa" + 0.110*"Clinton" + 0.109*"know" + 0.104*"trump" + 0.103*"Trump" + 0.102*"can"'), (1, '-0.282*"Cruz" + -0.239*"Iowa" + -0.221*"Clinton" + -0.217*"trump" + -0.214*"Trump" + -0.207*"Donald" + -0.167*"Sanders" + -0.162*"Ted" + -0.150*"Hillary" + -0.143*"debate"'), (2, '0.259*"snow" + 0.246*"rain" + -0.209*"®" + -0.200*"doctor" + 0.176*"storm" + -0.139*"Humira" + -0.136*"blood" + -0.129*"infection" + 0.127*"temperature" + 0.125*"morning"'), (3, '-0.204*"®" + 0.187*"gun" + -0.171*"doctor" + 0.163*"president" + 0.163*"Iran" + -0.160*"snow" + -0.144*"rain" + -0.131*"Cruz" + -0.123*"Humira" + 0.121*"police"'), (4, '-0.409*"Clinton" + -0.358*"Sanders" + 0.323*"Cruz" + -0.279*"Bernie" + -0.262*"Hillary" + 0.210*"trump" + 0.199*"Trump" + 0.195*"Donald" + 0.194*"Ted" + -0.145*"gun"'), (5, '-0.344*"Chapo" + -0.293*"El" + 0.223*"Iran" + 0.218*"wate

In [84]:
# LDA
num_topics = 100

%time lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=20000, passes=20)
lda_model.save("lda_model")
lda_corpus = lda_model[corpus]
corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=10, num_words=10)
counter = 1
for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    counter += 1
    

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.

top_topics = lda_model.top_topics(corpus, num_words=20)
avg_topic_coherence = sum([t[1] for t in top_topics ]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

data_vis_lda = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(data_vis_lda)


CPU times: user 3.09 s, sys: 15.9 ms, total: 3.1 s
Wall time: 3.12 s

Topics by Latent Dirichlet Allocation model
Topic #1 (0, '0.001*"like" + 0.001*"best" + 0.001*"worst" + 0.001*"year" + 0.001*"think" + 0.001*"know" + 0.001*"can" + 0.001*"see" + 0.001*"say" + 0.001*"come"')
Topic #2 (1, '0.015*"like" + 0.015*"people" + 0.014*"think" + 0.011*"even" + 0.009*"many" + 0.009*"way" + 0.008*"know" + 0.008*"come" + 0.008*"just" + 0.008*"voter"')
Topic #3 (2, '0.013*"worst" + 0.011*"year" + 0.010*"know" + 0.010*"one" + 0.010*"woman" + 0.009*"best" + 0.009*"come" + 0.008*"really" + 0.006*"can" + 0.006*"just"')
Topic #4 (3, '0.013*"best" + 0.013*"like" + 0.013*"thing" + 0.009*"moment" + 0.009*"way" + 0.009*"dark" + 0.009*"hell" + 0.009*"find" + 0.005*"year" + 0.005*"show"')
Topic #5 (4, '0.001*"know" + 0.001*"like" + 0.001*"year" + 0.001*"best" + 0.001*"one" + 0.001*"come" + 0.001*"just" + 0.001*"say" + 0.001*"worst" + 0.001*"think"')
Topic #6 (5, '0.012*"like" + 0.012*"year" + 0.011*"best" + 0

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [160]:
# HDP
%time hdp_model = models.hdpmodel.HdpModel(corpus, dictionary, T=30)
hdp_model.save("hdp_model")

print("\nTopics by Hierarchical Dirichlet process model")
topics_found_hdp = hdp_model.print_topics(num_topics=10, num_words=5)
counter = 1
for t in topics_found_hdp:
    print("Topic #{} {}".format(counter, t))
    counter += 1
vis_hdp = gensimvis.prepare(hdp_model, corpus, dictionary)
pyLDAvis.display(vis_hdp)


CPU times: user 4min 14s, sys: 11.2 s, total: 4min 25s
Wall time: 4min 36s

Topics by Hierarchical Dirichlet process model
Topic #1 (0, '0.011*say + 0.008*think + 0.007*can + 0.007*people + 0.007*know')
Topic #2 (1, '0.007*say + 0.007*see + 0.007*now + 0.006*can + 0.006*one')
Topic #3 (2, '0.008*say + 0.005*now + 0.005*one + 0.005*year + 0.004*people')
Topic #4 (3, '0.006*say + 0.005*people + 0.005*year + 0.004*new + 0.004*one')
Topic #5 (4, '0.004*year + 0.004*say + 0.004*new + 0.003*like + 0.003*can')
Topic #6 (5, '0.001*year + 0.001*new + 0.001*come + 0.001*right + 0.001*now')
Topic #7 (6, '0.001*say + 0.001*like + 0.001*come + 0.001*see + 0.001*know')
Topic #8 (7, '0.001*year + 0.001*can + 0.001*new + 0.001*know + 0.001*come')
Topic #9 (8, '0.001*year + 0.001*come + 0.001*new + 0.001*one + 0.001*know')
Topic #10 (9, '0.001*year + 0.001*-- + 0.001*new + 0.001*right + 0.000*come')


  doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]


_io.TextIOWrapper