In [24]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
from gensim.models import Phrases
from sklearn import decomposition
import pyLDAvis.gensim as gensimvis
import pyLDAvis

# initialize tokenizer and stopwords
en_stop = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't", 'as',
           'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't", 'cannot',
           'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each',
           'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having', 'he', "he'd",
           "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how', "how's", 'i', "i'd",
           "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's", 'me', 'more',
           'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'ought',
           'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd", "she'll", "she's", 'should',
           "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then',
           'there', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'this', 'those', 'through', 'to',
           'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we', "we'd", "we'll", "we're", "we've", 'were', "weren't",
           'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's",
           'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself',
           'yourselves', 'apos', 's', 'I', 'will', 'go', 'get', '(', ')', '?', ':', ';', ',', '.', '!', '/', '"', "'", "...",
           "``", "&apos", "&apos;s", "&apos;&apos;", "-lsb-", "-rsb-", "-lcb-", "-rcb-", "-lrb-", "-rrb-", "O&apos;MALLEY", "--"]

stop_chars = ['<', '>']

# get all lemmas between a <story>-</story>-pair:
stories = []
with open('3-4-5.txt') as infile:
    for line in infile:
        l = line.rstrip()
        if l == "<story>":
            story = []
        elif l == "</story>":
            stories.append(story)
            story = []
        elif not any(stop_char in l for stop_char in stop_chars):
            if l not in en_stop:
                story.append(l)



In [1]:
#print(stories)

In [25]:
# create dictionary and wordcounts corpus:
dictionary = corpora.Dictionary(stories)
#print(dictionary.token2id)
dictionary.save("wordcounts.dict")
print(len(dictionary))

# Bag-of-words representation of the stories.
corpus = [dictionary.doc2bow(story) for story in stories]
corpora.MmCorpus.serialize("corpus.mm", corpus)
print(len(corpus))
print(len(stories))


28920
3174
3174


In [26]:
# create tf.idf model:
tfidf_model = models.TfidfModel(corpus)
tfidf_model.save("tfidf_model")
tfidf_corpus = tfidf_model[corpus]
tfidf_corpus
corpora.MmCorpus.serialize("tfidf_corpus.mm", tfidf_corpus)


In [16]:
# create topic models:
# LSI
%time lsi_model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=10) # initialize an LSI transformation
lsi_corpus = lsi_model[tfidf_corpus]
print("\nTopics by Latent Sementic Indexing model")
topics_found_lsi = lsi_model.print_topics(num_topics=3, num_words=5)

from crawler import search
import re

counter = 1
for t in topics_found_lsi:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    for url in search(words, stop=5):
        print(url)
    counter += 1

CPU times: user 2.98 s, sys: 242 ms, total: 3.22 s
Wall time: 3.16 s

Topics by Latent Sementic Indexing model
Topic #1 (0, '0.141*"year" + 0.125*"know" + 0.122*"new" + 0.118*"people" + 0.114*"say"')
year know new people say
http://www.npr.org/sections/health-shots/2017/06/19/533269211/can-you-find-the-defibrillator-at-work-half-of-people-say-no
https://www.forbes.com/sites/dandiamond/2013/01/01/just-8-of-people-achieve-their-new-years-resolutions-heres-how-they-did-it/
https://www.inc.com/marcel-schwantes/science-says-92-percent-of-people-dont-achieve-goals-heres-how-the-other-8-perce.html
https://www.buzzfeed.com/carolynkylstra/impress-literally-everyone-you-meet
http://people.com/babies/kendra-wilkinson-baskett-7-year-old-son-knows-about-sex/
http://www.nbcnewyork.com/news/local/Richard-Rojas-Times-Square-Car-Plow-Pedestrian-Death-Injuries-Investigation-Evacuation-NYPD-Drugs-Alcohol-423010814.html
Topic #2 (1, '-0.346*"fire" + -0.246*"Dubai" + -0.225*"hotel" + -0.180*"building" + -0

In [None]:
# LDA
num_topics = 100

%time lda_model = models.ldamodel.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=num_topics, update_every=0, chunksize=10000, passes=20)
lda_model.save("lda_model")
lda_corpus = lda_model[corpus]
#corpora.MmCorpus.serialize("lda_corpus.mm", lda_corpus)

print("\nTopics by Latent Dirichlet Allocation model")
topics_found_lda = lda_model.print_topics(num_topics=5, num_words=10)

print(topics_found_lda)


In [15]:
from crawler import search
import re

counter = 1
for t in topics_found_lda:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    for url in search(words, stop=3):
        print(url)
    counter += 1
    

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.

top_topics = lda_model.top_topics(corpus, num_words=20)
avg_topic_coherence = sum([t[1] for t in top_topics ]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

data_vis_lda = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(data_vis_lda)


Topic #1 (66, '0.004*"1993" + 0.003*"capital" + 0.003*"reopen" + 0.003*"closed" + 0.003*"universal"')
1993 capital reopen closed universal
https://en.wikipedia.org/wiki/List_of_former_Universal_Studios_Florida_attractions
https://en.wikipedia.org/wiki/Universal_CityWalk
https://www.wired.com/1993/03/drucker-2/
Topic #2 (72, '0.015*"couch" + 0.012*"Ethan" + 0.011*"Mexico" + 0.009*"probation" + 0.008*"violate"')
couch Ethan Mexico probation violate
https://www.dallasnews.com/news/crime/2015/12/03/tarrant-da-investigating-twitter-video-claiming-to-show-affluenza-teen-ethan-couch-playing-beer-pong
http://www.cnn.com/2016/04/13/us/texas-affluenza-ethan-couch/index.html
http://www.cnn.com/2015/12/29/us/affluenza-teen-ethan-couch-detained-in-mexico/index.html
Topic #3 (81, '0.003*"tax" + 0.003*"IRS" + 0.002*"balance" + 0.002*"billow" + 0.002*"twist"')
tax IRS balance billow twist
https://www.irs.gov/uac/view-your-tax-account
https://www.irs.gov/uac/three-ways-to-pay-your-federal-income-tax
ht

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]


In [19]:
# HDP
%time hdp_model = models.hdpmodel.HdpModel(corpus, dictionary, T=10)
hdp_model.save("hdp_model")

print("\nTopics by Hierarchical Dirichlet process model")
topics_found_hdp = hdp_model.print_topics(num_topics=10, num_words=10)



CPU times: user 6.92 s, sys: 98 ms, total: 7.01 s
Wall time: 7.11 s

Topics by Hierarchical Dirichlet process model


In [20]:
counter = 1
for t in topics_found_hdp:
    print("Topic #{} {}".format(counter, t))
    counter += 1

Topic #1 (0, '0.007*year + 0.005*say + 0.005*know + 0.005*like + 0.004*new + 0.004*can + 0.004*one + 0.004*people + 0.004*see + 0.004*just')
Topic #2 (1, '0.002*year + 0.002*know + 0.001*people + 0.001*say + 0.001*look + 0.001*can + 0.001*see + 0.001*just + 0.001*one + 0.001*like')
Topic #3 (2, '0.001*year + 0.001*right + 0.001*one + 0.001*can + 0.001*just + 0.001*like + 0.001*time + 0.001*-- + 0.001*think + 0.001*new')
Topic #4 (3, '0.001*year + 0.001*say + 0.001*people + 0.001*new + 0.001*know + 0.001*-- + 0.001*like + 0.001*can + 0.001*come + 0.001*one')
Topic #5 (4, '0.000*outbreak + 0.000*turkey + 0.000*Den + 0.000*barack + 0.000*113 + 0.000*mightier + 0.000*filing + 0.000*heroic + 0.000*five-year + 0.000*ody')
Topic #6 (5, '0.001*need + 0.000*new + 0.000*Mber + 0.000*Sri + 0.000*slingblade + 0.000*democrattatic + 0.000*P.H. + 0.000*lottery + 0.000*switcher + 0.000*Rapids')
Topic #7 (6, '0.001*presidential + 0.000*diligent + 0.000*diploma + 0.000*alter + 0.000*MERAMEC + 0.000*MISS

In [18]:
counter = 1
for t in topics_found_hdp:
    print("Topic #{} {}".format(counter, t))
    words = re.findall('"([^"]+)"', t[1])
    words = ' '.join(words)
    print(words)
    for url in search(words, stop=3):
        print(url)
    counter += 1
    
vis_hdp = gensimvis.prepare(hdp_model, corpus, dictionary)
pyLDAvis.display(vis_hdp)

Topic #1 (0, '0.005*year + 0.004*say + 0.004*people + 0.004*know + 0.004*new + 0.004*can + 0.003*one + 0.003*just + 0.003*see + 0.003*now')



AttributeError: 'NoneType' object has no attribute 'findAll'