In [3]:
import json
from pprint import pprint
from __future__ import division, print_function
from gensim import corpora, models, similarities, matutils
import re
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [4]:
with open('final_tweets_canv_trump.json', 'r') as f:
    tweets = json.load(f)
print(type(tweets[0]))
docs = []
for new_string1 in tweets:
    topic_list = new_string1.split()
    docs.append(topic_list)
print(docs[0])


<type 'unicode'>
[u'I', u'DONT', u'WANT', u'DEMS', u'BACK', u'IN', u'POWER', u'-BUT', u'IF', u'THIS', u'HC', u'BILL', u'PASSES', u'-ITS', u'A', u'BAD', u'BILL', u'-RATES', u'&amp;', u'DEDUCTS', u'WONT', u'GO', u'DOWN', u'@POTUS', u'&amp;\u2026', u'https://t.co/IULaU2fliu']


In [5]:
from gensim import corpora
dic = corpora.Dictionary(docs)
print(dic)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(5864 unique tokens: [u'trump!!', u'@StephenBannon', u'four', u'Does', u'https://t.co/qDyfj8GocG']...) from 1000 documents (total 16451 corpus positions)


Dictionary(5864 unique tokens: [u'trump!!', u'@StephenBannon', u'four', u'Does', u'https://t.co/qDyfj8GocG']...)


In [6]:
corpus = [dic.doc2bow(text) for text in docs]
print(type(corpus), len(corpus))

<type 'list'> 1000


In [7]:
for corp in corpus:
    print(len(corp), corp[:10])

25 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
10 [(14, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)]
19 [(16, 1), (29, 1), (30, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]
19 [(34, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1)]
21 [(14, 1), (26, 1), (34, 1), (50, 1), (59, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1)]
19 [(34, 1), (51, 1), (56, 1), (73, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1)]
18 [(26, 1), (51, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1)]
23 [(16, 2), (41, 1), (46, 1), (51, 1), (108, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1)]
19 [(14, 1), (41, 1), (98, 1), (124, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 1)]
23 [(29, 1), (34, 1), (46, 1), (101, 1), (123, 1), (148, 1), (149, 1), (150, 1), (151, 2), (152, 1)]
16 [(14, 1), (34, 1), (59, 1), (133, 1), (1

In [8]:
from gensim import models
tfidf = models.TfidfModel(corpus)
print(type(tfidf))

INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : calculating IDF weights for 1000 documents and 5863 features (15854 matrix non-zeros)


<class 'gensim.models.tfidfmodel.TfidfModel'>


In [9]:
corpus_tfidf = tfidf[corpus]
print(type(corpus_tfidf))

<class 'gensim.interfaces.TransformedCorpus'>


In [11]:
NUM_TOPICS = 10
model = models.ldamodel.LdaModel(corpus_tfidf, 
                                 num_topics=NUM_TOPICS, 
                                 id2word=dic, 
                                 update_every=1, 
                                 passes=100)

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 0.000170532060027
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 100 passes over the supplied corpus of 1000 documents, updating model once every 1000 documents, evaluating perplexity every 1000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : -28.151 per-word bound, 298042663.8 perplexity estimate based on a held-out corpus of 1000 documents with 3627 words
INFO : PROGRESS: pass 0, at document #1000/1000
INFO : topic #4 (0.100): 0.002*"the" + 0.002*"Trump" + 0.002*"to" + 0.002*"is" + 0.002*"and" + 0.002*"a" + 0.002*"@realDonaldTrump" + 0.002*"on" + 0.002*"@POTUS" + 0.002*"has"
INFO : topic #0 (0.100): 0.003*"is" + 0.003*"to" + 0.003*"the" + 0.003*"I" + 0.003*"a" + 0.003*"Trump" + 0.002*"you" + 0.002*"@realDonaldTrump" + 0.002*"and" + 0.002*"for"
INFO : topic #9 (0.100): 0.004*"#WomensMarch…" + 0.004*"#U2" + 0.004*"#Trumpleaks" + 0.004*"#TrumpRussia" + 

In [12]:
print("LDA model")
topics_found = model.print_topics(20)
counter = 1
for t in topics_found:
    print("Topic #{} {}".format(counter, t))
    counter += 1

INFO : topic #0 (0.100): 0.003*"is" + 0.003*"the" + 0.003*"to" + 0.003*"I" + 0.003*"a" + 0.003*"Trump" + 0.002*"you" + 0.002*"and" + 0.002*"@realDonaldTrump" + 0.002*"for"
INFO : topic #1 (0.100): 0.003*"@realDonaldTrump" + 0.002*"to" + 0.002*"the" + 0.002*"it" + 0.002*"and" + 0.002*"from" + 0.002*"@POTUS" + 0.002*"of" + 0.002*"in" + 0.002*"with"
INFO : topic #2 (0.100): 0.003*"Trump" + 0.002*"on" + 0.002*"a" + 0.002*"to" + 0.002*"of" + 0.002*"and" + 0.002*"in" + 0.002*"the" + 0.002*"&amp;" + 0.002*"team,"
INFO : topic #3 (0.100): 0.003*"The" + 0.003*"is" + 0.002*"the" + 0.002*"to" + 0.002*"a" + 0.002*"in" + 0.002*"of" + 0.002*"Joke" + 0.002*"Already" + 0.002*"@realDonaldTrump"
INFO : topic #4 (0.100): 0.002*"the" + 0.002*"Trump" + 0.002*"@realDonaldTrump" + 0.002*"to" + 0.002*"is" + 0.002*"and" + 0.002*"a" + 0.002*"@POTUS" + 0.002*"on" + 0.002*"Volunteer"
INFO : topic #5 (0.100): 0.002*"to" + 0.002*"you" + 0.002*"a" + 0.002*"the" + 0.002*"Trump's" + 0.002*"of" + 0.002*"is" + 0.002*"Tr

LDA model
Topic #1 (0, u'0.003*"is" + 0.003*"the" + 0.003*"to" + 0.003*"I" + 0.003*"a" + 0.003*"Trump" + 0.002*"you" + 0.002*"and" + 0.002*"@realDonaldTrump" + 0.002*"for"')
Topic #2 (1, u'0.003*"@realDonaldTrump" + 0.002*"to" + 0.002*"the" + 0.002*"it" + 0.002*"and" + 0.002*"from" + 0.002*"@POTUS" + 0.002*"of" + 0.002*"in" + 0.002*"with"')
Topic #3 (2, u'0.003*"Trump" + 0.002*"on" + 0.002*"a" + 0.002*"to" + 0.002*"of" + 0.002*"and" + 0.002*"in" + 0.002*"the" + 0.002*"&amp;" + 0.002*"team,"')
Topic #4 (3, u'0.003*"The" + 0.003*"is" + 0.002*"the" + 0.002*"to" + 0.002*"a" + 0.002*"in" + 0.002*"of" + 0.002*"Joke" + 0.002*"Already" + 0.002*"@realDonaldTrump"')
Topic #5 (4, u'0.002*"the" + 0.002*"Trump" + 0.002*"@realDonaldTrump" + 0.002*"to" + 0.002*"is" + 0.002*"and" + 0.002*"a" + 0.002*"@POTUS" + 0.002*"on" + 0.002*"Volunteer"')
Topic #6 (5, u'0.002*"to" + 0.002*"you" + 0.002*"a" + 0.002*"the" + 0.002*"Trump\'s" + 0.002*"of" + 0.002*"is" + 0.002*"Trump" + 0.002*"that" + 0.002*"on"')
Topi

In [13]:
from gensim import models
NUM_TOPICS = 10
model = models.lsimodel.LsiModel(corpus_tfidf,
                                 id2word=dic,
                                 num_topics=NUM_TOPICS
                                )

INFO : using serial LSI version on this node
INFO : updating model with new documents
INFO : preparing a new chunk of documents
INFO : using 100 extra samples and 2 power iterations
INFO : 1st phase: constructing (5864L, 110L) action matrix
INFO : orthonormalizing (5864L, 110L) action matrix
INFO : 2nd phase: running dense svd on (110L, 1000L) matrix
INFO : computing the final decomposition
INFO : keeping 10 factors (discarding 67.646% of energy spectrum)
INFO : processed documents up to #1000
INFO : topic #0(6.473): 0.320*"TRAITOR" + 0.320*"#blacklivesmatter" + 0.319*"IMPEACH" + 0.318*"#Trumpleaks" + 0.318*"#WomensMarch…" + 0.318*"#U2" + 0.311*"#TheResistance" + 0.311*"@MoveOn" + 0.307*"THE" + 0.305*"#TrumpRussia"
INFO : topic #1(3.365): 0.229*"the" + 0.228*"is" + 0.226*"to" + 0.205*"a" + 0.193*"you" + 0.187*"and" + 0.184*"Trump" + 0.171*"of" + 0.150*"@realDonaldTrump" + 0.146*"I"
INFO : topic #2(2.287): -0.287*"#MaraLago" + -0.287*"limits" + -0.287*"vacations:" + -0.287*"Demand" + -0

In [14]:
model.print_topics()

INFO : topic #0(6.473): 0.320*"TRAITOR" + 0.320*"#blacklivesmatter" + 0.319*"IMPEACH" + 0.318*"#Trumpleaks" + 0.318*"#WomensMarch…" + 0.318*"#U2" + 0.311*"#TheResistance" + 0.311*"@MoveOn" + 0.307*"THE" + 0.305*"#TrumpRussia"
INFO : topic #1(3.365): 0.229*"the" + 0.228*"is" + 0.226*"to" + 0.205*"a" + 0.193*"you" + 0.187*"and" + 0.184*"Trump" + 0.171*"of" + 0.150*"@realDonaldTrump" + 0.146*"I"
INFO : topic #2(2.287): -0.287*"#MaraLago" + -0.287*"limits" + -0.287*"vacations:" + -0.287*"Demand" + -0.287*"taxpayer" + -0.284*"golf" + -0.284*"trips" + -0.281*"spent" + -0.273*"money" + -0.248*"family"
INFO : topic #3(2.124): 0.283*"Placement" + 0.283*"Expose'" + 0.283*"Disturbing" + 0.283*"🌎" + 0.283*"Volunteer" + 0.283*"https://t.co/wvnVOEvrhT" + 0.283*"FACTS" + 0.283*"Refugee" + 0.278*"UN" + 0.263*"A"
INFO : topic #4(2.023): -0.271*"spied" + -0.266*"administration" + -0.263*"team," + -0.242*"'smoking" + -0.242*"source" + -0.242*"Potential" + -0.242*"gun'" + -0.241*"showing" + -0.225*"says" 

[(0,
  u'0.320*"TRAITOR" + 0.320*"#blacklivesmatter" + 0.319*"IMPEACH" + 0.318*"#Trumpleaks" + 0.318*"#WomensMarch\u2026" + 0.318*"#U2" + 0.311*"#TheResistance" + 0.311*"@MoveOn" + 0.307*"THE" + 0.305*"#TrumpRussia"'),
 (1,
  u'0.229*"the" + 0.228*"is" + 0.226*"to" + 0.205*"a" + 0.193*"you" + 0.187*"and" + 0.184*"Trump" + 0.171*"of" + 0.150*"@realDonaldTrump" + 0.146*"I"'),
 (2,
  u'-0.287*"#MaraLago" + -0.287*"limits" + -0.287*"vacations:" + -0.287*"Demand" + -0.287*"taxpayer" + -0.284*"golf" + -0.284*"trips" + -0.281*"spent" + -0.273*"money" + -0.248*"family"'),
 (3,
  u'0.283*"Placement" + 0.283*"Expose\'" + 0.283*"Disturbing" + 0.283*"\U0001f30e" + 0.283*"Volunteer" + 0.283*"https://t.co/wvnVOEvrhT" + 0.283*"FACTS" + 0.283*"Refugee" + 0.278*"UN" + 0.263*"A"'),
 (4,
  u'-0.271*"spied" + -0.266*"administration" + -0.263*"team," + -0.242*"\'smoking" + -0.242*"source" + -0.242*"Potential" + -0.242*"gun\'" + -0.241*"showing" + -0.225*"says" + -0.221*"Obama"'),
 (5,
  u'-0.314*"you" + 0.