In [130]:
import json_lines
import csv

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d

if False:
    with open('congress_dataset/senators-1.jsonl', 'rb') as f:
        with open(r'senators-1-tweets.csv', 'a') as file:
            writer = csv.writer(file)
            for item in json_lines.reader(f):
                # Only collect tweets in English
                if item['lang'] == 'en' and len(item['entities']['hashtags']) > 0:
                    tweet_data = process_tweet(item)
                    writer.writerow(list(tweet_data.values()))


In [131]:
import pandas as pd
tweets = pd.read_csv("senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))
tweets.head()

num tweets: 224667


Unnamed: 0,hashtags,text,user,user_location,created_at
0,['GOTV'],Doug Jones greeting the crowd after @StP_Broke...,DougJones,"Birmingham, Alabama",Sun Dec 10 02:48:38 +0000 2017
1,['GOTV'],“Each of us in our time is supposed to do what...,DougJones,"Birmingham, Alabama",Sun Dec 10 02:58:43 +0000 2017
2,['GOTV4Doug'],Thank you @StP_BrokenBones for your endorsemen...,DougJones,"Birmingham, Alabama",Sun Dec 10 03:56:23 +0000 2017
3,"['VoteDec12', 'GOTV4Doug']",Thank you to great Alabamians Mike Cooley and ...,DougJones,"Birmingham, Alabama",Mon Dec 11 00:47:50 +0000 2017
4,"['GOTV4Doug', 'CharlesBarkley', 'RightSideOfHi...",Backstage with Louise and me at the final #GOT...,DougJones,"Birmingham, Alabama",Tue Dec 12 01:06:29 +0000 2017


In [132]:
import spacy
nlp = spacy.load('en')

In [133]:
docs = []
N = 50000
for i in range(N):
    docs.append(nlp(tweets.iloc[i]['text']))


In [134]:
stop_words = ['senator']

In [135]:
# Clean tweets here
cleaned_tweets = []
cleaned_tweets_text = []
for doc in docs:
    curr_tweet = []
    for w in doc:
        if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num:
            curr_tweet.append(w.lemma_) # add lemmatized version of the word
    cleaned_tweets.append(curr_tweet)
    cleaned_tweets_text.append(' '.join(curr_tweet))

In [136]:
# here we should get only cleaned tweets
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

bigram = gensim.models.Phrases(cleaned_tweets)

In [137]:
cleaned_tweets = [bigram[t] for t in cleaned_tweets]

In [138]:
# create dictionary and corpus
dictionary = Dictionary(cleaned_tweets)
corpus = [dictionary.doc2bow(clean_tween) for clean_tween in cleaned_tweets]

In [139]:
#### LSI MODEL basically SVD / Principal component analysis

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [140]:
lsimodel.show_topics(num_topics=5)

[(0,
  '0.744*"amp" + 0.541*"-PRON-" + 0.137*"today" + 0.086*"help" + 0.085*"work" + 0.084*"rt" + 0.076*"\'s" + 0.075*"’s" + 0.071*"need" + 0.063*"family"'),
 (1,
  '0.776*"-PRON-" + -0.614*"amp" + 0.050*"need" + 0.046*"today" + 0.044*"not" + 0.032*"\n\n" + 0.027*" " + 0.025*"rt" + 0.024*"have" + 0.022*"bill"'),
 (2,
  '-0.788*"rt" + -0.350*"today" + 0.212*"-PRON-" + -0.199*"\'s" + 0.168*"amp" + -0.145*"thank" + -0.111*"the" + -0.106*" " + -0.082*"’s" + -0.077*"-PRON-_be"'),
 (3,
  '-0.785*"today" + 0.515*"rt" + -0.168*"’s" + -0.146*"\'s" + 0.121*"-PRON-" + 0.097*"amp" + -0.070*"great" + -0.061*"-PRON-_be" + -0.054*"the" + -0.050*"day"'),
 (4,
  '0.450*"the" + -0.369*"today" + 0.299*"help" + 0.296*"work" + -0.209*"rt" + 0.204*"’s" + 0.187*"-PRON-_be" + 0.183*"bill" + 0.180*"\n\n" + 0.165*"need"')]

In [141]:
# HDP - Hierarchical Dirichlet process 
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics()

  start_time = time.clock()


[(0,
  "0.022*amp + 0.020*-PRON- + 0.007*today + 0.006*rt + 0.005*'s + 0.005*work + 0.005*the + 0.005*help + 0.005*’s + 0.004*-PRON-_be + 0.004*need + 0.003*bill + 0.003*support + 0.003*family + 0.003*thank + 0.003*not + 0.003*\n\n + 0.003*  + 0.003*great + 0.002*year"),
 (1,
  "0.021*amp + 0.019*-PRON- + 0.007*today + 0.005*rt + 0.005*help + 0.005*'s + 0.004*work + 0.004*the + 0.004*’s + 0.004*-PRON-_be + 0.004*support + 0.003*need + 0.003*thank + 0.003*family + 0.003*bill + 0.003*not + 0.003*  + 0.003*great + 0.002*this + 0.002*year"),
 (2,
  "0.019*amp + 0.017*-PRON- + 0.007*today + 0.006*rt + 0.005*the + 0.005*'s + 0.005*’s + 0.004*help + 0.003*need + 0.003*work + 0.003*-PRON-_be + 0.003*bill + 0.003*  + 0.003*not + 0.002*support + 0.002*thank + 0.002*family + 0.002*w/ + 0.002*vote + 0.002*\n\n"),
 (3,
  "0.016*amp + 0.013*-PRON- + 0.005*rt + 0.005*today + 0.004*the + 0.004*'s + 0.003*work + 0.003*’s + 0.003*need + 0.003*help + 0.003*-PRON-_be + 0.002*not + 0.002*bill + 0.002*  + 0

In [142]:
# LDA 
ldamodel = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
ldamodel.show_topics()

[(19,
  '0.032*"-PRON-_will" + 0.023*"talk" + 0.023*"watch" + 0.022*"tonight" + 0.022*"tune" + 0.020*"tomorrow" + 0.020*"rt" + 0.019*"join" + 0.016*"name" + 0.016*"\'s"'),
 (0,
  '0.036*"amp" + 0.016*"business" + 0.014*"free" + 0.012*"worker" + 0.011*"small_business" + 0.010*"south_carolina" + 0.010*"area" + 0.010*"icymi_senator" + 0.009*"company" + 0.009*"prepare"'),
 (1,
  '0.043*"rt" + 0.025*"epa" + 0.023*"https://t" + 0.022*"hold" + 0.019*"congratulation" + 0.018*"icymi" + 0.014*"staff" + 0.013*"htt" + 0.011*"https://" + 0.011*"+"'),
 (2,
  '0.065*"obamacare" + 0.022*"rt" + 0.020*"budget" + 0.020*"tbt" + 0.015*"\'s" + 0.013*"resolution" + 0.013*"house" + 0.012*"summer" + 0.012*"tax" + 0.011*"food"'),
 (12,
  '0.051*"rt_@senateaggop" + 0.022*"look_forward" + 0.019*"ny" + 0.019*"amp" + 0.014*"-PRON-" + 0.013*"\'s" + 0.012*"sen." + 0.012*"@senjohnthune" + 0.011*"terrorist" + 0.011*"supreme_court"'),
 (18,
  '0.052*"on" + 0.031*"woman" + 0.021*"nation" + 0.019*"man" + 0.019*"@potu" + 0

In [143]:
# use lda and nmf in sklearn

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [144]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(cleaned_tweets_text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(cleaned_tweets_text)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
pron let time fight stand know need support vote continue
Topic 1:
http marcorubio senator obamacare video icymi nm visit new read
Topic 2:
rt markwarner senwhitehouse senateaggop senatortimscott sen senatortester senator senpatrobert join
Topic 3:
amp join proud protect nm nytownoftheday support ri discuss community
Topic 4:
https happy read day visit new marcorubio live utpol week
Topic 5:
senate floor watch vote pass speak live house head bipartisan
Topic 6:
today day year honor good ago celebrate remember luck join
Topic 7:
thank support service work happy leadership veteran ri senwhitehouse birthday
Topic 8:
work help family need health tax care job americans taxreform
Topic 9:
great news meet student morning congrat talk meeting discussion visit
Topic 0:
job new pron https obamacare business economy country help america
Topic 1:
rt https meet important act pron tune iran report read
Topic 2:
https year health woman good care pron today sign need
Topic 3:
http rt amp sena

In [145]:
tfidf_feature_names[2], tf_feature_names[2]


('2nd', '2nd')

In [146]:
for line in tf:
    print(line)
    break

  (0, 413)	1
  (0, 701)	1


In [147]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
