In [1]:
import json_lines
import csv

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d

if False:
    with open('congress_dataset/senators-1.jsonl', 'rb') as f:
        with open(r'senators-1-tweets.csv', 'a') as file:
            writer = csv.writer(file)
            for item in json_lines.reader(f):
                # Only collect tweets in English
                if item['lang'] == 'en' and len(item['entities']['hashtags']) > 0:
                    tweet_data = process_tweet(item)
                    writer.writerow(list(tweet_data.values()))


In [2]:
import pandas as pd
tweets = pd.read_csv("senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))
tweets.head(10)

num tweets: 449334


Unnamed: 0,hashtags,text,user,user_location,created_at
0,['GOTV'],Doug Jones greeting the crowd after @StP_Broke...,DougJones,"Birmingham, Alabama",Sun Dec 10 02:48:38 +0000 2017
1,['GOTV'],“Each of us in our time is supposed to do what...,DougJones,"Birmingham, Alabama",Sun Dec 10 02:58:43 +0000 2017
2,['GOTV4Doug'],Thank you @StP_BrokenBones for your endorsemen...,DougJones,"Birmingham, Alabama",Sun Dec 10 03:56:23 +0000 2017
3,"['VoteDec12', 'GOTV4Doug']",Thank you to great Alabamians Mike Cooley and ...,DougJones,"Birmingham, Alabama",Mon Dec 11 00:47:50 +0000 2017
4,"['GOTV4Doug', 'CharlesBarkley', 'RightSideOfHi...",Backstage with Louise and me at the final #GOT...,DougJones,"Birmingham, Alabama",Tue Dec 12 01:06:29 +0000 2017
5,"['CharlesBarkley', 'GOTV4Doug']","“You know if somebody told you guys, put this ...",DougJones,"Birmingham, Alabama",Tue Dec 12 01:11:37 +0000 2017
6,"['CharlesBarkley', 'GOTV4Doug']","“I am begging you to call, text your friends. ...",DougJones,"Birmingham, Alabama",Tue Dec 12 01:13:03 +0000 2017
7,['CharlesBarkley'],“We are here with the new mayor of Birmingham ...,DougJones,"Birmingham, Alabama",Tue Dec 12 01:15:47 +0000 2017
8,['GOTV4Doug'],“I know what it’s like to climb an uphill batt...,DougJones,"Birmingham, Alabama",Tue Dec 12 01:18:55 +0000 2017
9,['GOTV4Doug'],“What are the odds of Doug Jones winning? It’s...,DougJones,"Birmingham, Alabama",Tue Dec 12 01:21:03 +0000 2017


In [3]:
import spacy
nlp = spacy.load('en_core_web_md')

In [4]:
len(tweets)

449334

In [5]:
import random

docs = []
N = 10000
rand_tweets = random.sample(range(len(tweets)), k=N)
for i, tw in enumerate(rand_tweets):
    if i % 1000 == 0:
        print('{}%'.format(100./N*i), end=' ')
    docs.append(nlp(tweets.iloc[i]['text']))


0.0% 10.0% 20.0% 30.0% 40.0% 50.0% 60.0% 70.0% 80.0% 90.0% 

In [6]:
stop_words = ['senator', '\'s', '-PRON-', '’', '’s', 'amp', 'i' ]

In [7]:
# Clean tweets here
cleaned_tweets = []
cleaned_tweets_text = []
for doc in docs:
    curr_tweet = []
    for w in doc:
        if w.text == '\n' or w.is_stop or w.is_punct or w.like_num or w.like_url: continue
        if w.is_space or w.is_oov or not w.is_ascii or w.text.lower() in stop_words: continue
        curr_tweet.append(w.text.lower()) # add lemmatized version of the word
    if curr_tweet:
        cleaned_tweets.append(curr_tweet)
        cleaned_tweets_text.append(' '.join(curr_tweet))

In [8]:
# here we should get only cleaned tweets
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

bigram = gensim.models.Phrases(cleaned_tweets)

In [9]:
cleaned_tweets = [bigram[t] for t in cleaned_tweets]

In [10]:
# create dictionary and corpus
dictionary = Dictionary(cleaned_tweets)
corpus = [dictionary.doc2bow(clean_tween) for clean_tween in cleaned_tweets]

In [11]:
#### LSI MODEL basically SVD / Principal component analysis

lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)

In [12]:
lsimodel.show_topics(num_topics=5, formatted=False)

[(0,
  [('the', 0.6471219050844754),
   ('to', 0.529175100991817),
   ('and', 0.2400431882694098),
   ('of', 0.2126225235391503),
   ('in', 0.2032437664554701),
   ('a', 0.17376942701348497),
   ('for', 0.16633082408128944),
   ('is', 0.11836302250065929),
   ('on', 0.10046733230539089),
   ('that', 0.08091195296282377)]),
 (1,
  [('to', -0.7493546593892311),
   ('the', 0.6178807859387433),
   ('of', 0.16866488632267482),
   ('a', -0.07793549194823207),
   ('our', -0.043342470434441484),
   ('in', 0.041622162528355204),
   ('for', -0.041370743335681096),
   ('you', -0.031964652481370516),
   ('have', -0.02928944732875948),
   ('help', -0.0228652816653375)]),
 (2,
  [('and', 0.7431733392406403),
   ('a', 0.33305879016823026),
   ('to', -0.3255259094360341),
   ('the', -0.31077402610767474),
   ('for', 0.24658816824877958),
   ('in', 0.1264656997195878),
   ('is', 0.10728076025014195),
   ('our', 0.0729443886437602),
   ('that', 0.06670031400947306),
   ('of', 0.06370036087036034)]),
 (3

In [20]:
# HDP - Hierarchical Dirichlet process 
hdpmodel = HdpModel(corpus=corpus, id2word=dictionary)
hdpmodel.show_topics(formatted=False)

[(0,
  [('the', 0.004226814406253814),
   ('today', 0.003937463900809259),
   ('rt', 0.002754152039073402),
   ('this', 0.002373111728655636),
   ('bill', 0.002262189339532362),
   ('help', 0.002076994598081894),
   ('we', 0.002046211271738316),
   ('new', 0.0020262320027482563),
   ('day', 0.00193642907394789),
   ('support', 0.0019286067491862476),
   ('it', 0.0018621060415295611),
   ('senate', 0.0018349558919432665),
   ('space', 0.0017930519572517114),
   ('health_care', 0.0016874971194595731),
   ('work', 0.0016753731168917666),
   ('country', 0.0015995865173036973),
   ('women', 0.0015662260077663759),
   ('as', 0.0015507841358255445),
   ('need', 0.0015191663940060818),
   ('year', 0.0014905851648723937)]),
 (1,
  [('today', 0.0028210764733901045),
   ('rt', 0.0020945927157966595),
   ('the', 0.0020921991137402567),
   ('continue', 0.0014845488931130183),
   ('important', 0.0014738739434267728),
   ('we', 0.0014392854445360074),
   ('administration', 0.0013041050230621586),
   

In [21]:
# LDA 
ldamodel = LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
ldamodel.show_topics(formatted=False)

[(4,
  [('today', 0.017967142),
   ('w/', 0.015932305),
   ('rt', 0.012052194),
   ('trade', 0.011068836),
   ('farm_bill', 0.007932832),
   ('help', 0.007908866),
   ('celebrate', 0.0070627225),
   ('included', 0.006975399),
   ('experience', 0.0068505537),
   ('need', 0.0068182927)]),
 (13,
  [('rt', 0.03282023),
   ('week', 0.016110167),
   ('texas', 0.010189907),
   ('president', 0.009859935),
   ('the', 0.009762983),
   ('violence', 0.009511479),
   ('senate', 0.007956992),
   ('help', 0.00791576),
   ('alpolitics', 0.007758635),
   ('means', 0.007370275)]),
 (10,
  [('today', 0.01569545),
   ('farmers', 0.013049958),
   ('kentucky', 0.011675654),
   ('help', 0.011463838),
   ('thanks', 0.010846344),
   ('community', 0.009514981),
   ('alabama', 0.008557066),
   ('families', 0.008039369),
   ('support', 0.007855759),
   ('state', 0.007749385)]),
 (7,
  [('the', 0.015988152),
   ('senate_passed', 0.011579881),
   ('like', 0.008752671),
   ('national', 0.008040916),
   ('this', 0.00

In [55]:
# use lda and nmf in sklearn

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [56]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(cleaned_tweets_text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(cleaned_tweets_text)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
help work need new support country make american families communities
Topic 1:
rt thanks sen https tonight conference morning agree meeting says
Topic 2:
health care insurance coverage affordable plan sign quality americans lose
Topic 3:
judge scotus kavanaugh gorsuch court supreme hearing nomination nominee brett
Topic 4:
today day years honor ago good lost lives celebrate hearing
Topic 5:
senate passed floor bipartisan vote house legislation committee week support
Topic 6:
great day news visit state thanks morning work meet meeting
Topic 7:
thank dreamers standing join solidarity time pass end democrats past
Topic 8:
watch live tune speaking floor hearing morning event joining facebook
Topic 9:
internet netneutrality open free vote consumers repeal rules fcc fight
Topic 0:
small businesses netneutrality open workers internet business rt american tune
Topic 1:
jobs proud help today local tonight opportunity ohio congrats economic
Topic 2:
great thanks day work rights farmers 

In [57]:
tfidf_feature_names[2], tf_feature_names[2]


('abuse', 'abuse')

In [58]:
for line in tf:
    print(line)
    break

  (0, 705)	1
  (0, 473)	1
  (0, 248)	1


In [59]:
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

In [68]:
##### pymc3
import pymc3 as pm
import numpy as np

class NaiveLDA:
    def __init__(self, K, V):
        self.K = K
        self.V = V
        
    def init(self):
        # hyper for global distr. of topics
        alpha = np.ones(K)

        # hyper for distribution of words in topic
        beta = np.ones(V)
        
        with pm.Model() as model:
            # distribution over topics
            theta = pm.Dirichlet('theta', a=alpha)
            
            # distribution over words in topics
            phi = pm.Dirichlet('phi', a=beta, shape=(K, V))
            
            # topic of document
            z = pm.Categorical('z', p=theta, shape=(D,))
            
            # per document
            for i in range(D):
                # words in document
                
                w = pm.Categorical('w_{}'.format(i), p=phi[z[i]], observed=X[i])
            
V = len(nlp.vocab)
K = 10
naive_lda = NaiveLDA(K, V)
naive_lda.init()


TypeError: __init__() takes 2 positional arguments but 3 were given