In [1]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data


In [54]:
small_docs = documents[:1000]

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(small_docs)
tf_feature_names = tf_vectorizer.get_feature_names()

In [3]:
tf

<1000x7358 sparse matrix of type '<class 'numpy.int64'>'
	with 51316 stored elements in Compressed Sparse Row format>

In [4]:
tf[0]

<1x7358 sparse matrix of type '<class 'numpy.int64'>'
	with 48 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.decomposition import LatentDirichletAllocation


no_topics = 10
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)




In [6]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
ocean activity investigated tea cartridges dissolved radical sheep colleague tpa
Topic 1:
mm blinker bulb atf agenda ban waco lamp starter brightness
Topic 2:
win event expose british rr text flags columbia receiving mask
Topic 3:
key keys plane surrender abc effect geb pitt gordon session
Topic 4:
people just don god like think good know time did
Topic 5:
israel kk lebanese israeli zone soldiers captain attacks withdraw traded
Topic 6:
biggest chop offensive defensive tires hit disappointment kings nren nsfnet
Topic 7:
space section firearm military gm shall government weapon encryption person
Topic 8:
hole holes exist pointer electron edu metal graphics semiconductor stated
Topic 9:
edu like use know just com don new think time


In [68]:
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
import spacy
nlp = spacy.load('en_core_web_md')

small_docs = documents[:1000]


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = [t.lower_ for t in nlp(text)]
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens
    
    
small_docs = [prepare_text_for_lda(text) for text in small_docs]

bigram = gensim.models.Phrases(small_docs)
small_docs_bigram = [bigram[t] for t in small_docs]

# create dictionary and corpus
dictionary = Dictionary(small_docs_bigram)

# corpus = (token_id, count_in_curr_doc) , sparse representation
corpus = [dictionary.doc2bow(d) for d in small_docs_bigram]
small_docs_id = [dictionary.doc2idx(document=tw) for tw in small_docs_bigram]
small_docs_id = [d for d in small_docs_id if len(d) != 0]

[nltk_data] Downloading package wordnet to /home/simi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
import pyro
import pyro.distributions as p
from pyro import optim
from pyro import infer
import torch
from torch.distributions import constraints

D = 20
N = [len(d) for d in small_docs_id]
K = 5
V = max([max(d) for d in small_docs_id])
print(f'D: {D} | V: {V} | K: {K}')

#@pyro.poutine.broadcast
def model(data):
    # topic - word distribution
    phi = pyro.sample("phi", p.Dirichlet(torch.ones([K, V])).independent(1))
  
    for d in pyro.plate("documents", D):
        # document-topic distribution
        theta_d = pyro.sample(f"theta_{d}", p.Dirichlet(torch.ones([K])))
    
        with pyro.plate(f"words_{d}", N[d]):
            z = pyro.sample(f"z_{d}", p.Categorical(theta_d))
            pyro.sample(f"w_{d}", p.Categorical(phi[z]), obs=data[d])
            
#@pyro.poutine.broadcast
def guide(data):
    beta_q = pyro.param("beta_q", torch.ones([K, V]),constraint=constraints.positive)
    phi_q = pyro.sample("phi",p.Dirichlet(beta_q).independent(1))
  
    for d in pyro.plate("documents", D):
        alpha_q = pyro.param(f"alpha_q_{d}", torch.ones([K]),constraint=constraints.positive)
        q_theta_d = pyro.sample(f"theta_{d}", p.Dirichlet(alpha_q))
    
        with pyro.plate(f"words_{d}", N[d]):
            q_i = pyro.param(f"q_{d}", torch.randn([N[d], K]).exp(), constraint=constraints.simplex)
            pyro.sample(f"z_{d}", p.Categorical(q_i))

data = [torch.tensor(d).float() for d in small_docs_id]            
            
adam_params = {"lr": 0.01, "betas": (0.90, 0.999)}
optimizer = optim.Adam(adam_params)

# infer.config_enumerate(guide, 'parallel')
pyro.clear_param_store()
svi = infer.SVI(model, guide, optimizer, loss=infer.TraceEnum_ELBO(max_iarange_nesting=1))
for _ in range(3000):
    loss = svi.step(data)
    print(loss)
        

D: 20 | V: 14065 | K: 5
51283.5390625
50248.1171875
53486.796875
50227.203125
50262.421875
52337.49609375
50557.37109375
51631.77734375
54041.71875
51546.54296875
50774.15234375
49709.98046875
50452.75390625
52415.609375
49846.59765625
51495.2421875
49580.69140625
49355.63671875
52615.34765625
50220.3984375
49253.5
52066.640625
51281.6796875
51011.921875
50575.2578125
52674.13671875
50046.234375
50445.84765625
52150.33203125
50347.796875
50306.72265625
51165.25390625
50529.5703125
48964.56640625
49278.4453125
52605.9375
52989.98828125
49878.68359375
50004.43359375
49499.03125
48604.9140625
49791.7265625
49566.3125
49022.3515625
49600.484375
49437.6796875
49043.29296875
52324.6171875
49675.9921875
48750.4140625
49885.8125
50817.76953125
48639.88671875
49788.6015625
51142.64453125
49696.1484375
50139.02734375
48592.73046875
49223.46484375
49017.76171875
48591.515625
50052.1875
49721.0078125
53080.1328125
49243.0625
52490.1640625
50031.76953125
48496.41796875
49794.19140625
48581.21484375

46503.15625
47153.5078125
46804.84765625
46754.0390625
47140.56640625
46596.66015625
46823.828125
46534.59765625
46943.87890625
46567.30859375
46721.38671875
46649.58984375
46348.015625
47112.51953125
46565.390625
46755.015625
46947.3359375
46714.76953125
46757.05078125
46612.1953125
46910.3125
46536.21484375
46556.453125
48261.1328125
47061.54296875
46825.75
46967.6640625
46645.421875
47734.87109375
46624.48828125
46845.62109375
46755.59765625
46972.12109375
46947.21875
46772.1328125
46894.359375
46720.7265625
46399.546875
46726.109375
47082.0546875
46975.5
46539.46484375
46842.09765625
46679.88671875
47690.86328125
46625.44140625
46718.765625
47061.54296875
46827.06640625
47065.73828125
46717.09375
47284.58203125
46992.0859375
46683.16796875
46670.96875
46934.76171875
46961.40234375
47168.96484375
47375.26953125
47146.4453125
46936.53125
47357.9375
46653.96875
46533.98046875
46694.78515625
46891.75
46508.73046875
47219.3125
46808.4140625
46590.515625
47560.578125
47573.70703125
47131

KeyboardInterrupt: 

'T'

In [86]:
params = pyro.get_param_store()

beta_q = params["beta_q"]
topic_words_distribution = p.Dirichlet(beta_q).sample()


for t in range(K):
    print("---- topic {} -----".format(t))
    print("  -- top words ---")
    top5_words = (torch.argsort(topic_words_distribution[t])[-10:]).cpu().numpy()
    top5_words = list(map(lambda x: dictionary[x], reversed(
        top5_words)))
    print(top5_words)    

---- topic 0 -----
  -- top words ---
['tracer', '\n\t\t\t       ', 'incurring', 'rayshade', 'package', 'support', '\n        ', 'chemist', 'image', 'graphics']
---- topic 1 -----
  -- top words ---
['graphics', 'format', 'material', '\n        ', 'crash', 'object', 'amiga', 'stuff', 'volunteer', 'soveriegn']
---- topic 2 -----
  -- top words ---
['image', 'author', 'server', 'system', 'object', 'permit', 'explore', 'amiga', '\n        ', "qur'an"]
---- topic 3 -----
  -- top words ---
['package', 'available', 'image', 'buzzer', 'administration', 'file', 'server', 'graphics', 'blitter', 'include']
---- topic 4 -----
  -- top words ---
['graphics', 'object', 'catagories', 'dynamic', 'ontario', 'shall', 'absolutely', 'scoring', 'renderers', 'image']


In [7]:
###### Twitter Data
#####################
#     LOAD DATA     #
#####################

import json_lines
import csv

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d

if False:
    with open('congress_dataset/senators-1.jsonl', 'rb') as f:
        with open(r'senators-1-tweets.csv', 'a') as file:
            writer = csv.writer(file)
            for item in json_lines.reader(f):
                # Only collect tweets in English
                if item['lang'] == 'en' and len(item['entities']['hashtags']) > 0:
                    tweet_data = process_tweet(item)
                    writer.writerow(list(tweet_data.values()))

                    
import pandas as pd
tweets = pd.read_csv("senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))


import spacy
nlp = spacy.load('en_core_web_md')

def tokenize(text):
    lda_tokens = []
    tokens = nlp(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens
    

import random

docs = []
hashtags = []
N = 2000
rand_tweets = list(range(N)) #random.sample(range(len(tweets)), k=N)
for i, tw in enumerate(rand_tweets):
    if i % 1000 == 0:
        print('{}%'.format(100./N*i), end=' ')
    text = tweets.iloc[i]['text']
    tokens = prepare_text_for_lda(text)
    if random.random() > .9999:
        print(tokens)
    taggs = tweets.iloc[i]['hashtags'].replace('[', '').replace(']', '').replace('\'', '').split(",")
    hashtags.append([t.strip() for t in taggs])
    docs.append(tokens)

num tweets: 449334


[nltk_data] Downloading package wordnet to /home/simi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.0% 50.0% 

In [8]:
all_docs = [' '.join(d) for d in docs]

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(all_docs)
tf_feature_names = tf_vectorizer.get_feature_names()

In [10]:
from sklearn.decomposition import LatentDirichletAllocation


no_topics = 10
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=15, learning_method='online', learning_offset=50.,random_state=0).fit(tf)




In [11]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
today screen_name health getcovered visit county deadline tomorrow great healthcare
Topic 1:
family dreamer screen_name dreamactnow child country congress community goptaxscam americandreamer
Topic 2:
senate goptaxscam screen_name taxreform american business forward republican watch opportunity
Topic 3:
eclipse matter solar solareclipse montana mtpol angusontheroad pretty conservation lastbestoutdoorsfest
Topic 4:
celebrate happy world hanukkah wishing right courtsmatter light great friend
Topic 5:
rally accountable sesta jones birmingham final trafficker gotv4doug traffic training
Topic 6:
screen_name learn solareclipse2017 birthday alabama happy ussjohnsmccain rightsideofhistory visiting thought
Topic 7:
screen_name netneutrality internet savenetneutrality repeal protect economy today american fight
Topic 8:
dreamer screen_name thank standing family dreamactnow democrat earn solidarity 000
Topic 9:
screen_name today watch thanks support utpol great national discus honor
