In [9]:
#####################
#     LOAD DATA     #
#####################

import json_lines
import csv

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d

if False:
    with open('congress_dataset/senators-1.jsonl', 'rb') as f:
        with open(r'senators-1-tweets.csv', 'a') as file:
            writer = csv.writer(file)
            for item in json_lines.reader(f):
                # Only collect tweets in English
                if item['lang'] == 'en' and len(item['entities']['hashtags']) > 0:
                    tweet_data = process_tweet(item)
                    writer.writerow(list(tweet_data.values()))

                    
import pandas as pd
tweets = pd.read_csv("senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))


import spacy
nlp = spacy.load('en_core_web_md')

def tokenize(text):
    lda_tokens = []
    tokens = nlp(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens
    

import random

docs = []
hashtags = []
N = 2000
rand_tweets = list(range(N)) #random.sample(range(len(tweets)), k=N)
for i, tw in enumerate(rand_tweets):
    if i % 1000 == 0:
        print('{}%'.format(100./N*i), end=' ')
    text = tweets.iloc[i]['text']
    tokens = prepare_text_for_lda(text)
    if random.random() > .9999:
        print(tokens)
    taggs = tweets.iloc[i]['hashtags'].replace('[', '').replace(']', '').replace('\'', '').split(",")
    hashtags.append([t.strip() for t in taggs])
    docs.append(tokens)

num tweets: 449334


[nltk_data] Downloading package wordnet to /home/simi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.0% 50.0% ['forward', 'hearing', 'tillerson', 'mattis', 'potus’', 'afghanistan', 'strategy', 'america', 'safe']


In [10]:
docs[0]

['jones', 'greeting', 'crowd', 'SCREEN_NAME', 'rally', 'birmingham']

In [2]:
# here we should get only cleaned tweets
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

import torch
import numpy as np

N_SMALL = N

cleaned_tweetz = docs[:N_SMALL]

bigram = gensim.models.Phrases(cleaned_tweetz)
cleaned_tweetz = [bigram[t] for t in cleaned_tweetz]

# create dictionary and corpus
dictionary = Dictionary(cleaned_tweetz)

# corpus = (token_id, count_in_curr_doc) , sparse representation
corpus = [dictionary.doc2bow(clean_tween) for clean_tween in cleaned_tweetz]
cleaned_tweetz_id = [dictionary.doc2idx(document=tw) for tw in cleaned_tweetz]

# Cut all tweets after 5 words and remove tweets below 5 words
cleaned_tweetz_id = [tw for tw in cleaned_tweetz_id if len(tw) >= 5]
cleaned_tweetz_id = [tw[:5] for tw in cleaned_tweetz_id]
cleaned_tweetz_id = torch.Tensor(np.array([np.array(tw, dtype=np.int32) for tw in cleaned_tweetz_id])).long()
cleaned_tweetz_id = cleaned_tweetz_id.transpose(1,0)


[['jones', 'greeting', 'crowd', 'SCREEN_NAME', 'rally_birmingham'],
 ['suppose',
  'things',
  'better',
  'behind',
  'generational',
  'responsibility',
  'chance',
  'right',
  'direction',
  'jones',
  'SCREEN_NAME',
  'SCREEN_NAME',
  'rally_birmingham'],
 ['thank',
  'SCREEN_NAME',
  'endorsement',
  'amaze',
  'night',
  'december',
  'gotv4doug'],
 ['thank',
  'great',
  'alabamian',
  'cooley',
  'SCREEN_NAME',
  'support',
  'endorsement',
  'votedec12',
  'gotv4doug'],
 ['backstage',
  'louise',
  'final_gotv4doug',
  'rally',
  'SCREEN_NAME',
  'SCREEN_NAME',
  'SCREEN_NAME',
  'charlesbarkley',
  'rightsideofhistory'],
 ['somebody',
  'election',
  'movie',
  'script',
  'would',
  'throw',
  'trash',
  'would',
  'possible',
  'leading',
  'polls',
  'charlesbarkley',
  'final_gotv4doug',
  'rally'],
 ['begging',
  'friend',
  'point',
  'charlesbarkley',
  'gotv4doug',
  'final',
  'rally'],
 ['mayor',
  'birmingham',
  'SCREEN_NAME',
  'senator',
  'alabama-',
  'jones'

In [3]:
tag_dictionary = Dictionary(hashtags)
tags_id = [tag_dictionary.doc2idx(tag) for tag in hashtags]


In [4]:

import torch
from torch import nn
from torch.distributions import constraints

import pyro
import pyro.distributions as dist
from pyro.infer import SVI, JitTraceEnum_ELBO, TraceEnum_ELBO
from pyro.optim import Adam

import functools
from operator import itemgetter

K = 10
V = len(dictionary)
V_per_doc = 5
D = cleaned_tweetz_id.shape[1]
M = len(tag_dictionary)
l_sizes = '500-100-50'

print(f'Number of tags: {M} | Number of Docs: {D}')

# This is a fully generative model of a batch of documents.
# data is a [num_words_per_doc, num_documents] shaped array of word ids
# (specifically it is not a histogram). We assume in this simple example
# that all documents have the same number of words.
def model(data=None, tags=None, batch_size=None):
    # Globals.
    with pyro.plate("topics", K):
        topic_weights = pyro.sample("topic_weights", dist.Gamma(1. / K, 1.))
        topic_words = pyro.sample("topic_words", dist.Dirichlet(torch.ones(V) / V))
        topic_tags_distr = pyro.sample("topic_tags", dist.Dirichlet(torch.ones(M) / M))

    # Locals.
    with pyro.plate("documents", D) as ind:
        if data is not None:
            with pyro.util.ignore_jit_warnings():
                assert data.shape == (V_per_doc, D)
            data = data[:, ind]
            tags = torch.tensor([tags[int(id)][0] for id in ind]).float()

        doc_topics = pyro.sample("doc_topics", dist.Dirichlet(topic_weights))
        with pyro.plate("words", V_per_doc):
            # The word_topics variable is marginalized out during inference,
            # achieved by specifying infer={"enumerate": "parallel"} and using
            # TraceEnum_ELBO for inference. Thus we can ignore this variable in
            # the guide.
            word_topics = pyro.sample("word_topics", dist.Categorical(doc_topics),
                                      infer={"enumerate": "parallel"})
            data = pyro.sample("doc_words", dist.Categorical(topic_words[word_topics]),
                               obs=data)

        # sample 1 topic
        tag_topic = pyro.sample("tag_topic", dist.Categorical(doc_topics), infer={"enumerate": "parallel"})
        if tags is not None:
            tag = pyro.sample("tag", dist.Categorical(topic_tags_distr[tag_topic]), obs=tags)

    return topic_weights, topic_words, data, tag


# We will use amortized inference of the local topic variables, achieved by a
# multi-layer perceptron. We'll wrap the guide in an nn.Module.
def make_predictor():
    layer_sizes = ([V] +
                   [int(s) for s in l_sizes.split('-')] +
                   [K])
    print('Creating MLP with sizes {}'.format(layer_sizes))
    layers = []
    for in_size, out_size in zip(layer_sizes, layer_sizes[1:]):
        layer = nn.Linear(in_size, out_size)
        layer.weight.data.normal_(0, 0.001)
        layer.bias.data.normal_(0, 0.001)
        layers.append(layer)
        layers.append(nn.Sigmoid())
    layers.append(nn.Softmax(dim=-1))
    return nn.Sequential(*layers)


def parametrized_guide(predictor, data, tags, batch_size=None):
    # Use a conjugate guide for global variables.
    topic_weights_posterior = pyro.param("topic_weights_posterior",
        lambda: torch.ones(K),
        constraint=constraints.positive)
    topic_words_posterior = pyro.param("topic_words_posterior",
        lambda: torch.ones(K, V),
        constraint=constraints.greater_than(0.5))
    topic_tags_posterior = pyro.param("topic_tags_posterior",
        lambda: torch.ones(K, M),
        constraint=constraints.greater_than(0.5))
    with pyro.plate("topics", K):
        pyro.sample("topic_weights", dist.Gamma(topic_weights_posterior, 1.))
        pyro.sample("topic_words", dist.Dirichlet(topic_words_posterior))
        pyro.sample("topic_tags", dist.Dirichlet(topic_tags_posterior))

    # Use an amortized guide for local variables.
    pyro.module("predictor", predictor)
    with pyro.plate("documents", D, batch_size) as ind:
        # The neural network will operate on histograms rather than word
        # index vectors, so we'll convert the raw data to a histogram.
        if torch._C._get_tracing_state():
            counts = torch.eye(1024)[data[:, ind]].sum(0).t()
        else:
            counts = torch.zeros(V, ind.size(0))
            counts.scatter_add_(0, data[:, ind], torch.tensor(1.).expand(counts.shape))
        doc_topics = predictor(counts.transpose(0, 1))
        pyro.sample("doc_topics", dist.Delta(doc_topics, event_dim=1))


pyro.set_rng_seed(0)
#pyro.clear_param_store()
# pyro.enable_validation(True)

# We can generate synthetic data directly by calling the model.
#true_topic_weights, true_topic_words, data = model()

# We'll train using SVI.
predictor = make_predictor()
guide = functools.partial(parametrized_guide, predictor)
Elbo = TraceEnum_ELBO  # JitTraceEnum_ELBO if args.jit else TraceEnum_ELBO
elbo = Elbo(max_plate_nesting=2)
optim = Adam({'lr': 1e-2})
svi = SVI(model, guide, optim, elbo)
print('Step\tLoss')
for step in range(5000):
    loss = svi.step(cleaned_tweetz_id, tags_id, batch_size=64)
    if step % 10 == 0:
        print('{: >5d}\t{}'.format(step, loss))
loss = elbo.loss(model, guide, data)
print('final loss = {}'.format(loss))

Number of tags: 725 | Number of Docs: 1899
Creating MLP with sizes [4227, 500, 100, 50, 10]
Step	Loss
    0	407291.25
   10	411057.75
   20	397918.375
   30	397112.875
   40	398101.46875
   50	381775.40625
   60	393088.1875
   70	387221.0625
   80	388013.78125
   90	380743.5625
  100	388348.90625
  110	383393.71875
  120	383877.84375
  130	378455.625
  140	398303.75
  150	379131.28125
  160	377269.15625
  170	372439.15625
  180	372314.9375
  190	371127.4375
  200	377577.875
  210	372556.4375
  220	369484.375
  230	367467.8125
  240	369047.125
  250	362627.375
  260	374055.25
  270	368238.9375
  280	366829.3125
  290	365166.8125
  300	367542.71875
  310	363581.8125
  320	368034.9375
  330	364144.21875
  340	356425.8125
  350	354423.25
  360	358214.21875
  370	359224.0625
  380	353577.46875
  390	357225.0
  400	354199.03125
  410	361909.59375
  420	350048.125
  430	350637.09375
  440	349262.5
  450	349428.6875
  460	355609.40625
  470	356165.28125
  480	346304.65625
  490	354100.0625
  5

 4520	312617.90625
 4530	322343.59375
 4540	319623.78125
 4550	315740.625
 4560	317079.4375
 4570	316941.5
 4580	323578.0625
 4590	318920.75
 4600	316614.28125
 4610	316377.9375
 4620	318823.78125
 4630	313120.15625
 4640	314366.90625
 4650	317523.0625
 4660	314053.25
 4670	309431.9375
 4680	311364.5
 4690	312933.40625
 4700	312334.09375
 4710	313517.125
 4720	312560.71875
 4730	314659.78125
 4740	321344.59375
 4750	317261.46875
 4760	314020.4375
 4770	318030.90625
 4780	322061.9375
 4790	318437.09375
 4800	314932.65625
 4810	311752.40625
 4820	317687.375
 4830	311916.125
 4840	313483.71875
 4850	317087.375
 4860	318792.625
 4870	316089.09375
 4880	318611.90625
 4890	315227.90625
 4900	329111.03125
 4910	318097.0625
 4920	322212.28125
 4930	313433.40625
 4940	320513.34375
 4950	316412.5
 4960	315900.75
 4970	313272.25
 4980	321697.15625
 4990	335196.25


NameError: name 'data' is not defined

'SCREEN_NAME'

In [6]:
params = pyro.get_param_store()

words_per_topic_distr = params['topic_words_posterior']
tags_in_topic_distr = params['topic_tags_posterior']

topic_words =  dist.Dirichlet(words_per_topic_distr).sample()
tags_topic = dist.Dirichlet(tags_in_topic_distr).sample()

for t in range(K):
    print("---- topic {} -----".format(t))
    print("  -- top words ---")
    top5_words = (torch.argsort(topic_words[t])[-10:]).cpu().numpy()
    top5_words = list(map(lambda x: dictionary[x], reversed(
        top5_words)))
    print(top5_words)
    
    print("  -- top tags ---")
    topk_tags = (torch.argsort(tags_topic[t])[-3:]).cpu().numpy()
    topk_tags = list(map(lambda x: tag_dictionary[x], reversed(topk_tags)))
    print(topk_tags)
    
    

---- topic 0 -----
  -- top words ---
['SCREEN_NAME', 'dreamer', 'family', 'fight', 'another', 'people', 'proud', 'years', 'dreamer_democrat', 'community']
  -- top tags ---
['Dreamers', 'CHIP', 'GOPTaxScam']
---- topic 1 -----
  -- top words ---
['visit', 'senator', 'country', 'join', 'business', 'americandreamer', 'alabama', 'icymi', 'morning', 'congrats']
  -- top tags ---
['DreamActNow', 'DREAMers', 'TahoeSummit']
---- topic 2 -----
  -- top words ---
['continue', 'joining', 'call', 'honor', 'health_insurance', 'friend', 'opportunity', 'tonight', 'decision', 'discus']
  -- top tags ---
['MonumentsForAll', 'OMDP4NM', 'NAFTA']
---- topic 3 -----
  -- top words ---
['great', 'thank', 'netneutrality', 'SCREEN_NAME', '355ships', 'cardinlugar', 'harley', 'older', 'lose', 'teacher']
  -- top tags ---
['GOPTaxScam', 'DRC', 'TheRightSideOfHistory']
---- topic 4 -----
  -- top words ---
['senate', 'health', 'solar_eclipse', 'implement', 'SCREEN_NAME', 'judicial_nominee', 'southwest', 'markey

In [None]:
tags_topic[0].sort()[0][-3:]


In [None]:
_, _, _, tags = model(cleaned_tweetz_id, tags_id)

In [None]:
cleaned_tweetz[299]

In [None]:
for i in range(400, 450):
    print("{} tweet: {} \n| tag: {}\n".format(i, tweets.iloc[i].text, tag_dictionary[int(tags[i])]))
    print('real tags: {}\n\n'.format(tweets.iloc[i].hashtags))