In [33]:
#####################
#     LOAD DATA     #
#####################

import json_lines
import csv

def process_tweet(tweet):  
    d = {}
    d['hashtags'] = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]
    d['text'] = tweet['full_text']
    d['user'] = tweet['user']['screen_name']
    d['user_loc'] = tweet['user']['location']
    d['created_at'] = tweet['created_at']
    return d

if False:
    with open('congress_dataset/senators-1.jsonl', 'rb') as f:
        with open(r'senators-1-tweets.csv', 'a') as file:
            writer = csv.writer(file)
            for item in json_lines.reader(f):
                # Only collect tweets in English
                if item['lang'] == 'en' and len(item['entities']['hashtags']) > 0:
                    tweet_data = process_tweet(item)
                    writer.writerow(list(tweet_data.values()))

                    
import pandas as pd
tweets = pd.read_csv("senators-1-tweets.csv", header=None, names=['hashtags', 'text', 'user', 'user_location', 'created_at'])  
print('num tweets: {}'.format(len(tweets)))


import spacy
nlp = spacy.load('en_core_web_md')

def tokenize(text):
    lda_tokens = []
    tokens = nlp(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens


import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens
    

import random

docs = []
hashtags = []
N = 2000
rand_tweets = list(range(N)) #random.sample(range(len(tweets)), k=N)
for i, tw in enumerate(rand_tweets):
    if i % 1000 == 0:
        print('{}%'.format(100./N*i), end=' ')
    text = tweets.iloc[i]['text']
    tokens = prepare_text_for_lda(text)
    if random.random() > .9999:
        print(tokens)
    taggs = tweets.iloc[i]['hashtags'].replace('[', '').replace(']', '').replace('\'', '').split(",")
    hashtags.append([t.strip() for t in taggs])
    docs.append(tokens)

num tweets: 449334


[nltk_data] Downloading package wordnet to /home/simi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/simi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.0% 50.0% ['forward', 'hearing', 'tillerson', 'mattis', 'potus‚Äô', 'afghanistan', 'strategy', 'america', 'safe']


In [34]:
# here we should get only cleaned tweets
import gensim
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary

import torch
import numpy as np

N_SMALL = N

cleaned_tweetz = docs[:N_SMALL]

bigram = gensim.models.Phrases(cleaned_tweetz)
cleaned_tweetz = [bigram[t] for t in cleaned_tweetz]

# create dictionary and corpus
dictionary = Dictionary(cleaned_tweetz)

# corpus = (token_id, count_in_curr_doc) , sparse representation
corpus = [dictionary.doc2bow(clean_tween) for clean_tween in cleaned_tweetz]
cleaned_tweetz_id = [dictionary.doc2idx(document=tw) for tw in cleaned_tweetz]

# Cut all tweets after 5 words and remove tweets below 5 words
cleaned_tweetz_id = [tw for tw in cleaned_tweetz_id if len(tw) >= 5]
cleaned_tweetz_id = [tw[:5] for tw in cleaned_tweetz_id]
cleaned_tweetz_id = torch.Tensor(np.array([np.array(tw, dtype=np.int32) for tw in cleaned_tweetz_id])).long()
cleaned_tweetz_id = cleaned_tweetz_id.transpose(1,0)


In [35]:
tag_dictionary = Dictionary(hashtags)
tags_id = [tag_dictionary.doc2idx(tag) for tag in hashtags]


In [38]:

import torch
from torch import nn
from torch.distributions import constraints

import pyro
import pyro.distributions as dist
from pyro.infer import SVI, JitTraceEnum_ELBO, TraceEnum_ELBO
from pyro.optim import Adam

import functools
from operator import itemgetter

K = 10
V = len(dictionary)
V_per_doc = 5
D = cleaned_tweetz_id.shape[1]
M = len(tag_dictionary)
l_sizes = '100-100'

print(f'Number of tags: {M} | Number of Docs: {D}')

# This is a fully generative model of a batch of documents.
# data is a [num_words_per_doc, num_documents] shaped array of word ids
# (specifically it is not a histogram). We assume in this simple example
# that all documents have the same number of words.
def model(data=None, tags=None, batch_size=None):
    # Globals.
    with pyro.plate("topics", K):
        topic_weights = pyro.sample("topic_weights", dist.Gamma(1. / K, 1.))
        topic_words = pyro.sample("topic_words", dist.Dirichlet(torch.ones(V) / V))
        topic_tags_distr = pyro.sample("topic_tags", dist.Dirichlet(torch.ones(M) / M))

    # Locals.
    with pyro.plate("documents", D) as ind:
        if data is not None:
            with pyro.util.ignore_jit_warnings():
                assert data.shape == (V_per_doc, D)
            data = data[:, ind]
            tags = torch.tensor([tags[int(id)][0] for id in ind]).float()

        doc_topics = pyro.sample("doc_topics", dist.Dirichlet(topic_weights))
        with pyro.plate("words", V_per_doc):
            # The word_topics variable is marginalized out during inference,
            # achieved by specifying infer={"enumerate": "parallel"} and using
            # TraceEnum_ELBO for inference. Thus we can ignore this variable in
            # the guide.
            word_topics = pyro.sample("word_topics", dist.Categorical(doc_topics),
                                      infer={"enumerate": "parallel"})
            data = pyro.sample("doc_words", dist.Categorical(topic_words[word_topics]),
                               obs=data)

        # sample 1 topic
        tag_topic = pyro.sample("tag_topic", dist.Categorical(doc_topics), infer={"enumerate": "parallel"})
        if tags is not None:
            tag = pyro.sample("tag", dist.Categorical(topic_tags_distr[tag_topic]), obs=tags)

    return topic_weights, topic_words, data, tag


# We will use amortized inference of the local topic variables, achieved by a
# multi-layer perceptron. We'll wrap the guide in an nn.Module.
def make_predictor():
    layer_sizes = ([V] +
                   [int(s) for s in l_sizes.split('-')] +
                   [K])
    print('Creating MLP with sizes {}'.format(layer_sizes))
    layers = []
    for in_size, out_size in zip(layer_sizes, layer_sizes[1:]):
        layer = nn.Linear(in_size, out_size)
        layer.weight.data.normal_(0, 0.001)
        layer.bias.data.normal_(0, 0.001)
        layers.append(layer)
        layers.append(nn.Sigmoid())
    layers.append(nn.Softmax(dim=-1))
    return nn.Sequential(*layers)


def parametrized_guide(predictor, data, tags, batch_size=None):
    # Use a conjugate guide for global variables.
    topic_weights_posterior = pyro.param(
        "topic_weights_posterior",
        lambda: torch.ones(K),
        constraint=constraints.positive)
    topic_words_posterior = pyro.param(
        "topic_words_posterior",
        lambda: torch.ones(K, V),
        constraint=constraints.greater_than(0.5))
    topic_tags_posterior = pyro.param(
        "topic_tags_posterior",
        lambda: torch.ones(K, M),
        constraint=constraints.greater_than(0.5))
    with pyro.plate("topics", K):
        pyro.sample("topic_weights", dist.Gamma(topic_weights_posterior, 1.))
        pyro.sample("topic_words", dist.Dirichlet(topic_words_posterior))
        pyro.sample("topic_tags", dist.Dirichlet(topic_tags_posterior))

    # Use an amortized guide for local variables.
    pyro.module("predictor", predictor)
    with pyro.plate("documents", D, batch_size) as ind:
        # The neural network will operate on histograms rather than word
        # index vectors, so we'll convert the raw data to a histogram.
        if torch._C._get_tracing_state():
            counts = torch.eye(1024)[data[:, ind]].sum(0).t()
        else:
            counts = torch.zeros(V, ind.size(0))
            counts.scatter_add_(0, data[:, ind], torch.tensor(1.).expand(counts.shape))
        doc_topics = predictor(counts.transpose(0, 1))
        pyro.sample("doc_topics", dist.Delta(doc_topics, event_dim=1))


pyro.set_rng_seed(0)
pyro.clear_param_store()
# pyro.enable_validation(True)

# We can generate synthetic data directly by calling the model.
#true_topic_weights, true_topic_words, data = model()

# We'll train using SVI.
predictor = make_predictor()
guide = functools.partial(parametrized_guide, predictor)
Elbo = TraceEnum_ELBO  # JitTraceEnum_ELBO if args.jit else TraceEnum_ELBO
elbo = Elbo(max_plate_nesting=2)
optim = Adam({'lr': 1e-2})
svi = SVI(model, guide, optim, elbo)
print('Step\tLoss')
for step in range(5000):
    loss = svi.step(cleaned_tweetz_id, tags_id, batch_size=64)
    if step % 10 == 0:
        print('{: >5d}\t{}'.format(step, loss))
loss = elbo.loss(model, guide, data)
print('final loss = {}'.format(loss))

Number of tags: 725 | Number of Docs: 1899
Creating MLP with sizes [4227, 100, 100, 10]
Step	Loss
    0	401365.0
   10	403034.0
   20	403725.75
   30	394284.28125
   40	402631.875
   50	390290.78125
   60	388546.96875
   70	388825.5
   80	393618.0625
   90	389916.5625
  100	382247.6875
  110	381889.625
  120	378190.625
  130	382897.0625
  140	375515.375
  150	379963.90625
  160	377358.78125
  170	375438.0625
  180	371513.0
  190	379264.6875
  200	379262.25
  210	374889.125
  220	379864.3125
  230	366250.90625
  240	368189.0625
  250	368085.375
  260	373838.78125
  270	369557.40625
  280	368699.78125
  290	363842.375
  300	360918.375
  310	365012.28125
  320	363589.0625
  330	358202.1875
  340	363491.21875
  350	361633.75
  360	372190.96875
  370	364865.96875
  380	361387.65625
  390	372541.09375
  400	357418.25
  410	358022.9375
  420	350235.625
  430	354729.5625
  440	359668.5625
  450	354494.5625
  460	355842.0
  470	355696.28125
  480	352970.3125
  490	353021.46875
  500	358836.3125

KeyboardInterrupt: 

In [39]:
params = pyro.get_param_store()

words_per_topic_distr = params['topic_words_posterior']

for t in range(K):
    print("---- topic {} -----".format(t))
    top5_words = (torch.argsort(words_per_topic_distr[t])[-10:]).cpu().numpy()
    top5_words = list(map(lambda x: dictionary[x], reversed(top5_words)))
    print(top5_words)

---- topic 0 -----
['today', 'thank', 'netneutrality', 'SCREEN_NAME', 'repeal_netneutrality', 'congress', 'goptaxscam', 'family', 'would', 'watch']
---- topic 1 -----
['SCREEN_NAME', 'internet', 'american', 'thanks', 'today', 'republican', 'great', 'dreamer', 'family', 'netneutrality']
---- topic 2 -----
['SCREEN_NAME', 'dreamer_democrat', 'family', 'friend', 'community', 'visit', 'senator', 'join', 'young', 'years']
---- topic 3 -----
['people', 'SCREEN_NAME', 'taxreform', 'dreamer', 'republican', 'speaking', 'netneutrality', 'great', 'congress', 'today']
---- topic 4 -----
['thank', 'support', 'internet', 'today', 'SCREEN_NAME', 'dreamer', 'taxreform', 'republican', 'congress', 'community']
---- topic 5 -----
['SCREEN_NAME', 'visit', 'sign', 'bipartisan', 'dreamactnow_family', 'hatch', 'honor', 'senator', 'today', 'national']
---- topic 6 -----
['today', 'goptaxscam', 'congress', 'dreamer', 'netneutrality', 'fight', 'SCREEN_NAME', 'senate', 'family', 'internet']
---- topic 7 -----
['

In [40]:
_, _, _, tags = model(cleaned_tweetz_id, tags_id)

In [41]:
cleaned_tweetz[299]

['calling',
 'SCREEN_NAME',
 'chairman',
 'SCREEN_NAME',
 'abandon',
 'reckless',
 'netneutrality',
 'future',
 'internet',
 'hang',
 'balance',
 'savenetneutrality']

In [42]:
for i in range(400, 450):
    print("{} tweet: {} \n| tag: {}\n".format(i, tweets.iloc[i].text, tag_dictionary[int(tags[i])]))
    print('real tags: {}\n\n'.format(tweets.iloc[i].hashtags))

400 tweet: Kyle Duncan defended a law that would have made abortion nearly impossible to access in Texas. The Supreme Court disagreed and overturned the law. He‚Äôs #BadForWomen #CourtsMatter 
| tag: BadForWomen

real tags: ['BadForWomen', 'CourtsMatter']


401 tweet: Don Willett said he ‚Äúresisted‚Äù the idea that glass ceilings, pay equity, sexual discrimination and harassment were serious problems for women.  #BadforWomen #CourtsMatter 
| tag: BadforWomen

real tags: ['BadforWomen', 'CourtsMatter']


402 tweet: Steven Grasz defended state efforts to deny Medicaid coverage of abortion to women who are raped, which is required by federal law. #BadForWomen #CourtsMatter 
| tag: BadForWomen

real tags: ['BadForWomen', 'CourtsMatter']


403 tweet: Our courts have a critical role in upholding women‚Äôs constitutional and civil rights, from health care to workplace discrimination. We must speak out against judges who are #BadforWomen. #CourtsMatter 
| tag: BadforWomen

real tags: ['Badfor