In [42]:
import nltk
from sklearn.naive_bayes import MultinomialNB
import xlrd
import string
from nltk.corpus import stopwords
from random import shuffle
import csv
from gensim import corpora

import numpy as np
from scipy.spatial import distance

# nltk.download('stopwords')  

## Read-in list of tweets

In [2]:
Tweet_list = ["realDonaldTrump", "BarackObama"]

In [3]:
def read_in_tweets(handle): 
    contents_all = []
    with open("../" + handle + ".tsv") as f: 
        for l in f.readlines():
            l = l.split('\t')
            if l[0] != 'id':
                contents_all.append(l[-1])
    return(contents_all)

## Text cleaning

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
### Remove punctuation for unicode strings (code I got from online)
import regex as re

def remove_punctuation(text):
    return re.sub(u"\p{P}+", "", text)


def data_clean(line):
    l = []
    line = line.split()
    for i in line:
        i = i.lower()
        if i[0] != '@' and i[0] != "#": # Getting rid of hashtag and @s
            i = remove_punctuation(i)
            if len(i) != 0 and i[:5] != 'https' and i not in stop_words and i != 'rt' and i != 'amp': # Getting rid of 
                l.append(i)
    return l

In [6]:
cleaned_tweets = dict()

for handle in Tweet_list:        
    cleaned_tweets[handle] = [data_clean(l) for l in read_in_tweets(handle)]

## Topic Modeling

In [7]:
from gensim import corpora
import pickle
import gensim
import copy

In [8]:
def get_topics(handle, NUM_TOPICS, NUM_WORDS, NUM_PASSES = 15, seed = 123):
    dictionary = corpora.Dictionary(cleaned_tweets[handle])
    corpus = [dictionary.doc2bow(text) for text in cleaned_tweets[handle]]
    # pickle.dump(corpus, open('corpus/' + handle + '_corpus.pkl', 'wb'))
    # dictionary.save('dictionary/' + hanlde + '_dictionary.gensim')
    
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes = NUM_PASSES, random_state = seed)
    ## Getting the topics with the top words asscociated with this topic
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    
    with open(handle + '_model_' + str(NUM_TOPICS) + '.pkl', 'wb') as f:
        pickle.dump(ldamodel, f, pickle.HIGHEST_PROTOCOL)    
    
    topics_dic = {}
    
    for i in range(NUM_TOPICS):
        topic = topics[i][1].split("+")
        topic_dic = {}
        for item in topic:
            weight = float(item.split("*")[0])
            key = item.split("*")[1].split("\"")[1]
            topic_dic[key] = weight
        topics_dic[i] = topic_dic.copy()
        
    return(topics_dic)

def print_topics(topic_dic):
    for i in topic_dic.keys():
        print("Topic " + str(i + 1) + ":")
        for j in topic_dic[i].keys():
            print('  ' + j + ' (' + str(topic_dic[i][j]) + ')')    

In [9]:
## Hyper-parameter for training and for display
num_topic = 5
num_words = 10

In [10]:
handle = "realDonaldTrump"
DT_dic = get_topics(handle, num_topic, num_words)
with open(handle + '_dic_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(DT_dic, f, pickle.HIGHEST_PROTOCOL)

In [11]:
handle = "BarackObama"
BO_dic = get_topics(handle, num_topic, num_words)
with open(handle + '_dic_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(BO_dic, f, pickle.HIGHEST_PROTOCOL)

## Checking and Visualizing Topics

In [12]:
print_topics(DT_dic)

Topic 1:
  great (0.029)
  thank (0.017)
  america (0.009)
  today (0.008)
  big (0.008)
  border (0.007)
  people (0.006)
  party (0.006)
  country (0.006)
  make (0.005)
Topic 2:
  news (0.018)
  fake (0.015)
  media (0.013)
  new (0.009)
  people (0.008)
  president (0.006)
  years (0.005)
  jobs (0.005)
  obama (0.004)
  york (0.004)
Topic 3:
  president (0.013)
  great (0.011)
  trump (0.009)
  collusion (0.006)
  time (0.006)
  years (0.005)
  best (0.005)
  american (0.005)
  country (0.005)
  china (0.004)
Topic 4:
  trump (0.012)
  democrats (0.01)
  president (0.008)
  great (0.007)
  states (0.007)
  collusion (0.007)
  wall (0.006)
  today (0.006)
  mueller (0.006)
  united (0.006)
Topic 5:
  border (0.018)
  democrats (0.015)
  must (0.009)
  southern (0.008)
  people (0.007)
  president (0.007)
  never (0.006)
  mueller (0.005)
  mexico (0.005)
  security (0.005)


In [13]:
print_topics(BO_dic)

Topic 1:
  president (0.05)
  obama (0.044)
  watch (0.012)
  live (0.011)
  get (0.01)
  health (0.01)
  et (0.009)
  tune (0.008)
  today (0.007)
  speaking (0.007)
Topic 2:
  president (0.028)
  obama (0.028)
  health (0.019)
  care (0.015)
  americans (0.011)
  today (0.009)
  make (0.007)
  years (0.007)
  millions (0.006)
  people (0.006)
Topic 3:
  president (0.034)
  obama (0.03)
  change (0.015)
  climate (0.012)
  address (0.01)
  weekly (0.009)
  watch (0.009)
  time (0.008)
  make (0.007)
  states (0.007)
Topic 4:
  senate (0.013)
  leaders (0.01)
  supreme (0.009)
  change (0.009)
  climate (0.009)
  court (0.009)
  add (0.008)
  job (0.008)
  name (0.008)
  make (0.007)
Topic 5:
  president (0.057)
  obama (0.05)
  economy (0.008)
  immigration (0.008)
  live (0.008)
  system (0.007)
  read (0.006)
  america (0.006)
  watch (0.006)
  progress (0.006)


In [14]:
import pyLDAvis.gensim

In [15]:
handle = "BarackObama"

dictionary = corpora.Dictionary(cleaned_tweets[handle])
corpus = [dictionary.doc2bow(text) for text in cleaned_tweets[handle]]
with open(handle + '_model_' + str(num_topic) + '.pkl', 'rb') as f:
    ldamodel = pickle.load(f)
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)

with open(handle + '_lda_display_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(lda_display, f, pickle.HIGHEST_PROTOCOL)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [16]:
pyLDAvis.display(lda_display)

In [71]:
doc = corpus[11]

In [72]:
a = ldamodel[doc].sorted

In [82]:
dictionary = corpora.Dictionary(cleaned_tweets[handle])
corpus = [dictionary.doc2bow(text) for text in cleaned_tweets[handle]]

In [85]:
len(a)

96750

In [84]:
a = cleaned_tweets[handle] * 30

In [86]:
dictionary = corpora.Dictionary(a)
corpus = [dictionary.doc2bow(text) for text in a]

In [17]:
handle = "realDonaldTrump"

dictionary = corpora.Dictionary(cleaned_tweets[handle])
corpus = [dictionary.doc2bow(text) for text in cleaned_tweets[handle]]
with open(handle + '_model_' + str(num_topic) + '.pkl', 'rb') as f:
    ldamodel = pickle.load(f)
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)

with open(handle + '_lda_display_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(lda_display, f, pickle.HIGHEST_PROTOCOL)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [18]:
pyLDAvis.display(lda_display)

## Creating 'word' embeddings for each topic

In [21]:
with open("../glove_dict.pkl" , "rb") as f:
    glove_dic = pickle.load(f)

In [51]:
vocab = glove_dic.keys()

def create_embeddings(topic_dic):
    vec_dic = {}
    for key in topic_dic.keys():
        topic = topic_dic[key]
        acc = 0
        acc_vec = np.zeros(100)
        for key_word in topic: 
            if key_word in vocab:
                acc_vec += glove_dic[key_word] * topic[key_word]
                acc += topic[key_word]
        if acc == 0:
            print("No word found in the vocabulary :(")
        else:
            acc_vec /= acc
        vec_dic[key] = acc_vec
    return(vec_dic)

In [52]:
DT_vec = create_embeddings(DT_dic)
handle = "realDonaldTrump"
with open(handle + '_vec_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(DT_vec, f, pickle.HIGHEST_PROTOCOL)

In [53]:
BO_vec = create_embeddings(BO_dic)
handle = "BarackObama"
with open(handle + '_vec_' + str(num_topic) + '.pkl', 'wb') as f:
    pickle.dump(BO_vec, f, pickle.HIGHEST_PROTOCOL)