# Tweets - Topic Modeling with LDA

This notebook follows the approach of the [Introduction to Topic Modeling notebook](https://github.com/kapadias/mediumposts/blob/master/natural_language_processing/topic_modeling/notebooks/Introduction%20to%20Topic%20Modeling.ipynb).

In [17]:
# Importing modules
import pandas as pd
import os
from collections import defaultdict

# Group tweet texts by the value in the second TSV column; the summary
# printed below reports the distinct keys as "peak months".
tweets_per_peak = defaultdict(list)
t_count = 0

with open('../input/topicmodelling/all_tweets_neg_emo_words_with_emoji.tsv', mode='r', encoding='utf8') as fin:
    for raw_row in fin:
        fields = raw_row.strip().split('\t')

        # Rows with fewer than three tab-separated fields have no third
        # column to read, so they are skipped and not counted.
        if len(fields) <= 2:
            continue

        peak_key, tweet_text = fields[1], fields[2]
        tweets_per_peak[peak_key].append(tweet_text)
        t_count += 1

print(f'Total tweets: {t_count}, for {len(tweets_per_peak)} peak months')

Total tweets: 20086, for 26 peak months


## Processing

In [21]:
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import emoji
from tqdm import tqdm

# English stop words from NLTK. `stop_words` is kept as the original list;
# the set copy is what the token filter uses, so each membership test is a
# hash lookup instead of a scan over the whole list.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)

# all_words[i] holds the cleaned tokens of the document built for peaks[i].
all_words = []
peaks = []

# One document per peak: every tweet of that peak is concatenated and then
# cleaned and tokenised as a single text.
for d_key, d_val in tqdm(tweets_per_peak.items()):
    peaks.append(d_key)
    doc = ' '.join(d_val)
    doc = emoji.replace_emoji(doc, replace=' ')
    # Fold accents BEFORE the ASCII-only filter below. Previously the filter
    # ran first and replaced every accented letter with a space (e.g.
    # "café" -> "caf"), so the later deacc=True never had anything to do.
    doc = gensim.utils.deaccent(doc)
    # Collapse every run of characters outside A-Z, a-z, 0-9 into one space.
    doc = re.sub(r'[^A-Za-z0-9]+', ' ', doc)
    doc = doc.lower()
    words = gensim.utils.simple_preprocess(doc, deacc=True)
    words = [w for w in words if w not in stop_word_set]
    all_words.append(words)

print(f'Number of docs: {len(all_words)}')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 26/26 [00:02<00:00, 11.10it/s]

Number of docs: 26





In [22]:
import gensim.corpora as corpora

# Build the token <-> integer-id dictionary from the tokenised documents.
id2word = corpora.Dictionary(all_words)

# The tokenised documents are used directly as the corpus texts.
texts = all_words

# Bag-of-words encoding: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))

# Preview the first 30 (token_id, count) pairs of the first document.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 5), (4, 1), (5, 1), (6, 4), (7, 1), (8, 2), (9, 1), (10, 1), (11, 4), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 8), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [23]:
from pprint import pprint

# Number of latent topics the model is asked to find.
num_topics = 10

# Build LDA model.
# random_state fixes the seed of the model's random initialisation so that
# re-running the notebook does not produce a different set of topics each
# time (NOTE: LdaMulticore trains with worker processes, so confirm run-to-run
# stability on your setup if exact reproducibility matters).
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)

# Print the top keywords of every topic. num_topics is passed explicitly so
# that all topics are still shown if num_topics is raised above the default
# limit used by print_topics().
pprint(lda_model.print_topics(num_topics=num_topics))

# Apply the trained model to the corpus (topic mixture per document).
doc_lda = lda_model[corpus]

[(0,
  '0.020*"gay" + 0.011*"pride" + 0.007*"conversion" + 0.007*"rt" + '
  '0.006*"bullying" + 0.006*"people" + 0.006*"therapy" + 0.005*"news" + '
  '0.004*"homophobia" + 0.004*"sex"'),
 (1,
  '0.022*"gay" + 0.009*"pride" + 0.007*"conversion" + 0.007*"ban" + '
  '0.006*"therapy" + 0.005*"people" + 0.005*"new" + 0.005*"discrimination" + '
  '0.004*"rt" + 0.004*"porn"'),
 (2,
  '0.032*"gay" + 0.018*"pride" + 0.008*"conversion" + 0.007*"rt" + '
  '0.007*"people" + 0.006*"therapy" + 0.006*"ban" + 0.005*"news" + 0.005*"new" '
  '+ 0.005*"women"'),
 (3,
  '0.028*"gay" + 0.011*"pride" + 0.009*"rt" + 0.007*"therapy" + '
  '0.007*"conversion" + 0.005*"news" + 0.005*"bullying" + 0.005*"people" + '
  '0.004*"ban" + 0.004*"discrimination"'),
 (4,
  '0.035*"gay" + 0.018*"pride" + 0.011*"conversion" + 0.011*"therapy" + '
  '0.008*"rt" + 0.008*"ban" + 0.005*"news" + 0.005*"bullying" + 0.004*"new" + '
  '0.004*"people"'),
 (5,
  '0.016*"gay" + 0.008*"pride" + 0.007*"therapy" + 0.007*"conversion" + '


In [24]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Both output files are named after the number of topics.
LDAvis_data_filepath = './ldavis_prepared_' + str(num_topics)
ldavis_html_filepath = './ldavis_prepared_' + str(num_topics) + '.html'

# Preparing the visualisation data is a bit time consuming. The switch below
# is always on, so the data is rebuilt and re-pickled on every run; set it to
# False to reuse a pickle that is already on disk.
recompute_ldavis = True
if recompute_ldavis:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as pickle_out:
        pickle.dump(LDAvis_prepared, pickle_out)

# Read the prepared pyLDAvis data back from disk.
with open(LDAvis_data_filepath, 'rb') as pickle_in:
    LDAvis_prepared = pickle.load(pickle_in)

# Also export a standalone HTML copy of the visualisation.
pyLDAvis.save_html(LDAvis_prepared, ldavis_html_filepath)

# Last expression of the cell: displays the visualisation.
LDAvis_prepared

  """
  from imp import reload
  by='saliency', ascending=False).head(R).drop('saliency', 1)


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
