In [9]:
from pathlib import Path
from pprint import pprint

import pandas as pd

import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import Phrases, ldamodel
from nltk.corpus import stopwords

In [2]:
# Config params

# Data directory
DATA = Path('../../reddit/data/')

# Define stop words
stop_words = stopwords.words('english')

# Set random state for topic model experiments
seed = 2019

### 1. Preprocessing and Feature Extraction

In [3]:
# Read consolidated and filtered posts file
allposts = pd.read_csv(DATA/'posts/processed/filtered_posts.tsv', sep='\t', encoding='utf-8')

In [4]:
def tokenize(text, remove_stopwords=True):
    if remove_stopwords:
        return [s for s in simple_preprocess(text) if s not in stop_words]
    
    else:
        simple_preprocess(text)

In [5]:
%%time

# Tokenize post titles
allposts['tokenized_titles'] = allposts['title'].apply(tokenize)

# Create Dictionary
id2word = corpora.Dictionary(allposts['tokenized_titles'])

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in allposts['tokenized_titles']]

CPU times: user 2min 8s, sys: 1.09 s, total: 2min 9s
Wall time: 2min 9s


### 2. Create topic model

In [6]:
%%time

lda_model = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=30, 
    random_state=seed,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True)

CPU times: user 5d 5h 34min 8s, sys: 6h 53min 58s, total: 5d 12h 28min 6s
Wall time: 22h 15min 54s


### 3. Analyze

In [10]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(24,
  '0.000*"babb" + 0.000*"pedofag" + 0.000*"drivethemout" + '
  '0.000*"iwillredpillyou" + 0.000*"lanier" + 0.000*"kippy" + 0.000*"hiphop" + '
  '0.000*"geographically" + 0.000*"abdominal" + 0.000*"stansted"'),
 (13,
  '0.401*"children" + 0.206*"legal" + 0.102*"future" + 0.073*"faces" + '
  '0.039*"professor" + 0.028*"consequences" + 0.000*"geographically" + '
  '0.000*"pedofag" + 0.000*"manchesterian" + 0.000*"mobb"'),
 (22,
  '0.178*"security" + 0.137*"social" + 0.125*"questions" + 0.118*"senator" + '
  '0.114*"confirmed" + 0.062*"half" + 0.059*"pm" + 0.045*"scheme" + '
  '0.026*"raped" + 0.013*"rick"'),
 (18,
  '0.228*"thought" + 0.188*"emails" + 0.130*"mass" + 0.108*"nobody" + '
  '0.103*"muslim" + 0.074*"murder" + 0.038*"cares" + 0.000*"biden" + '
  '0.000*"geographically" + 0.000*"pedofag"'),
 (7,
  '0.190*"next" + 0.180*"paul" + 0.102*"rand" + 0.093*"debate" + '
  '0.076*"facebook" + 0.060*"ceo" + 0.057*"posted" + 0.047*"stage" + '
  '0.039*"ben" + 0.027*"outraged"'),
 (3,


In [15]:
%%time

lda_model.save(fname=str(DATA/'lda_model'))

CPU times: user 192 ms, sys: 0 ns, total: 192 ms
Wall time: 194 ms


In [16]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -33.83386822880872


NameError: name 'CoherenceModel' is not defined

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

### Future Work

In [None]:
# # Build the bigram and trigram models
# bigram = Phrases(tokenized_sents, min_count=20, 
#                  threshold=100) 

# trigram = gensim.models.Phrases(bigram[data_words],
#                                 threshold=100)  

# # Faster way to get a sentence clubbed as a trigram/bigram
# bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

# # See trigram example
# print(trigram_mod[bigram_mod[data_words[0]]])

# def make_bigrams(texts):
#     return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]
#
# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out
#
# # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# # python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# # Do lemmatization keeping only noun, adj, vb, adv
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# print(data_lemmatized[:1])