# Latent Dirichlet Allocation (LDA)

Sklearn example from https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Load data from the 20 news groups data

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print(documents[0])

Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



### Prepare the data for LDA

In [3]:
num_features = 1000

# LDA can only uses raw term counts
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

In [4]:
tf_feature_names = tf_vectorizer.get_feature_names() #theses are the words in our bag of words

### Build the LDA model

In [5]:
num_topics = 10

# Run LDA
lda = LatentDirichletAllocation(n_components=num_topics, max_iter=5, learning_method='online',random_state=0, n_jobs=-1)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [6]:
lda.components_

array([[1.00224371e-01, 2.94490425e+02, 1.00009572e-01, ...,
        4.58066266e+00, 1.40763399e+02, 1.08318812e+02],
       [1.00335073e-01, 1.99507980e+01, 1.00066313e-01, ...,
        2.71899073e+02, 4.59168261e+00, 1.00093416e-01],
       [2.40288744e+00, 1.00044968e-01, 2.06517163e-01, ...,
        1.01934120e-01, 1.00037526e-01, 1.00024359e-01],
       ...,
       [1.11080956e-01, 1.00025849e-01, 1.00576955e-01, ...,
        1.00000721e-01, 1.00258149e-01, 1.00005005e-01],
       [1.00300329e-01, 1.00184041e-01, 1.00014551e-01, ...,
        2.58994253e+02, 4.49397641e+01, 1.54072534e+02],
       [1.05807440e+03, 3.57307803e+02, 2.19107405e+02, ...,
        1.67731290e-01, 1.03143091e+02, 3.38903202e+01]])

### Display the top ten words for each topic

In [8]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))

num_top_words = 10
display_topics(lda, tf_feature_names, num_top_words)

Topic 0:
people government gun armenian said armenians states war israel state
Topic 1:
people don like right point make know think use question
Topic 2:
space program db output data entry nasa use build line
Topic 3:
key encryption chip government use keys public clipper security law
Topic 4:
edu file com available mail information ftp files list send
Topic 5:
god people jesus does believe say think christian bible true
Topic 6:
windows use drive thanks does card problem like know using
Topic 7:
ax max b8f g9v a86 pl 145 1d9 0t 34u
Topic 8:
just don like think know good time didn going ve
Topic 9:
10 00 20 15 25 12 14 11 17 16


### Model evaluation

Model [perplexity](https://en.wikipedia.org/wiki/Perplexity) is often used in LDA to evaluate how well a model predicts a sample.

In [9]:
print("Model perplexity: {0:0.3f}".format(lda.perplexity(tf)))

Model perplexity: 265.308


## Show how to do LDA in gensim

Example from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

If you don't already have gensim installed:  
`$ pip install -U gensim`  

And pyLDAVis:  
`$ pip install pyldavis`

### Imports

In [12]:
import pandas as pd
import numpy as np

#gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim.corpora as corpora
from gensim.models import CoherenceModel

#nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package wordnet to /home/frank/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Load data

In [13]:
data = pd.read_csv('data/abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]
data_text['index'] = data_text.index
documents = data_text

### Preprocess data

In [14]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [15]:
stemmer = SnowballStemmer('english')
processed_docs = documents['headline_text'].map(preprocess)

In [16]:
processed_docs[:10]

0            [decid, communiti, broadcast, licenc]
1                               [wit, awar, defam]
2           [call, infrastructur, protect, summit]
3                      [staff, aust, strike, rise]
4             [strike, affect, australian, travel]
5               [ambiti, olsson, win, tripl, jump]
6           [antic, delight, record, break, barca]
7    [aussi, qualifi, stosur, wast, memphi, match]
8            [aust, address, secur, council, iraq]
9                         [australia, lock, timet]
Name: headline_text, dtype: object

### Bag of words

In [17]:
#create dictionary
id2word = gensim.corpora.Dictionary(processed_docs)

#create corpus
texts = processed_docs

#Term Document Frequency
bow_corpus = [id2word.doc2bow(text) for text in texts]

In [18]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in bow_corpus]

[[('broadcast', 1), ('communiti', 1), ('decid', 1), ('licenc', 1)],
 [('awar', 1), ('defam', 1), ('wit', 1)],
 [('call', 1), ('infrastructur', 1), ('protect', 1), ('summit', 1)],
 [('aust', 1), ('rise', 1), ('staff', 1), ('strike', 1)],
 [('strike', 1), ('affect', 1), ('australian', 1), ('travel', 1)],
 [('ambiti', 1), ('jump', 1), ('olsson', 1), ('tripl', 1), ('win', 1)],
 [('antic', 1), ('barca', 1), ('break', 1), ('delight', 1), ('record', 1)],
 [('aussi', 1),
  ('match', 1),
  ('memphi', 1),
  ('qualifi', 1),
  ('stosur', 1),
  ('wast', 1)],
 [('aust', 1), ('address', 1), ('council', 1), ('iraq', 1), ('secur', 1)],
 [('australia', 1), ('lock', 1), ('timet', 1)],
 [('iraq', 1), ('australia', 1), ('contribut', 1), ('million', 1)],
 [('barca', 1), ('record', 1), ('birthday', 1), ('celebr', 1), ('robson', 1)],
 [('ahead', 1), ('bathhous', 1), ('plan', 1)],
 [('championship', 1), ('cycl', 1), ('hop', 1), ('launceston', 1)],
 [('plan', 1), ('boost', 1), ('paroo', 1), ('suppli', 1), ('wat

In [18]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=id2word, passes=2, workers=2)

### View topics in the LDA model

In [19]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[bow_corpus]

[(0,
  '0.030*"elect" + 0.022*"warn" + 0.019*"live" + 0.017*"drug" + 0.017*"donald" '
  '+ 0.011*"campaign" + 0.011*"violenc" + 0.010*"john" + 0.010*"young" + '
  '0.010*"show"'),
 (1,
  '0.032*"govern" + 0.024*"say" + 0.019*"help" + 0.015*"state" + '
  '0.015*"report" + 0.014*"labor" + 0.013*"servic" + 0.013*"peopl" + '
  '0.012*"feder" + 0.012*"victoria"'),
 (2,
  '0.026*"polic" + 0.025*"queensland" + 0.023*"attack" + 0.020*"kill" + '
  '0.019*"crash" + 0.018*"melbourn" + 0.016*"die" + 0.016*"interview" + '
  '0.016*"brisban" + 0.015*"shoot"'),
 (3,
  '0.018*"rural" + 0.015*"death" + 0.015*"power" + 0.014*"call" + '
  '0.013*"investig" + 0.013*"farmer" + 0.013*"communiti" + 0.013*"concern" + '
  '0.011*"farm" + 0.010*"royal"'),
 (4,
  '0.033*"trump" + 0.023*"adelaid" + 0.020*"perth" + 0.017*"final" + '
  '0.017*"world" + 0.015*"open" + 0.015*"win" + 0.013*"test" + 0.012*"lose" + '
  '0.011*"flood"'),
 (5,
  '0.034*"charg" + 0.032*"court" + 0.025*"murder" + 0.021*"face" + '
  '0.018*"

### Compute Model Perplexity and Coherence Score (interpretability of the model)



In [20]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.242384565660066

Coherence Score:  0.2074296670593884


### Visualize the topics-keywords

In [21]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
