# Latent Dirichlet Allocation Demo

## Import dependencies

In [None]:
import os
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool

## Corpus processing

The 20 Newsgroups data set is a collection of roughly 20,000 newsgroup documents spread across 20 different topics, which makes it a nice testing ground for topic classification.

In [None]:
# Settings shared by the cells below.
threshold = 0.0   # a document is plotted only if its top topic probability exceeds this
n_top_words = 5   # number of words used to summarise each topic
n_iter = 500      # NOTE(review): only reported in the plot title in this notebook

In [None]:
# Fetch the whole 20 Newsgroups corpus, asking scikit-learn to strip the
# parts listed in `remove` from every post.
remove = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(subset='all', remove=remove)

# Lower-case each post and keep only the whitespace-separated tokens that are
# purely alphabetic; anything containing digits or punctuation is dropped.
corpus_raw = [
    ' '.join(token for token in raw.lower().split() if token.isalpha())
    for raw in newsgroups.data
]

print("Before:\n", newsgroups.data[0])
print("After:\n", corpus_raw[0])

## Clean documents

In [None]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# NOTE: this rebinds the imported `stopwords` name to a plain set of English
# stop words, so the imported object is no longer reachable under that name.
stopwords = set(stopwords.words('english'))
# Set of individual punctuation characters, used for per-character filtering.
punctuation = set(string.punctuation) 
# Shared lemmatizer instance used by cleaning().
lemmatize = WordNetLemmatizer()

def cleaning(article):
    """Turn one document string into a list of lemmatized tokens.

    Steps, in order:
      1. drop whitespace-separated words found in the module-level
         ``stopwords`` set,
      2. delete every character found in the module-level ``punctuation`` set,
      3. lemmatize each remaining word with the module-level ``lemmatize``
         object.

    Args:
        article: The document text as a single string.

    Returns:
        A list of lemmatized tokens. A document with no surviving words
        yields an empty list (not ``['']``), so no empty-string token is
        ever added to the vocabulary.
    """
    without_stopwords = " ".join(
        word for word in article.split() if word not in stopwords
    )
    without_punctuation = "".join(
        char for char in without_stopwords if char not in punctuation
    )
    # Build the token list directly: joining and re-splitting on " " would
    # turn an empty document into [''] instead of [].
    return [lemmatize.lemmatize(word) for word in without_punctuation.split()]

# Tokenize every pre-filtered post and show the first one as a sanity check.
corpus_tokenized = list(map(cleaning, corpus_raw))
print(corpus_tokenized[0])

## Create BOW matrix for the model

In [None]:
from time import time
import logging

# Write INFO-level (and above) log records to running.log; filemode 'w'
# truncates the file each time this cell configures logging.
logging.basicConfig(
    filename='running.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Importing Gensim
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer

# Term dictionary of the corpus: every unique token is assigned an index.
dictionary = corpora.Dictionary(corpus_tokenized)

# Bag-of-words representation of each tokenized document.
doc_term_matrix = list(map(dictionary.doc2bow, corpus_tokenized))

print("Len of raw corpus: %i | Len of matrix: %i" % (len(corpus_raw), len(doc_term_matrix)))
print("Processed:\n", doc_term_matrix[0])


## Train LDA model

This can take a couple of minutes. I chose to use the actual number of topics (one per newsgroup) for the sake of visualization.

In [None]:
from gensim.models.ldamodel import LdaModel

start = time()

# Use one topic per newsgroup label.
num_topics = len(newsgroups.target_names)
print(num_topics)

# Train the LDA model on the document-term matrix.
ldamodel = LdaModel(
    doc_term_matrix,
    num_topics=num_topics,
    id2word=dictionary,
    passes=50,
)
print('used: {:.2f}s'.format(time() - start))

# Persist the trained model so later cells can reload it from disk.
ldamodel.save('topic.model')

print("Model Saved")

## Load model

Load model if you already have a trained model.

In [None]:
# Restore the model that the training cell saved as 'topic.model'.
from gensim.models import LdaModel

loaded_model = LdaModel.load('topic.model')

# Quick preview: two topics, four words each.
print(loaded_model.print_topics(num_words=4, num_topics=2))

## Top words in each topic

In [None]:
# Print each topic returned by print_topics(): the topic id on one line and
# its word expression on the next. The loaded model is used, like every later
# cell, so this also works when the training cell was skipped and only the
# saved model was loaded.
for topic_id, topic_terms in loaded_model.print_topics():
    print(topic_id)
    print(topic_terms)

## Let's test it out on the test set

In [None]:
# NOTE(review): subset='all' is the same subset the model was trained on, so
# this is not a held-out split; confirm whether subset='test' was intended.
remove = ('headers', 'footers', 'quotes')
newsgroups_test = fetch_20newsgroups(subset='all', remove=remove)

# Same pre-filter as for training: lower-case, purely alphabetic tokens only.
corpus_raw_test = [
    ' '.join(token for token in raw.lower().split() if token.isalpha())
    for raw in newsgroups_test.data
]

print("Original Sentence:\n", newsgroups_test.data[0])

# Tokenize, then map onto the existing dictionary to get bag-of-words vectors.
corpus_tokenized_test = list(map(cleaning, corpus_raw_test))

doc_term_matrix_test = list(map(dictionary.doc2bow, corpus_tokenized_test))

print("\nAfter processing:\n", doc_term_matrix_test[0])

## Example output for one sentence

In [None]:
# Query the loaded model with the bag-of-words vector of document 100, then
# print the whole result followed by each of its entries on its own line.
test_output = loaded_model[doc_term_matrix_test[100]]
print("Output:\n", test_output)
for topic_entry in test_output:
    print(topic_entry)

# Visualization

## Get vectors

We have to predict the probabilities for each document and put them in a matrix.

In [None]:
# Dense (documents x topics) matrix; any topic the model does not report for
# a document keeps the initial value 0.
prob_matrix = np.zeros((len(doc_term_matrix_test), num_topics))

for i, doc in enumerate(doc_term_matrix_test):
    predictions = loaded_model[doc]
    if not predictions:
        # Nothing reported for this document: leave its row at zero rather
        # than failing on the unpacking below (zip(*[]) yields no values).
        continue
    # predictions is a sequence of (topic index, probability) pairs.
    idx, prob = zip(*predictions)
    prob_matrix[i, list(idx)] = prob


## t-SNE

20 dimensions are hard to visualize, so let's run t-SNE to reduce the dimensionality. This can also take a couple of minutes.

In [None]:
# Boolean mask of documents whose largest topic probability exceeds threshold.
_idx = prob_matrix.max(axis=1) > threshold
_topics = prob_matrix[_idx]

num_example = _topics.shape[0]

# Project the topic-probability rows down to two dimensions.
tsne_model = TSNE(
    n_components=2,
    init='pca',
    angle=.99,
    random_state=0,
    verbose=1,
)
# num_example is the row count of _topics, so this slice keeps every row.
tsne_lda = tsne_model.fit_transform(_topics[:num_example])


## Set up metadata for visualization

In [None]:
# Index of the most probable topic for each retained document.
_lda_keys = list(_topics.argmax(axis=1))

# One summary string per topic: its top n_top_words words joined by spaces.
topic_summaries = [
    ' '.join(
        word for word, _weight in loaded_model.show_topic(topic_id, topn=n_top_words)
    )
    for topic_id in range(num_topics)
]

# 20 colors, indexed by topic id.
colormap = np.array([
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5",
])

# Plot title summarising corpus size, topic count and display settings.
title = (
    "[20 newsgroups] t-SNE visualization of LDA model trained on {} news, "
    "{} topics, thresholding at {} topic probability, {} iter ({} data "
    "points and top {} words)"
).format(
    prob_matrix.shape[0], num_topics, threshold, n_iter, num_example, n_top_words
)
        

## Visualize!

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, CDSView
from bokeh.io import output_notebook
output_notebook()

p = bp.figure(plot_width=1400, plot_height=1100,
              title=title,
              tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
              x_axis_type=None, y_axis_type=None, min_border=1)

# The t-SNE points were computed from the documents selected by the _idx
# mask, so the hover text must be taken from those same documents. Taking the
# first num_example documents would mislabel points whenever the threshold
# actually filters something out.
kept_docs = [doc for doc, keep in zip(corpus_raw_test, _idx) if keep]

source = ColumnDataSource(data=dict(
    x=tsne_lda[:, 0],
    y=tsne_lda[:, 1],
    color=colormap[_lda_keys],
    content=kept_docs,
    topic_key=_lda_keys,
))

p.scatter(x='x', y='y', color='color', source=source)

# Label position for each topic: the 2-D coordinate of the first plotted
# document assigned to it. Topics with no document stay NaN.
topic_coord = np.full((prob_matrix.shape[1], 2), np.nan)
if _lda_keys:
    # return_index gives the first occurrence of each distinct topic id,
    # in a single pass instead of a list.index() lookup per document.
    seen_topics, first_rows = np.unique(_lda_keys, return_index=True)
    topic_coord[seen_topics] = tsne_lda[first_rows]

# plot crucial words
for i in range(prob_matrix.shape[1]):
    p.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])

# hover tools
hover = p.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}

show(p)