
# Topic Modeling Algorithms
 
* LDA – Latent Dirichlet Allocation – The one we’ll be focusing on in this tutorial. Its foundations are Probabilistic Graphical Models
* LSA or LSI – Latent Semantic Analysis or Latent Semantic Indexing – Uses Singular Value Decomposition (SVD) on the Document-Term Matrix. Based on Linear Algebra
* NMF – Non-Negative Matrix Factorization – Based on Linear Algebra

## gensim

In [2]:
from nltk.corpus import brown
 
# One string per Brown corpus file: the items returned by brown.words(fileid)
# joined with single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

# Number of documents collected, followed by a peek at the first one.
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:1])

500


In [3]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

# Set copy of STOPWORDS so the per-token membership test is a hash lookup
# instead of a scan over the whole list.
_STOPWORD_SET = frozenset(STOPWORDS)
# Raw string: '\-' inside a plain literal is an invalid escape sequence that
# newer Python versions warn about. The pattern text itself is unchanged.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize *text*, dropping stopwords and non-word tokens.

    Args:
        text: The raw document string.

    Returns:
        A list of the tokens, in their original order, that are not in
        STOPWORDS and whose first three characters are ASCII letters or
        hyphens.
    """
    tokens = word_tokenize(text.lower())
    # match() anchors only at the start of the token, so a token is kept as
    # soon as its first three characters are letters/hyphens; whatever
    # follows them is not checked.
    return [t for t in tokens if t not in _STOPWORD_SET and _TOKEN_PATTERN.match(t)]
 
# gensim consumes token lists, so every document goes through clean_text
# (tokenization plus stopword filtering) first.
tokenized_data = [clean_text(text) for text in data]

# Dictionary: maps every distinct token to a numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Bag-of-words form of each tokenized document
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Inspect the entry at index 20; it is a list of (word_id, count) pairs
print(corpus[20])

# Train an LDA model and an LSI model on the same corpus, with the same
# number of topics and the same id -> word mapping.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [6]:
# Same report for both gensim models: a heading, one line per topic showing
# its 10 highest-weighted words, then a separator.
for heading, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(heading)

    for idx in range(NUM_TOPICS):
        print("Topic #%s:" % idx, topic_model.print_topic(idx, 10))

    print("=" * 40)

LDA Model:
Topic #0: 0.006*"would" + 0.005*"one" + 0.004*"said" + 0.003*"new" + 0.003*"time" + 0.003*"man" + 0.003*"could" + 0.003*"first" + 0.003*"like" + 0.002*"may"
Topic #1: 0.006*"one" + 0.005*"would" + 0.004*"could" + 0.004*"said" + 0.003*"new" + 0.003*"time" + 0.003*"first" + 0.003*"like" + 0.002*"man" + 0.002*"may"
Topic #2: 0.005*"one" + 0.005*"would" + 0.004*"time" + 0.003*"said" + 0.003*"new" + 0.003*"man" + 0.003*"even" + 0.003*"first" + 0.002*"like" + 0.002*"two"
Topic #3: 0.007*"one" + 0.005*"would" + 0.003*"said" + 0.003*"like" + 0.003*"new" + 0.002*"man" + 0.002*"could" + 0.002*"two" + 0.002*"even" + 0.002*"time"
Topic #4: 0.006*"one" + 0.005*"would" + 0.003*"two" + 0.003*"could" + 0.003*"said" + 0.003*"new" + 0.003*"made" + 0.002*"like" + 0.002*"may" + 0.002*"back"
Topic #5: 0.006*"would" + 0.006*"one" + 0.004*"said" + 0.003*"may" + 0.003*"time" + 0.003*"could" + 0.003*"new" + 0.003*"made" + 0.003*"two" + 0.002*"years"
Topic #6: 0.007*"one" + 0.003*"would" + 0.003*"cou

In [10]:
text = "The economy is working better than ever"
text2 = "Yo mama is hella thicc"

# Only text2 is queried here; `text` is defined but not used in this cell.
query_tokens = clean_text(text2)
bow = dictionary.doc2bow(query_tokens)

# Representation of the query under the LSI model, followed by a blank line
lsi_vector = lsi_model[bow]
print(lsi_vector, '\n')

# Representation of the same query under the LDA model
lda_vector = lda_model[bow]
print(lda_vector)

[(0, -0.005957209831818709), (1, 0.010017210910680325), (2, 0.011070348658075945), (3, -0.007651844784759697), (4, 0.011900071336933887), (5, -0.0260031360772242), (6, 0.005315282365291289), (7, -0.03651638533287692), (8, 0.03149981656372626), (9, 0.00328679075694875)] 

[(0, 0.050090764), (1, 0.050079573), (2, 0.05007484), (3, 0.050083853), (4, 0.5493299), (5, 0.05007368), (6, 0.050066877), (7, 0.050068233), (8, 0.050064757), (9, 0.05006755)]


The largest numbers suggest the most likely topic for the text.

In [11]:
from gensim import similarities
 
# Index the LDA representation of every document in the corpus.
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Score every indexed document against the query's LDA representation.
# The result gets its own name so that the imported `similarities` module is
# not overwritten by a list.
query_scores = lda_index[lda_model[bow]]
# Rank (document_id, score) pairs from the highest score to the lowest.
ranked_matches = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Ten best matches, printed as [(document_id, score), ...]; the exact values
# depend on the trained model.
print(ranked_matches[:10])

# Show the first 1000 characters of the best-matching document.
document_id, similarity = ranked_matches[0]
print(data[document_id][:1000])

[(93, 0.973749), (363, 0.97255266), (447, 0.9720818), (7, 0.97179264), (287, 0.9708616), (418, 0.97085434), (495, 0.97042394), (233, 0.9703239), (246, 0.9688419), (122, 0.96796376)]
I have , within the past fifty years , come out of all uncertainty into a faith which is a dominating conviction of the Truth and about which I have not a shadow of doubt . It has been my lot all through life to associate with eminent scientists and at times to discuss with them the deepest and most vital of all questions , the nature of the hope of a life beyond this . I have also constantly engaged in scientific work and am fully aware of the value of opinions formed in science as well as in the religions in the world . In an amateurish , yet in a very real sense , I have followed the developments of archaeology , geology , astronomy , herpetology , and mycology with a hearty appreciation of the advances being made in these fields . At one time I became disturbed in the faith in which I had grown up by th

## Using Scikit-Learn for Topic Modeling

Let’s now go through the same process with sklearn. This library offers an NMF implementation as well. The algorithms are more bare-bones than what we’ve seen with gensim but, on the plus side, they implement the fit/transform interface we’re used to:

In [13]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10

# Raw string for the token pattern: '\-' in a plain literal is an invalid
# escape sequence. The pattern text itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# n_components is passed by keyword: current scikit-learn releases reject
# positional constructor arguments for estimators.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NUM_TOPICS)


# Let's see how the first document in the corpus looks in the different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

(500, 10)
(500, 10)
(500, 10)
[8.10474935e-01 1.05606263e-04 1.05622931e-04 6.49138574e-02
 1.05624772e-04 1.05596750e-04 1.05617619e-04 1.05607002e-04
 1.05611137e-04 1.23871921e-01]
[0.         0.         2.11604366 0.07700698 0.         0.54407566
 1.06747352 0.         0.         0.24740574]
[ 2.33068433e+01  1.59525076e+00  2.18268258e+01  1.27520434e-02
  8.21588945e-01  1.16973291e+01  3.91321465e+00 -2.49564812e+00
 -3.53735748e-02 -1.35467269e+01]


In [14]:
def print_topics(model, vectorizer, top_n=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated as one weight
            vector per topic (each vector must support ``argsort()``).
        vectorizer: Object exposing ``get_feature_names_out()`` or the older
            ``get_feature_names()``; the returned sequence is indexed with
            the positions taken from each topic vector.
        top_n: Number of words to print per topic.

    Returns:
        None. For each topic, prints a ``Topic <idx>:`` line followed by a
        list of ``(word, weight)`` pairs ordered from highest to lowest weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back to the old one only when it is missing.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed word.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort() is ascending, so take the last top_n positions in reverse.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
 
# Same report for each fitted sklearn model: heading, topics, separator.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('state', 418.67046439300594), ('new', 389.73218455052483), ('states', 334.61437728219397), ('united', 313.8271352852116), ('president', 265.0047659772917), ('government', 255.6998641283773), ('said', 251.90502030818897), ('year', 233.447022796534), ('american', 183.06700613757945), ('court', 182.49046394995867)]
Topic 1:
[('drill', 24.32423066570807), ('locking', 21.13312813752891), ('bar', 19.13438094608137), ('frame', 18.808369875978492), ('bars', 18.224986154086167), ('pieces', 14.741599800819987), ('signal', 11.256400953214145), ('file', 10.853012670092696), ('piece', 9.685263718913905), ('fig', 9.570508489196397)]
Topic 2:
[('world', 480.6338610948919), ('new', 456.82570036017563), ('time', 404.602922251454), ('man', 404.33992717886116), ('people', 389.7796773369035), ('life', 383.69727988266266), ('great', 323.60642585406123), ('social', 297.6291441783118), ('way', 288.0764644705706), ('work', 273.37751848910756)]
Topic 3:
[('used', 252.71415970182704), ('ne

In [17]:
text = "The economy is working better than ever"
# Vectorize the single query text, then take the first (only) row of its
# NMF transform.
query_counts = vectorizer.transform([text])
x = nmf_model.transform(query_counts)[0]
print(x)

[0.0028994  0.         0.         0.         0.         0.00439744
 0.         0.         0.         0.00468957]


In [18]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` nearest to ``x``.

    ``x`` is reshaped to a single row and compared with every row of ``Z``
    using Euclidean distance. The result is a list of
    ``(row_index, distance)`` pairs sorted by increasing distance.
    """
    row_distances = euclidean_distances(x.reshape(1, -1), Z)[0]
    ranked = sorted(enumerate(row_distances), key=lambda pair: pair[1])
    return ranked[:top_n]
 
# Rows of nmf_Z closest to the query vector x, nearest first.
similarities = most_similar(x, nmf_Z)
# Unpack the nearest (document_id, distance) pair and show the first
# 1000 characters of that document.
closest_match = similarities[0]
document_id, similarity = closest_match
excerpt = data[document_id][:1000]
print(excerpt)
 

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

## Plotting words and documents in 2D with SVD

We can use SVD with 2 components (topics) to display words and documents in 2D. The process is really similar. Let’s start with displaying documents since it’s a bit more straightforward.

In case you are running this in a Jupyter Notebook, run the following lines to init bokeh:

In [21]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
# Initialise bokeh's notebook output before any plot is shown.
output_notebook()

In [22]:
# Reduce every row of the document-term matrix to two SVD components.
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)

# One row per document: its 2-D coordinates plus its index, used as the label.
df = pd.DataFrame({
    'x': documents_2d[:, 0],
    'y': documents_2d[:, 1],
    'document': range(len(data)),
})

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(source=source, x="x", y="y", text="document",
                  y_offset=8, text_align='center',
                  text_font_size="8pt", text_color="#555555")

# Scatter plot of the documents with the index labels layered on top.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [23]:
# Transposing the document-term matrix makes each row a word, so the SVD
# output holds one 2-D point per vocabulary entry.
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)

# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the vectorizer provides it and fall back to
# the old accessor otherwise.
feature_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)()

df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], feature_names

source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')

# NOTE(review): newer bokeh releases appear to expect width=/height= instead
# of plot_width=/plot_height= — confirm against the installed version.
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

## More about Latent Dirichlet Allocation

LDA is the most popular method for doing topic modeling in real-world applications. That is because it provides accurate results, can be trained online (there is no need to retrain from scratch every time we get new data) and can be run on multiple cores. Let’s repeat the process we did in the previous sections with sklearn and LatentDirichletAllocation:

In [25]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10

# Raw string for the token pattern: '\-' in a plain literal is an invalid
# escape sequence. The pattern text itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

# Build a Latent Dirichlet Allocation Model.
# n_components is passed by keyword: current scikit-learn releases reject
# positional constructor arguments for estimators.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

# Transform one unseen text and print its topic vector together with the sum
# of its entries.
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())

[0.02500058 0.0250002  0.02500522 0.02500243 0.77496824 0.02500379
 0.02500147 0.02500843 0.02500214 0.02500751] 1.0000000000000002


The purpose of LDA is to compute how much of the document was generated by which topic. In this example, more than half of the document has been generated by the fifth topic (index 4):

In [27]:
import pyLDAvis.sklearn
 
# Switch pyLDAvis to notebook mode before building the panel.
pyLDAvis.enable_notebook()
# Build the visualisation from the fitted LDA model, the document-term matrix
# and the vectorizer that produced it. mds='tsne' is forwarded to prepare()
# (presumably selecting t-SNE for the 2-D topic layout — TODO confirm).
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
# Bare expression on the cell's last line so the notebook displays the panel.
panel

  """
