# Simple LDA

A minimum working example to showcase topic modeling concepts.

In [11]:
#!pip install pyLDAvis

In [13]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# (optional) some predefined samples for testing
from gensim.test.utils import common_texts
from gensim.test.utils import common_dictionary, common_corpus

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

from pprint import pprint  # data pretty printer - provides a capability to “pretty-print” arbitrary Python data structures

In [15]:
# Create a corpus from a list of texts

# A. Load and define your text
mytext=[['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees'],
        ['graph', 'minors', 'survey']]

# B. Create Dictionary
dictionary = Dictionary(mytext)

# C. Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in mytext]

In [18]:
# (for debugging) using predefined corpus

#mytext=common_texts
#dictionary=common_dictionary
#corpus=common_corpus

In [20]:
# Human readable format of corpus (term-frequency)
[[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('computer', 1), ('human', 1), ('interface', 1)]]

In [22]:
# Train the model on the corpus.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=100)

In [24]:
# Visualize keywords
pprint(lda.print_topics())

[(0,
  '0.223*"human" + 0.222*"interface" + 0.222*"computer" + 0.037*"graph" + '
  '0.037*"minors" + 0.037*"system" + 0.037*"trees" + 0.037*"eps" + '
  '0.037*"survey" + 0.037*"user"'),
 (1,
  '0.128*"minors" + 0.128*"eps" + 0.128*"survey" + 0.128*"graph" + '
  '0.128*"system" + 0.128*"user" + 0.128*"interface" + 0.021*"trees" + '
  '0.021*"human" + 0.021*"computer"'),
 (2,
  '0.258*"trees" + 0.178*"system" + 0.178*"graph" + 0.097*"eps" + '
  '0.097*"minors" + 0.097*"human" + 0.016*"survey" + 0.016*"user" + '
  '0.016*"interface" + 0.016*"computer"'),
 (3,
  '0.193*"user" + 0.193*"time" + 0.193*"response" + 0.105*"survey" + '
  '0.105*"system" + 0.105*"computer" + 0.018*"graph" + 0.018*"minors" + '
  '0.018*"trees" + 0.018*"eps"'),
 (4,
  '0.083*"graph" + 0.083*"minors" + 0.083*"trees" + 0.083*"system" + '
  '0.083*"eps" + 0.083*"survey" + 0.083*"human" + 0.083*"user" + '
  '0.083*"interface" + 0.083*"computer"')]


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.show(vis, local=False) # or you can simply run 'vis' for in-notebook view

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [19/Mar/2025 05:41:58] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [19/Mar/2025 05:41:58] code 404, message Not Found
127.0.0.1 - - [19/Mar/2025 05:41:58] "GET /favicon.ico HTTP/1.1" 404 -


In [None]:
# Save to an HTML file
pyLDAvis.save_html(vis, 'lda_topic_visualization.html')