# Simple LDA

A minimal working example that showcases topic modeling concepts with Latent Dirichlet Allocation (LDA) in gensim.

In [24]:
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

# (optional) some predefined samples for testing
from gensim.test.utils import common_texts
from gensim.test.utils import common_dictionary, common_corpus

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt

from pprint import pprint  # data pretty printer - provides a capability to “pretty-print” arbitrary Python data structures

In [25]:
# Build a corpus from a list of tokenized texts

# A. Load and define your text: each document is written as one
#    whitespace-separated string and split into its list of word tokens
mytext = [
    document.split()
    for document in (
        'human interface computer',
        'survey user computer system response time',
        'eps user interface system',
        'system human system eps',
        'user response time',
        'trees',
        'graph trees',
        'graph minors trees',
        'graph minors survey',
    )
]

# B. Build the Dictionary (token <-> integer id mapping) from the tokenized texts
dictionary = Dictionary(documents=mytext)

# C. Term Document Frequency: turn every text into its bag-of-words vector
corpus = list(map(dictionary.doc2bow, mytext))

In [26]:
# (for debugging) Uncomment the lines below to use gensim's predefined test corpus instead of the one built above

#mytext=common_texts
#dictionary=common_dictionary
#corpus=common_corpus

In [27]:
# Human-readable view of the first document: (token, frequency) pairs
# instead of (token id, frequency) pairs
[
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('computer', 1), ('human', 1), ('interface', 1)]]

In [28]:
# Train the model on the corpus.
# LDA training is stochastic: without a fixed seed every run can produce
# different topics and weights. random_state pins the seed so that
# Restart Kernel -> Run All reproduces the same model.
lda = LdaModel(
    corpus=corpus,        # bag-of-words vectors, one per document
    id2word=dictionary,   # maps token ids back to words for readable topics
    num_topics=5,         # number of latent topics to extract
    passes=100,           # number of training passes over the corpus
    random_state=42,      # fixed seed for reproducible topics
)

In [29]:
# Show the top keywords (with their weights) of every topic
topic_keywords = lda.print_topics()
pprint(topic_keywords)

[(0,
  '0.307*"trees" + 0.307*"graph" + 0.211*"minors" + 0.020*"survey" + '
  '0.019*"interface" + 0.019*"human" + 0.019*"user" + 0.019*"system" + '
  '0.019*"response" + 0.019*"time"'),
 (1,
  '0.223*"user" + 0.222*"response" + 0.222*"time" + 0.037*"survey" + '
  '0.037*"interface" + 0.037*"human" + 0.037*"trees" + 0.037*"system" + '
  '0.037*"graph" + 0.037*"minors"'),
 (2,
  '0.083*"survey" + 0.083*"interface" + 0.083*"human" + 0.083*"user" + '
  '0.083*"trees" + 0.083*"system" + 0.083*"graph" + 0.083*"minors" + '
  '0.083*"response" + 0.083*"time"'),
 (3,
  '0.308*"system" + 0.211*"eps" + 0.115*"human" + 0.115*"interface" + '
  '0.115*"user" + 0.019*"survey" + 0.019*"trees" + 0.019*"response" + '
  '0.019*"time" + 0.019*"graph"'),
 (4,
  '0.178*"computer" + 0.177*"survey" + 0.097*"interface" + 0.097*"human" + '
  '0.097*"response" + 0.097*"time" + 0.097*"user" + 0.097*"system" + '
  '0.016*"minors" + 0.016*"graph"')]


In [23]:
# Visualize the topics
pyLDAvis.enable_notebook()
# prepare() takes the trained model, the bag-of-words corpus and the gensim
# Dictionary. The Dictionary in this notebook is named `dictionary`;
# `id2word` is never defined and raises a NameError on a fresh kernel.
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
# show() serves the visualization over a local HTTP server and blocks the
# kernel until interrupted (Ctrl-C); or you can simply run 'vis' for in-notebook view
pyLDAvis.show(vis, local=False)

Serving to http://127.0.0.1:8894/    [Ctrl-C to exit]


127.0.0.1 - - [25/Mar/2024 22:06:23] "GET / HTTP/1.1" 200 -



stopping Server...
