# Topic Modeling

**Importing needed libraries**

In [20]:
import re

import pyLDAvis.gensim_models
from gensim.models.ldamulticore import LdaMulticore
import warnings

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

**Topic Modeling with the Alice in Wonderland Text**

- Prepare text: Load text file, split into documents, tokenize/lemmatize, remove stop words
- Create the term dictionary for the corpus
- Create a document term matrix (DTM)
- Set up the LDA model, decide on the number of topics
- Run and train the model
- Topics!

In [3]:
# Function for tokenizing a chunk of text
def tokenize(text):
    """Tokenize ``text`` and drop stop words.

    The text is passed through ``simple_preprocess``; every resulting token
    that appears in ``STOPWORDS`` is discarded.

    Returns:
        list: the remaining tokens, in the order ``simple_preprocess``
        produced them.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [7]:
# Load the whole book into a single string (download the file to run locally)
with open('data/alice_in_wonderland.txt', encoding="utf8") as file:
    text_str = file.read()

# Every line of the file becomes its own chunk of text
text = text_str.split('\n')

# Run each chunk through tokenize() to get one token list per chunk
text_tokens = list(map(tokenize, text))

# Peek at (up to) the first 10 tokens of the first chunk
text_tokens[0][:10]

['alice', 'adventures', 'wonderland']

In [9]:
# Build the term dictionary for the corpus:
# each unique term is given its own index
dictionary = corpora.Dictionary(text_tokens)

# Convert every tokenized chunk to its bag-of-words form with that
# dictionary; the resulting list is the Document Term Matrix
doc_term_matrix = list(map(dictionary.doc2bow, text_tokens))

# Show the first document's entries (at most 25 of them)
print(doc_term_matrix[0][:25])

[(0, 1), (1, 1), (2, 1)]


In [11]:
# Create the object for LDA model
lda = gensim.models.ldamodel.LdaModel

# Train LDA model on the document term matrix
# topics = 5
# random_state pins the model's random number generator so that re-running
# this cell reproduces the same topics instead of a different set each time.
ldamodel = lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50,
               random_state=42)

# Print out 3 of the 5 topics, each with its 5 highest-weighted words
print(ldamodel.print_topics(num_topics=3, num_words=5))

[(2, '0.031*"queen" + 0.015*"king" + 0.014*"know" + 0.014*"gryphon" + 0.011*"like"'), (0, '0.030*"alice" + 0.019*"think" + 0.017*"like" + 0.016*"works" + 0.011*"rabbit"'), (4, '0.112*"said" + 0.060*"alice" + 0.014*"turtle" + 0.013*"hatter" + 0.013*"mock"')]


In [13]:
# Ask the model for each topic's (word, weight) pairs directly instead of
# regex-parsing the formatted strings returned by print_topics().
# num_topics=-1 requests every topic; 10 words per topic keeps `words`
# the same size as before.
topic_terms = ldamodel.show_topics(num_topics=-1, num_words=10, formatted=False)
topic_ids = [topic_id for topic_id, terms in topic_terms]
words = [[word for word, weight in terms] for topic_id, terms in topic_terms]

# One space-separated summary line per topic, built from its top 5 words
topics = [' '.join(t[0:5]) for t in words]

# Label each summary with the topic id reported by the model. The loop
# variable is deliberately not named `id`, so the built-in id() stays usable
# in later cells.
for topic_id, t in zip(topic_ids, topics):
    print(f"------ Topic {topic_id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
alice think like works rabbit

------ Topic 1 ------
little alice voice went terms

------ Topic 2 ------
queen king know gryphon like

------ Topic 3 ------
gutenberg project tm work electronic

------ Topic 4 ------
said alice turtle hatter mock



**Interpreting Topics**

In [19]:
# Suppress DeprecationWarning messages
warnings.simplefilter("ignore", category=DeprecationWarning)

# Turn on pyLDAvis notebook mode so the visualization can be used in a notebook
pyLDAvis.enable_notebook()

In [24]:
# Function for tokenizing a chunk of text
# (same behavior as the tokenize() defined earlier in this notebook; it is
# repeated here so this cell can run on its own)
def tokenize(text):
    """Tokenize ``text`` and drop stop words.

    The text is passed through ``simple_preprocess``; every resulting token
    that appears in ``STOPWORDS`` is discarded.

    Returns:
        list: the remaining tokens, in the order ``simple_preprocess``
        produced them.
    """
    kept = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept.append(word)
    return kept

# This cell repeats the loading, preprocessing and training steps from the
# earlier cells so that the visualization below can be produced on its own.

# Read in the text (download to run locally)
with open('data/alice_in_wonderland.txt', 'r', encoding="utf8") as file:
    text_str = file.read()

# Split the string on the newline character
text = text_str.split('\n')

# Tokenize each chunk of text
text_tokens = [tokenize(chunk) for chunk in text]

# Create the term dictionary of our corpus
# every unique term is assigned an index
dictionary = corpora.Dictionary(text_tokens)

# Convert list of documents (corpus) into Document Term Matrix 
# using the dictionary we just created
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_tokens]

# Create the object for LDA model
lda = gensim.models.ldamodel.LdaModel

# Train LDA model on the document term matrix
# topics = 5
# random_state pins the model's random number generator, so the trained
# model (and any visualization built from it) is the same on every run.
ldamodel = lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50,
               random_state=42)

In [27]:
# Build the interactive pyLDAvis view of the trained topic model
# (a screenshot will be displayed below)
pyLDAvis.gensim_models.prepare(
    topic_model=ldamodel,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)

  default_term_info = default_term_info.sort_values(
