# Part 3: Advanced Text Processing - LDA and BERTopic Topic Modeling

### LDA

In [1]:
# Install the small English spaCy pipeline only when it is not already present,
# so that re-running the notebook does not re-download the wheel (and reprint
# the installer log) on every execution.
import spacy

if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m78.5 MB/s[0m  [33m0:00:00[0mm0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [12]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.makedirs('outputs', exist_ok=True)

from spacy import displacy
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis.gensim_models
from gensim.corpora.dictionary import Dictionary

# Load the small English spaCy pipeline; it is used below for tokenization
# and lemmatization.
nlp = spacy.load('en_core_web_sm')

# Read the speech dataset; the 'Text' column holds the text that is processed below.
sou = pd.read_csv('data/SOTU.csv')

# Apply the dark seaborn style sheet to figures drawn after this point.
plt.style.use('seaborn-v0_8-dark')

def preprocess_text(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return its lemmas.

    Tokens flagged as stop words, punctuation or whitespace are skipped, as are
    tokens whose lemma is three characters long or shorter. The surviving
    lemmas are lower-cased and returned in document order.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    list of str
        Lower-cased lemmas that passed every filter.
    """
    lemmas = []
    for token in nlp(text):
        # Discard tokens that carry no topical content.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_
        # The length check is applied to the lemma, not the surface form.
        if len(lemma) > 3:
            lemmas.append(lemma.lower())
    return lemmas

In [5]:
# Process all texts. preprocess_text only reads lemmas and the lexical
# stop-word / punctuation / whitespace flags, so the dependency parser and the
# named-entity recognizer are switched off while this cell runs; on long
# documents they are typically among the slowest pipeline components.
# select_pipes is a context manager: both components are re-enabled on exit,
# so `nlp` is unchanged for any later cell that needs parses or entities.
with nlp.select_pipes(disable=['parser', 'ner']):
    processed_docs = sou['Text'].apply(preprocess_text)

In [7]:
# processed_docs holds one list of tokens per speech. Build the token <-> id
# mapping from it, then prune the vocabulary: drop tokens that appear in fewer
# than 5 speeches or in more than 50% of all speeches.
dic = Dictionary(processed_docs)
dic.filter_extremes(no_above=0.5, no_below=5)

# Convert every speech to its bag-of-words representation using the pruned dictionary.
corpus = list(map(dic.doc2bow, processed_docs))

In [8]:
# Fit an 18-topic LDA model on the bag-of-words corpus. random_state is fixed
# so repeated runs start from the same seed; passes=10 sets the number of
# passes made over the corpus during training.
lda_model = LdaModel(
    corpus=corpus, id2word=dic,
    num_topics=18, passes=10,
    random_state=42,
)

In [9]:
# List every topic (num_topics=-1) together with its 10 highest-weighted words.
print("\n--- LDA Topics ---")
for topic_id, weighted_terms in lda_model.print_topics(num_topics=-1, num_words=10):
    print(f"Topic: {topic_id} \nWords: {weighted_terms}\n")


--- LDA Topics ---
Topic: 0 
Words: 0.004*"cent" + 0.004*"june" + 0.004*"gold" + 0.003*"island" + 0.003*"silver" + 0.003*"bond" + 0.003*"method" + 0.003*"convention" + 0.003*"indian" + 0.003*"note"

Topic: 1 
Words: 0.008*"depression" + 0.007*"program" + 0.007*"recovery" + 0.006*"budget" + 0.006*"unemployment" + 0.006*"loan" + 0.006*"activity" + 0.006*"farm" + 0.005*"emergency" + 0.005*"cent"

Topic: 2 
Words: 0.008*"dictator" + 0.005*"expression" + 0.004*"british" + 0.004*"1914" + 0.003*"impressive" + 0.003*"actual" + 0.003*"revolution" + 0.003*"schedule" + 0.003*"continent" + 0.003*"partisanship"

Topic: 3 
Words: 0.008*"forest" + 0.007*"corporation" + 0.005*"judge" + 0.005*"wrong" + 0.005*"interstate" + 0.004*"employee" + 0.003*"bureau" + 0.003*"body" + 0.003*"mountain" + 0.003*"island"

Topic: 4 
Words: 0.017*"program" + 0.014*"soviet" + 0.009*"1980" + 0.009*"u.s." + 0.008*"area" + 0.007*"major" + 0.006*"goal" + 0.006*"commitment" + 0.006*"challenge" + 0.006*"nuclear"

Topic: 5 
W

In [10]:
# Show the topic mixture inferred for the first speech as (topic_id, probability)
# pairs. Topics below the model's minimum probability are omitted, so the list
# can be much shorter than the number of topics.
lda_model.get_document_topics(bow=corpus[0])

[(11, np.float32(0.99942815))]

In [13]:
# Build the interactive pyLDAvis view of the fitted LDA model.
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dic)

# Write a standalone HTML copy to disk first ...
pyLDAvis.save_html(lda_display, 'outputs/lda_visualization.html')

# ... then render inline. display() returns an HTML object rather than drawing
# by itself, so it has to be the last expression of the cell; when it was
# followed by save_html (which returns None) nothing was shown in the notebook.
pyLDAvis.display(lda_display)

### BERTopic