# Part 3: Advanced Text Processing - LDA and BERTopic Topic Modeling

In [1]:
from collections import Counter
from pathlib import Path

import pandas as pd
import spacy
from tqdm import tqdm
from spacy import displacy
from bertopic import BERTopic
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
import pyLDAvis
import pyLDAvis.gensim_models

  from .autonotebook import tqdm as notebook_tqdm


<h2>LDA</h2>

<h4>Train an LDA model with 18 topics</h4>

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once at module level; preprocess_text
# calls this shared `nlp` object for every document it processes.
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text, min_lemma_len=3):
    """Run ``text`` through the module-level ``nlp`` pipeline and return cleaned lemmas.

    Tokens flagged as stop words, punctuation, or whitespace are dropped, as
    are tokens whose lemma is not strictly longer than ``min_lemma_len``
    characters. The surviving lemmas are lower-cased and returned in
    document order.

    Parameters
    ----------
    text : str
        Raw text to process.
    min_lemma_len : int, optional
        Exclusive lower bound on lemma length: a lemma is kept only when
        ``len(lemma) > min_lemma_len``. The default of 3 reproduces the
        cutoff that used to be hard-coded.

    Returns
    -------
    list of str
        Lower-cased lemmas of the tokens that passed every filter.
    """
    doc = nlp(text)
    return [
        token.lemma_.lower()
        for token in doc
        # Equivalent to "not stop and not punct and not space" (De Morgan).
        if not (token.is_stop or token.is_punct or token.is_space)
        # Length is measured on the lemma, not on the surface form.
        and len(token.lemma_) > min_lemma_len
    ]

In [3]:
# Read the speeches table, then turn each entry of its "Text" column into a
# list of cleaned lemmas via preprocess_text.
sou = pd.read_csv("data/SOTU.csv")
processed_docs = sou["Text"].map(preprocess_text)

In [4]:
# Inspect the tokenized speeches; the value of this bare expression is rendered
# as the cell output.
processed_docs

0      [speak, president, present, prepared, remark, ...
1      [president, speaker, point, president, turn, f...
2      [president, thank, thank, thank, madam, speake...
3      [president, thank, thank, thank, good, mitch, ...
4      [president, thank, thank, thank, madam, speake...
                             ...                        
241    [fellow, citizen, senate, house, representativ...
242    [fellow, citizen, senate, house, representativ...
243    [fellow, citizen, senate, house, representativ...
244    [fellow, citizen, senate, house, representativ...
245    [fellow, citizen, senate, house, representativ...
Name: Text, Length: 246, dtype: object

In [5]:
# Assign an integer id to every lemma seen across the tokenized speeches.
dictionary = Dictionary(processed_docs)
# Prune the vocabulary: drop terms found in fewer than 5 speeches (too rare)
# or in more than 50% of them (too common to separate topics).
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Convert each speech to its bag-of-words form using the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [6]:
# Fit an 18-topic LDA model on the bag-of-words corpus, making 10 passes over it.
# random_state is pinned so the run is seeded rather than left to chance.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=18,
    passes=10,
    random_state=42,
)

<h4>Output the top 10 words for each topic.</h4>

In [7]:
# List every topic (num_topics=-1) together with its 10 highest-weighted words.
print("\n--- LDA Topics ---")
for topic_id, topic_terms in lda_model.print_topics(num_topics=-1, num_words=10):
    print(f"Topic: {topic_id} \nWords: {topic_terms}\n")


--- LDA Topics ---
Topic: 0 
Words: 0.004*"cent" + 0.004*"june" + 0.004*"gold" + 0.003*"island" + 0.003*"silver" + 0.003*"bond" + 0.003*"method" + 0.003*"convention" + 0.003*"indian" + 0.003*"note"

Topic: 1 
Words: 0.008*"depression" + 0.007*"program" + 0.007*"recovery" + 0.006*"budget" + 0.006*"unemployment" + 0.006*"loan" + 0.006*"activity" + 0.006*"farm" + 0.005*"emergency" + 0.005*"cent"

Topic: 2 
Words: 0.008*"dictator" + 0.005*"expression" + 0.004*"british" + 0.004*"1914" + 0.003*"impressive" + 0.003*"actual" + 0.003*"revolution" + 0.003*"schedule" + 0.003*"continent" + 0.003*"partisanship"

Topic: 3 
Words: 0.008*"forest" + 0.007*"corporation" + 0.005*"judge" + 0.005*"wrong" + 0.005*"interstate" + 0.004*"employee" + 0.003*"bureau" + 0.003*"body" + 0.003*"mountain" + 0.003*"island"

Topic: 4 
Words: 0.017*"program" + 0.014*"soviet" + 0.009*"1980" + 0.009*"u.s." + 0.008*"area" + 0.007*"major" + 0.006*"goal" + 0.006*"commitment" + 0.006*"challenge" + 0.006*"nuclear"

Topic: 5 
W

<h4>Output the topic distribution for the first speech</h4>

In [8]:
# Infer the topic mixture of the first speech as (topic_id, probability) pairs.
# NOTE(review): gensim presumably omits topics below the model's
# minimum_probability, so the list may be shorter than 18 entries — confirm.
lda_model[corpus[0]]

[(11, np.float32(0.99942815))]

<h4>Make a visualization</h4>

In [9]:
# Build the interactive pyLDAvis view of the trained LDA model, export it as a
# standalone HTML file, and render it inline as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# Make sure the export folder exists; otherwise the write below fails with
# FileNotFoundError on a fresh checkout that has no outputs/ directory.
Path("outputs").mkdir(parents=True, exist_ok=True)
pyLDAvis.save_html(vis, "outputs/lda_visualization.html")
vis

<h2>BERTopic</h2>

<h4>Train a BERTopic model with a min_topic_size of 3</h4>

In [10]:
# BERTopic works on the raw speech strings, not on the spaCy-lemmatized tokens.
docs = sou["Text"].tolist()

# Fit with min_topic_size=3 so that small groups of speeches can form a topic.
# NOTE(review): no random seed is set here, so topic ids and sizes may differ
# between runs — confirm whether reproducibility is required.
topic_model = BERTopic(min_topic_size=3)
topics, probs = topic_model.fit_transform(docs)

# Recompute the topic keywords with an English stop-word filter so that
# function words do not dominate the topic representations.
vectorizer_model = CountVectorizer(stop_words="english")
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

<h4>Output the top 10 words for each topic.</h4>

In [11]:
# Summarize the fitted topics in one table. Per the rendered output it has one
# row per topic with Topic, Count, Name, Representation and Representative_Docs
# columns; Representation holds the keyword list for the topic.
# NOTE(review): Topic -1 is presumably BERTopic's outlier bucket — confirm.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,70,-1_government_states_congress_united,"[government, states, congress, united, year, p...",[\nFellow Citizens of the Senate and of the Ho...
1,0,21,0_states_government_united_public,"[states, government, united, public, congress,...",[\nFellow-Citizens of the Senate and House of ...
2,1,14,1_states_government_united_year,"[states, government, united, year, congress, l...",[\nTo the Senate and House of Representatives:...
3,2,13,2_america_american_people_americans,"[america, american, people, americans, tonight...","[\nThe President. Mr. Speaker, Mr. Vice Presid..."
4,3,13,3_government_united_states_department,"[government, united, states, department, congr...",[\nFellow-Citizens of the Senate and House of ...
5,4,12,4_world_new_american_america,"[world, new, american, america, nation, presid...","[\nMr. Speaker, Mr. President, my colleagues i..."
6,5,12,5_states_government_united_congress,"[states, government, united, congress, great, ...",[\nFellow-Citizens of the Senate and of the Ho...
7,6,11,6_government_law_great_work,"[government, law, great, work, public, busines...",[\nTo the Senate and House of Representatives:...
8,7,10,7_new_america_people_americans,"[new, america, people, americans, american, ma...","[\nMadam Speaker, Mr. Vice President, Members ..."
9,8,10,8_world_peace_nations_soviet,"[world, peace, nations, soviet, nation, econom...","[\nMr. President, Mr. Speaker, Members of the ..."


<h4>Output the topic distribution for the first speech</h4>

In [12]:
# Approximate a topic distribution for every speech, then plot only the first
# speech's row; topic_distr keeps the rows for all documents.
topic_distr, _ = topic_model.approximate_distribution(docs)
fig = topic_model.visualize_distribution(topic_distr[0])
# Make sure the export folder exists; otherwise the write below fails with
# FileNotFoundError on a fresh checkout that has no outputs/ directory.
Path("outputs").mkdir(parents=True, exist_ok=True)
fig.write_html("outputs/BERT_topic_distribution.html")
fig

<h4>Make a visualization of the topics</h4>

In [13]:
# Draw BERTopic's topic visualization, export it as a standalone HTML file,
# and render it inline as the cell output.
fig2 = topic_model.visualize_topics()
# Make sure the export folder exists; otherwise the write below fails with
# FileNotFoundError on a fresh checkout that has no outputs/ directory.
Path("outputs").mkdir(parents=True, exist_ok=True)
fig2.write_html("outputs/Intertopic_map.html")
fig2