# Part 3: Advanced Text Processing - LDA and BERTopic Topic Modeling

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Load SpaCy model
# The "en_core_web_sm" pipeline has to be installed in the environment before
# this cell runs. The resulting `nlp` object is the global that
# preprocess_text() calls to tokenise and lemmatise each speech.
nlp = spacy.load("en_core_web_sm")
print("SpaCy model loaded successfully")

In [None]:
# Read the State of the Union corpus from disk. Later cells in this notebook
# index the 'Text', 'President' and 'Year' columns of this frame.
sou = pd.read_csv("data/SOTU.csv")
print(f"Total speeches: {sou.shape[0]}")
print(f"Columns: {list(sou.columns)}")
# Preview the first rows as the cell's rich output
sou.head()

In [None]:
# Define preprocessing function
def preprocess_text(text):
    """Turn one raw speech into a list of lower-cased lemmas.

    The text is run through the global SpaCy pipeline ``nlp``. A token is
    dropped when it is a stop word, punctuation or whitespace, or when its
    lemma is three characters or shorter.

    Args:
        text: The raw document text to process.

    Returns:
        A list of lower-cased lemma strings, in document order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Discard tokens that carry no topical content
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        lemma = tok.lemma_
        # Very short lemmas (3 characters or fewer) are dropped as well
        if len(lemma) > 3:
            kept_lemmas.append(lemma.lower())
    return kept_lemmas

# Lemmatise every speech with preprocess_text. This is the slow step of the
# notebook (about 5 minutes), because each full speech goes through SpaCy.
print("Processing documents... this will take about 5 minutes")
# Element-wise map over the 'Text' column; the result is a Series of token lists
processed_docs = sou['Text'].map(preprocess_text)
print(f"Processed {len(processed_docs)} documents")

## LDA Topic Modeling

In [None]:
# LDA using Gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel

# The model size and the report width are named once, so the training call and
# the printing loop below cannot drift apart.
NUM_LDA_TOPICS = 18  # Required by assignment
TOP_WORDS_PER_TOPIC = 10

# Build dictionary and corpus
dictionary = Dictionary(processed_docs)
# Keep only tokens found in at least 5 speeches and in at most 50% of them
dictionary.filter_extremes(no_below=5, no_above=0.5)
# Bag-of-words representation of each speech against the filtered vocabulary
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print(f"Dictionary size: {len(dictionary)}")
print(f"Corpus size: {len(corpus)}")

# Train the LDA model. random_state is fixed for reproducibility.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_LDA_TOPICS,
    random_state=42,
    passes=10,
    alpha='auto',
    eta='auto'
)

# Print the top words for each topic. The loop bound comes from the trained
# model itself, so it stays correct if NUM_LDA_TOPICS is changed.
print("\n--- LDA Topics ---")
for idx in range(lda_model.num_topics):
    print(f"\nTopic: {idx}")
    words = lda_model.show_topic(idx, topn=TOP_WORDS_PER_TOPIC)
    word_list = [word for word, prob in words]
    print(f"Words: {', '.join(word_list)}")

In [None]:
# Infer the LDA topic mixture of the first speech in the corpus
first_speech_bow = corpus[0]
topic_dist = lda_model[first_speech_bow]

first_row = sou.iloc[0]
print("Topic distribution for the first speech:")
print(f"Speech by: {first_row['President']} ({first_row['Year']})")
print("\nTopic probabilities:")

# Keep only topics above 1% probability, then list them from most to least likely
visible_topics = [(tid, p) for tid, p in topic_dist if p > 0.01]
visible_topics.sort(key=lambda pair: pair[1], reverse=True)
for topic_id, prob in visible_topics:
    print(f"Topic {topic_id}: {prob:.4f}")

In [None]:
# pyLDAvis Visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to inline notebook rendering
pyLDAvis.enable_notebook()

# Build the interactive view from the trained model. sort_topics=False is passed
# so that pyLDAvis does not reorder the topics.
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
# Last expression of the cell: displays the visualization
vis

## BERTopic Topic Modeling

In [None]:
# BERTopic Implementation
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# BERTopic is fitted on the raw speech text, not on the SpaCy-processed tokens
docs = sou['Text'].to_list()

# Train BERTopic model with min_topic_size=3
# NOTE(review): no random seed is supplied here, so topic assignments may differ
# between runs - confirm whether reproducibility matters for this analysis.
print("Training BERTopic model... this may take a few minutes")
topic_model = BERTopic(
    min_topic_size=3,
    verbose=False
)

topics, probs = topic_model.fit_transform(docs)

# Count real topics only. -1 is BERTopic's outlier label and is not always
# present, so it is removed explicitly instead of blindly subtracting one
# (which undercounts when no document was assigned to the outlier topic).
num_topics_found = len(set(topics) - {-1})
print(f"Number of topics found: {num_topics_found}")

# Remove stop words from topics: recompute the topic word representations with
# an English stop-word filter. Uses the already-fitted model and same docs.
vectorizer_model = CountVectorizer(stop_words="english")
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

In [None]:
# Display top 10 words for each BERTopic topic
topic_info = topic_model.get_topic_info()
print("BERTopic Topics (showing first 25):")
print(topic_info[['Topic', 'Count', 'Name']].head(25))

# Drop the outlier topic (-1) BEFORE taking the first ten rows. Filtering after
# head(10) would show only nine topics whenever the outlier row is present.
real_topic_ids = topic_info.loc[topic_info['Topic'] != -1, 'Topic'].head(10)

print("\n\nDetailed view of top 10 topics:")
for topic_num in real_topic_ids:
    # get_topic yields (word, score) pairs; only the words are printed
    words = topic_model.get_topic(topic_num)
    word_list = [word for word, score in words[:10]]
    print(f"\nTopic {topic_num}: {', '.join(word_list)}")

In [None]:
# Report which BERTopic topic the first speech was assigned to
first_speech_topic = topics[0]
first_row = sou.iloc[0]

print(f"BERTopic assignment for first speech:")
print(f"Speech by: {first_row['President']} ({first_row['Year']})")
print(f"Assigned to Topic: {first_speech_topic}")

# Topic -1 is skipped below; for any other topic, show its five leading words
is_outlier = first_speech_topic == -1
if not is_outlier:
    topic_words = topic_model.get_topic(first_speech_topic)
    leading_words = [word for word, score in topic_words[:5]]
    print(f"Topic words: {leading_words}")

In [None]:
# Visualize BERTopic topics
# The object returned by visualize_topics() is the cell's last expression, so
# the notebook renders it as this cell's output.
topic_model.visualize_topics()

In [None]:
# Make sure the 'outputs' folder is present (no error if it already exists)
import os

os.makedirs('outputs', exist_ok=True)

# Closing checklist, printed one message per line
summary_lines = (
    "Part 3 Complete!",
    "All requirements satisfied:",
    "✓ LDA with 18 topics",
    "✓ pyLDAvis visualization",
    "✓ BERTopic with min_topic_size=3",
    "✓ All topic distributions shown",
)
print(*summary_lines, sep="\n")