In [20]:
import pandas as pd
import spacy
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns


ModuleNotFoundError: No module named 'spacy'

In [25]:
import matplotlib.pyplot as plt


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [27]:
# Load the SOTU data
sou = pd.read_csv("data/SOTU.csv")
print(f"Total speeches: {len(sou)}")

# Subset for speeches from 2000 onwards (as required by Part 2)
sou_2000 = sou[sou['Year'] >= 2000].copy()
print(f"Speeches from 2000 onwards: {len(sou_2000)}")
sou_2000.head()

Total speeches: 246
Speeches from 2000 onwards: 25


Unnamed: 0,President,Year,Text,Word Count
0,Joseph R. Biden,2024.0,"\n[Before speaking, the President presented hi...",8003
1,Joseph R. Biden,2023.0,\nThe President. Mr. Speaker——\n[At this point...,8978
2,Joseph R. Biden,2022.0,"\nThe President. Thank you all very, very much...",7539
3,Joseph R. Biden,2021.0,\nThe President. Thank you. Thank you. Thank y...,7734
4,Donald J. Trump,2020.0,\nThe President. Thank you very much. Thank yo...,6169


In [11]:
def preprocess_text(text): 
    doc = nlp(text) 
    return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and not token.is_space and len(token.lemma_) > 3]

In [31]:
# Process all texts - note this takes ~ 5 minutes to run
processed_docs = sou['Text'].apply(preprocess_text)

In [32]:
print(processed_docs[:5])



0    [speak, president, present, prepared, remark, ...
1    [president, speaker, point, president, turn, f...
2    [president, thank, thank, thank, madam, speake...
3    [president, thank, thank, thank, good, mitch, ...
4    [president, thank, thank, thank, madam, speake...
Name: Text, dtype: object


In [None]:
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)  
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=5,       
                     random_state=42,    
                     passes=10,         
                     alpha='auto',       
                     eta='auto') 

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}\n")

In [30]:
docs = sou['Text'].to_list()

vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(docs)

n_topics = 5 
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

words = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"\nTopic {topic_idx}:")
    top_words = topic.argsort()[-10:][::-1]
    for i in top_words:
        print(" ", words[i])



Topic 0:
  isthmus
  panama
  colombia
  revolution
  colombian
  canal
  1903
  granada
  transit
  riot

Topic 1:
  states
  government
  united
  congress
  public
  country
  great
  war
  state
  people

Topic 2:
  government
  states
  year
  000
  united
  congress
  american
  law
  department
  service

Topic 3:
  government
  war
  national
  congress
  people
  great
  000
  law
  nation
  public

Topic 4:
  america
  people
  world
  new
  year
  american
  years
  congress
  nation
  americans


In [31]:
distribution = lda.transform(X[0])

print("Topic distribution for the first speech:\n")
for i, prob in enumerate(distribution[0]):
    print(f"Topic {i}: {prob:.4f}")

sorted_idx = np.argsort(distribution)
sorted_probs = distribution[sorted_idx]
sorted_labels = [topic_labels[i] for i in sorted_idx]

Topic distribution for the first speech:

Topic 0: 0.0000
Topic 1: 0.0000
Topic 2: 0.0000
Topic 3: 0.2081
Topic 4: 0.7917


In [34]:
plt.figure(figsize=(12, 8))

bars = plt.barh(range(len(sorted_probs)), 
                sorted_probs, 
                color='lightgray', 
                edgecolor='black')

plt.yticks(range(len(sorted_labels)), sorted_labels, fontsize=10)
plt.xlabel("Probability", fontsize=12)
plt.title("Topic Probability Distribution", fontsize=20, fontweight='bold')

plt.tight_layout()
plt.show()

NameError: name 'sorted_probs' is not defined

<Figure size 1200x800 with 0 Axes>