# Part 3 - Themes Over Time and Party Affiliation

- What are the most common themes in inaugural addresses?  
- How have these themes changed over time?
- Does a certain party have more polarizing speech?

In [23]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter

In [2]:
# Read the addresses, drop the first CSV column by position (presumably a
# saved index -- verify against the file), and parse `date` into datetimes.
inaugural = (
    pd.read_csv('data/inaugural_address.csv')
    .iloc[:, 1:]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
inaugural.head()

Unnamed: 0,president_name,president_number,date,text
0,George Washington,1,1789-04-30 00:00:00+00:00,\nFellow-Citizens of the Senate and of the Hou...
1,George Washington,1,1793-03-04 00:00:00+00:00,\nFellow Citizens:\nI AM again called upon by ...
2,John Adams,2,1797-03-04 00:00:00+00:00,"\nWHEN it was first perceived, in early times,..."
3,Thomas Jefferson,3,1801-03-04 00:00:00+00:00,\nFriends and Fellow-Citizens:\nCALLED upon to...
4,Thomas Jefferson,3,1805-03-04 00:00:00+00:00,"\nPROCEEDING, fellow-citizens, to that qualifi..."


### What are the most common themes in inaugural addresses?

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models
from spacy import displacy


In [None]:
# If did not get env from Makefile, run:
!python -m spacy download en_core_web_sm

In [4]:
# Shared spaCy pipeline used by preprocess_text.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Turn raw text into a list of lower-cased lemmas.

    Stop words, punctuation and whitespace tokens are skipped, and only
    lemmas longer than three characters are kept.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        lemma = tok.lemma_
        if len(lemma) > 3:
            kept.append(lemma.lower())
    return kept

In [None]:
# Tokenise every address (original note: this cell takes approx 25 seconds).
processed_docs = inaugural["text"].apply(preprocess_text)

# Vocabulary over all addresses; drop terms found in fewer than 5 documents
# or in more than half of them.
dictionary = Dictionary(documents=processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words representation of each address.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [6]:
# Five-topic LDA over the full corpus; fixed seed so topics are reproducible.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42,
)

In [7]:
# One "Theme / Words" pair per topic, each followed by a blank line.
print("Inaugural Addresses LDA Themes:")
for theme_id, theme_words in lda_model.print_topics(num_topics=-1):
    print(f"Theme: {theme_id}", f"Words: {theme_words}", "", sep="\n")

Inaugural Addresses LDA Themes:
Theme: 0
Words: 0.005*"method" + 0.004*"establish" + 0.004*"race" + 0.004*"community" + 0.004*"rest" + 0.004*"commercial" + 0.004*"officer" + 0.004*"local" + 0.004*"relation" + 0.004*"election"

Theme: 1
Words: 0.010*"business" + 0.006*"federal" + 0.006*"increase" + 0.005*"ought" + 0.005*"legislation" + 0.004*"tariff" + 0.004*"revenue" + 0.004*"proper" + 0.004*"race" + 0.004*"trade"

Theme: 2
Words: 0.009*"thank" + 0.008*"today" + 0.007*"like" + 0.006*"task" + 0.006*"face" + 0.005*"child" + 0.005*"civilization" + 0.004*"industrial" + 0.004*"help" + 0.004*"wish"

Theme: 3
Words: 0.013*"today" + 0.012*"americans" + 0.009*"century" + 0.007*"democracy" + 0.007*"child" + 0.006*"generation" + 0.006*"earth" + 0.006*"promise" + 0.006*"challenge" + 0.006*"help"

Theme: 4
Words: 0.005*"opinion" + 0.005*"object" + 0.004*"general" + 0.004*"exist" + 0.004*"revenue" + 0.004*"limit" + 0.004*"regard" + 0.003*"circumstance" + 0.003*"effect" + 0.003*"experience"



In [8]:
# Render pyLDAvis output inline, then show the full-corpus topic model.
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)

### How have the major themes changed over time?

In [9]:
# Split the addresses into calendar-hundreds buckets by year of delivery.
# Every boundary is inclusive on its lower edge; the original `2000 < year`
# silently left out the year 2000, so no bucket covered it.
address_year = inaugural['date'].dt.year

inaugural_1700s = inaugural[address_year < 1800]
inaugural_1800s = inaugural[address_year.between(1800, 1899)]  # between() is inclusive on both ends
inaugural_1900s = inaugural[address_year.between(1900, 1999)]
inaugural_2000s = inaugural[address_year >= 2000]

In [10]:
def fit_period_lda(period_df, num_topics=5, passes=10, random_state=42):
    """
    Fit an LDA model on the addresses contained in one period slice.

    The tokens are looked up in `processed_docs` by index label instead of
    running `preprocess_text` again, so each speech goes through spaCy only
    once. The shared `dictionary` built from the full corpus is reused, so
    word ids are comparable across periods.

    Parameters
    ----------
    period_df : DataFrame
        Row subset of `inaugural`; its index labels select the documents.
    num_topics, passes, random_state
        Passed straight through to `LdaModel`.

    Returns
    -------
    (docs, bow_corpus, model)
        The token lists for the period, their bag-of-words form, and the
        fitted `LdaModel`.
    """
    docs = processed_docs.loc[period_df.index]
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    model = LdaModel(
        corpus=bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        passes=passes,
    )
    return docs, bow_corpus, model


# Same names as before so the printing / visualization cells keep working.
processed_docs_1700s, corpus_1700s, lda_model_1700s = fit_period_lda(inaugural_1700s)
processed_docs_1800s, corpus_1800s, lda_model_1800s = fit_period_lda(inaugural_1800s)
processed_docs_1900s, corpus_1900s, lda_model_1900s = fit_period_lda(inaugural_1900s)
processed_docs_2000s, corpus_2000s, lda_model_2000s = fit_period_lda(inaugural_2000s)

In [11]:
# Topics of the model fitted on the pre-1800 addresses.
print("Inaugural Addresses 18th Century LDA Themes:")
for theme_id, theme_words in lda_model_1700s.print_topics(num_topics=-1):
    print(f"Theme: {theme_id}", f"Words: {theme_words}", "", sep="\n")

Inaugural Addresses 18th Century LDA Themes:
Theme: 0
Words: 0.003*"voice" + 0.003*"execute" + 0.003*"official" + 0.003*"entertain" + 0.003*"instance" + 0.003*"constitutional" + 0.003*"function" + 0.003*"occasion" + 0.003*"endeavor" + 0.003*"presence"

Theme: 1
Words: 0.001*"knowledge" + 0.001*"general" + 0.001*"legislature" + 0.001*"choice" + 0.001*"establish" + 0.001*"resolution" + 0.001*"virtuous" + 0.001*"wish" + 0.001*"feel" + 0.001*"idea"

Theme: 2
Words: 0.001*"happiness" + 0.001*"ought" + 0.001*"establish" + 0.001*"blessing" + 0.001*"humble" + 0.001*"nature" + 0.001*"influence" + 0.001*"particular" + 0.001*"decide" + 0.001*"expedient"

Theme: 3
Words: 0.007*"ought" + 0.005*"happiness" + 0.005*"establish" + 0.005*"circumstance" + 0.005*"nature" + 0.004*"private" + 0.004*"influence" + 0.004*"blessing" + 0.004*"execute" + 0.004*"particular"

Theme: 4
Words: 0.008*"legislature" + 0.006*"virtuous" + 0.006*"general" + 0.006*"choice" + 0.006*"knowledge" + 0.004*"happiness" + 0.004*"co

In [12]:
# Topics of the model fitted on the 1800-1899 addresses.
print("Inaugural Addresses 19th Century LDA Themes:")
for theme_id, theme_words in lda_model_1800s.print_topics(num_topics=-1):
    print(f"Theme: {theme_id}", f"Words: {theme_words}", "", sep="\n")

Inaugural Addresses 19th Century LDA Themes:
Theme: 0
Words: 0.006*"protection" + 0.005*"object" + 0.005*"revenue" + 0.005*"extend" + 0.004*"importance" + 0.004*"blessing" + 0.004*"territory" + 0.004*"regard" + 0.004*"opinion" + 0.004*"happy"

Theme: 1
Words: 0.008*"revenue" + 0.007*"business" + 0.006*"legislation" + 0.005*"countryman" + 0.005*"constant" + 0.004*"patriotic" + 0.004*"enterprise" + 0.004*"partisan" + 0.004*"federal" + 0.004*"benefit"

Theme: 2
Words: 0.006*"officer" + 0.006*"revenue" + 0.005*"community" + 0.005*"method" + 0.005*"increase" + 0.005*"expect" + 0.004*"occasion" + 0.004*"debt" + 0.004*"reason" + 0.004*"opinion"

Theme: 3
Words: 0.006*"opinion" + 0.005*"experience" + 0.005*"happiness" + 0.004*"feel" + 0.004*"circumstance" + 0.004*"constitutional" + 0.004*"countryman" + 0.003*"position" + 0.003*"general" + 0.003*"result"

Theme: 4
Words: 0.005*"object" + 0.005*"general" + 0.005*"effect" + 0.005*"exist" + 0.005*"case" + 0.005*"opinion" + 0.005*"grant" + 0.004*"p

In [13]:
# Topics of the model fitted on the 1900-1999 addresses.
print("Inaugural Addresses 20th Century LDA Themes:")
for theme_id, theme_words in lda_model_1900s.print_topics(num_topics=-1):
    print(f"Theme: {theme_id}", f"Words: {theme_words}", "", sep="\n")

Inaugural Addresses 20th Century LDA Themes:
Theme: 0
Words: 0.007*"island" + 0.007*"rest" + 0.005*"inhabitant" + 0.005*"million" + 0.005*"faithful" + 0.004*"problem" + 0.004*"relation" + 0.004*"treaty" + 0.004*"solve" + 0.004*"establish"

Theme: 1
Words: 0.010*"business" + 0.007*"increase" + 0.006*"ought" + 0.006*"race" + 0.005*"proper" + 0.005*"amendment" + 0.005*"federal" + 0.005*"international" + 0.005*"tariff" + 0.005*"trade"

Theme: 2
Words: 0.008*"task" + 0.006*"civilization" + 0.006*"opportunity" + 0.006*"face" + 0.005*"thought" + 0.005*"industrial" + 0.005*"leadership" + 0.005*"problem" + 0.005*"wish" + 0.005*"today"

Theme: 3
Words: 0.014*"today" + 0.012*"century" + 0.012*"americans" + 0.008*"help" + 0.007*"earth" + 0.007*"democracy" + 0.006*"friend" + 0.006*"child" + 0.006*"begin" + 0.006*"challenge"

Theme: 4
Words: 0.001*"today" + 0.001*"federal" + 0.001*"economic" + 0.001*"business" + 0.001*"opportunity" + 0.001*"increase" + 0.001*"self" + 0.001*"help" + 0.001*"problem" +

In [14]:
# Topics of the model fitted on the most recent slice of addresses.
print("Inaugural Addresses 21st Century LDA Themes:")
for theme_id, theme_words in lda_model_2000s.print_topics(num_topics=-1):
    print(f"Theme: {theme_id}", f"Words: {theme_words}", "", sep="\n")

Inaugural Addresses 21st Century LDA Themes:
Theme: 0
Words: 0.001*"thank" + 0.001*"today" + 0.001*"generation" + 0.001*"child" + 0.001*"americans" + 0.001*"like" + 0.001*"woman" + 0.001*"build" + 0.001*"hard" + 0.001*"begin"

Theme: 1
Words: 0.006*"permanent" + 0.006*"excuse" + 0.006*"feel" + 0.005*"tyranny" + 0.005*"deep" + 0.005*"goal" + 0.004*"fulfill" + 0.004*"tradition" + 0.004*"oppression" + 0.004*"idealism"

Theme: 2
Words: 0.018*"thank" + 0.014*"today" + 0.014*"americans" + 0.009*"child" + 0.009*"democracy" + 0.009*"like" + 0.007*"challenge" + 0.007*"promise" + 0.007*"dream" + 0.007*"unity"

Theme: 3
Words: 0.011*"today" + 0.010*"generation" + 0.009*"americans" + 0.007*"woman" + 0.007*"child" + 0.006*"word" + 0.006*"face" + 0.006*"journey" + 0.005*"ideal" + 0.005*"hard"

Theme: 4
Words: 0.001*"today" + 0.001*"child" + 0.001*"generation" + 0.001*"thank" + 0.001*"americans" + 0.001*"like" + 0.001*"word" + 0.001*"forward" + 0.001*"build" + 0.001*"dream"



In [15]:
# 18th century topics visualization
pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_1700s,
    corpus=corpus_1700s,
    dictionary=dictionary,
)

In [16]:
# 19th century topics visualization
pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_1800s,
    corpus=corpus_1800s,
    dictionary=dictionary,
)

In [17]:
# 20th century topics visualization
pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_1900s,
    corpus=corpus_1900s,
    dictionary=dictionary,
)

In [18]:
# 21st century topics visualization
pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_2000s,
    corpus=corpus_2000s,
    dictionary=dictionary,
)

### Does a certain party have more polarizing speech?


In [19]:
# Attach one party label per address and move `party` next to the name.
# The labels are matched to rows purely by position, so they must stay in
# the same order as the rows of `inaugural`. The literal string 'None'
# (not the Python None object) marks addresses with no party.
# NOTE(review): the labels cannot be checked against `president_name` from
# this cell alone -- confirm the order against the CSV before relying on it.
inaugural = inaugural.assign(
    party=np.array([
        'None', 'None',
        'Federalist',
        'Democratic-Republican', 'Democratic-Republican', 'Democratic-Republican',
        'Democratic-Republican', 'Democratic-Republican', 'Democratic-Republican',
        'National Republican',
        'Democratic', 'Democratic', 'Democratic',
        'Whig',
        'Democratic',
        'Whig',
        'Democratic', 'Democratic',
        'Republican', 'Republican', 'Republican', 'Republican', 'Republican', 'Republican',
        'Democratic',
        'Republican',
        'Democratic',
        'Republican', 'Republican', 'Republican', 'Republican',
        'Democratic', 'Democratic',
        'Republican', 'Republican', 'Republican',
        'Democratic', 'Democratic', 'Democratic', 'Democratic',
        'Republican',
        'Democratic',
        'Republican', 'Republican', 'Republican', 'Republican',
        'Democratic', 'Democratic',
        'Republican', 'Republican',
        'Democratic', 'Democratic',
        'Republican',
        'Democratic',
        'Republican',
    ])
)[['president_name', 'party', 'president_number', 'date', 'text']]
inaugural.head()

Unnamed: 0,president_name,party,president_number,date,text
0,George Washington,,1,1789-04-30 00:00:00+00:00,\nFellow-Citizens of the Senate and of the Hou...
1,George Washington,,1,1793-03-04 00:00:00+00:00,\nFellow Citizens:\nI AM again called upon by ...
2,John Adams,Federalist,2,1797-03-04 00:00:00+00:00,"\nWHEN it was first perceived, in early times,..."
3,Thomas Jefferson,Democratic-Republican,3,1801-03-04 00:00:00+00:00,\nFriends and Fellow-Citizens:\nCALLED upon to...
4,Thomas Jefferson,Democratic-Republican,3,1805-03-04 00:00:00+00:00,"\nPROCEEDING, fellow-citizens, to that qualifi..."


In [20]:
from afinn import Afinn
import warnings
# Silence only the pandas "groupby.apply operated on the grouping columns"
# deprecation notice (presumably the "groupby warning" the original blanket
# filter was aimed at). A bare filterwarnings("ignore") would hide every
# warning raised anywhere in the kernel for the rest of the session.
warnings.filterwarnings(
    "ignore",
    message=r".*operated on the grouping columns",
)

# Sentiment scorer created with the afinn package's default settings.
afinn = Afinn()

def calculate_sentiment_metrics(tokens, scorer=None):
    """
    Summarise the sentiment of one tokenised document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to score, one at a time.
    scorer : callable, optional
        Function mapping a token to a numeric score (0 means neutral).
        Defaults to the module-level `afinn.score`, so existing callers are
        unaffected; pass another callable to use a different lexicon.

    Returns
    -------
    dict with keys
        'sentiment_score'    - sum of all token scores
        'positive_words'     - number of tokens scoring above 0
        'negative_words'     - number of tokens scoring below 0
        'sentiment_variance' - population variance (np.var) of the non-zero
                               scores, or 0.0 when there are none
    """
    # Resolve the default lazily so `afinn` is only needed when it is used.
    score_token = afinn.score if scorer is None else scorer
    scores = [score_token(token) for token in tokens]

    # Neutral tokens would pull the variance towards zero, so leave them out.
    polar_scores = [s for s in scores if s != 0]

    return {
        'sentiment_score': sum(scores),
        'positive_words': sum(1 for s in scores if s > 0),
        'negative_words': sum(1 for s in scores if s < 0),
        'sentiment_variance': float(np.var(polar_scores)) if polar_scores else 0.0,
    }

# Score every address. Note that `processed_docs` holds the tokens prepared
# for topic modelling (stop words removed, lemmas of three characters or
# fewer dropped), so short words never reach the scorer, and the word counts
# below are raw totals that are not normalised by speech length.
sentiment_metrics = processed_docs.apply(calculate_sentiment_metrics)

# Keep the documents' index labels on the metrics frame: concat(axis=1)
# aligns on labels, so a fresh 0..n-1 index would only line up by accident.
sentiment_df = pd.DataFrame(sentiment_metrics.tolist(), index=processed_docs.index)

# Combine with original data
inaugural_with_sentiment = pd.concat([inaugural, sentiment_df], axis=1)

def calculate_polarization(group):
    """
    Collapse the per-speech sentiment metrics of one group into one row.

    `group` is a DataFrame with the columns 'sentiment_variance',
    'positive_words', 'negative_words' and 'sentiment_score'. Returns a
    Series with the mean variance, summed positive / negative word counts,
    mean sentiment score and the number of rows in the group.
    """
    summary = {}
    summary['mean_sentiment_variance'] = group['sentiment_variance'].mean()
    summary['total_positive_words'] = group['positive_words'].sum()
    summary['total_negative_words'] = group['negative_words'].sum()
    summary['net_sentiment'] = group['sentiment_score'].mean()
    summary['num_speeches'] = len(group)
    return pd.Series(summary)

# Aggregate the per-speech metrics by party. Selecting the metric columns
# after groupby keeps the grouping column out of the frames handed to
# calculate_polarization; applying to the whole frame (grouping column
# included) is the behaviour pandas has deprecated.
metric_columns = ['sentiment_score', 'positive_words', 'negative_words', 'sentiment_variance']
polarization_by_party = (
    inaugural_with_sentiment
    .groupby('party')[metric_columns]
    .apply(calculate_polarization)
)

# Per-speech averages so parties with different numbers of addresses can be
# compared (still not normalised by speech length).
polarization_by_party['avg_positive_per_speech'] = (
    polarization_by_party['total_positive_words'] / polarization_by_party['num_speeches']
)
polarization_by_party['avg_negative_per_speech'] = (
    polarization_by_party['total_negative_words'] / polarization_by_party['num_speeches']
)

  self._word_pattern = re.compile('\w+', flags=re.UNICODE)


In [21]:
# Parties ranked by mean sentiment score per address, highest first.
polarization_by_party.sort_values(by='net_sentiment', ascending=False).loc[
    :, ['total_positive_words', 'total_negative_words', 'net_sentiment', 'num_speeches']
]

Unnamed: 0_level_0,total_positive_words,total_negative_words,net_sentiment,num_speeches
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Whig,478.0,180.0,245.0,2.0
National Republican,161.0,45.0,227.0,1.0
Federalist,147.0,40.0,203.0,1.0
Democratic-Republican,887.0,288.0,194.0,6.0
Republican,3718.0,1400.0,189.086957,23.0
Democratic,2638.0,1016.0,151.05,20.0
,92.0,23.0,69.0,2.0


In [22]:
# Parties ranked by average number of positive words per address.
polarization_by_party.sort_values(by='avg_positive_per_speech', ascending=False).loc[
    :, ['avg_positive_per_speech', 'avg_negative_per_speech', 'mean_sentiment_variance']
]

Unnamed: 0_level_0,avg_positive_per_speech,avg_negative_per_speech,mean_sentiment_variance
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Whig,239.0,90.0,2.588256
Republican,161.652174,60.869565,3.243731
National Republican,161.0,45.0,2.751744
Democratic-Republican,147.833333,48.0,3.147726
Federalist,147.0,40.0,3.329578
Democratic,131.9,50.8,3.15804
,46.0,11.5,2.705604
