# Part 4: Choose your own adventure! 


(7 Points; Optional for Extra Credit)

This section is open-ended and is your chance to explore any advanced analysis. Please perform any additional analysis you find interesting! Suggested analyses (only do one max):

I have chosen to conduct a semantic similarity analysis.

First I will investigate which two presidents have the most similar speeches.

In [3]:
import spacy
# Fetch the small English pipeline only when it is not installed yet, so that
# re-running the notebook does not download the 12.8 MB wheel again and
# trigger another "restart to reload dependencies" prompt.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.2 MB/s[0m  [33m0:00:00[0mm0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
#import packages
from tqdm import tqdm
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from gensim.corpora.dictionary import Dictionary

# Read the State of the Union corpus from disk
sotu = pd.read_csv(filepath_or_buffer='data/SOTU.csv')

# Load the small English spaCy pipeline used for the rest of this section
nlp = spacy.load("en_core_web_sm")

In [5]:
# Preview the raw frame (pandas abbreviates the middle rows when it is displayed)
sotu

Unnamed: 0,President,Year,Text,Word Count
0,Joseph R. Biden,2024.0,"\n[Before speaking, the President presented hi...",8003
1,Joseph R. Biden,2023.0,\nThe President. Mr. Speaker——\n[At this point...,8978
2,Joseph R. Biden,2022.0,"\nThe President. Thank you all very, very much...",7539
3,Joseph R. Biden,2021.0,\nThe President. Thank you. Thank you. Thank y...,7734
4,Donald J. Trump,2020.0,\nThe President. Thank you very much. Thank yo...,6169
...,...,...,...,...
241,George Washington,1791.0,\nFellow-Citizens of the Senate and House of R...,2264
242,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069
243,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069
244,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069


In [6]:
# preprocess text to clean for lemmas
def preprocess_text(text, min_lemma_len=4):
    """Lemmatise a speech and keep only its informative tokens.

    Parameters
    ----------
    text : str
        Raw speech text. A non-string value (for example a missing ``NaN``
        cell) yields an empty list instead of raising inside the pipeline.
    min_lemma_len : int, default 4
        Shortest lemma, in characters, that is kept. The default reproduces
        the previously hard-coded ``len(token.lemma_) > 3`` filter.

    Returns
    -------
    list of str
        Lower-cased lemmas of every token that is not a stop word,
        punctuation or whitespace and whose lemma is long enough.
    """
    # Guard against missing / non-text cells so a column-wide apply cannot crash
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    return [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and len(token.lemma_) >= min_lemma_len
    ]

In [36]:
# Lemmatise every speech; the token lists are stored in a new column so the
# president and year of each speech stay attached to its tokens
sotu['processed_text'] = sotu['Text'].map(preprocess_text)

In [12]:
# Build the list of presidents (in order of first appearance in the data)
presidents = sotu['President'].unique()

# Join the lemmas of all of a president's speeches into one long string.
# combined_speeches maps each president's name to that combined string.
combined_speeches = {
    p: " ".join([" ".join(doc) for doc in sotu[sotu['President'] == p]['processed_text']])
    for p in presidents
}

# Parallel lists: names[k] is the president whose combined text is texts[k]
names = list(combined_speeches.keys())
texts = list(combined_speeches.values())

# Turn texts into spaCy docs
docs = [nlp(text) for text in texts]

# NOTE(review): a warning is echoed under this cell for the similarity call.
# Presumably the `en_core_web_sm` pipeline ships no static word vectors, in
# which case Doc.similarity may not give useful judgements (every pair ends up
# close to 1). Consider a pipeline with vectors such as `en_core_web_md` —
# confirm against the spaCy documentation.

# Create the president-by-president similarity matrix
n = len(docs)
similarity_matrix = np.zeros((n, n))  # initialize

# Similarity is symmetric, so score each unordered pair once and mirror it
# instead of evaluating both (i, j) and (j, i).
for i in range(n):
    for j in range(i, n):
        score = docs[i].similarity(docs[j])
        similarity_matrix[i, j] = score
        similarity_matrix[j, i] = score


  similarity_matrix[i, j] = docs[i].similarity(docs[j])


I chose to zero out the diagonal because it holds the comparison of each president's speeches with themselves. Those self-comparisons should be disregarded when looking for the most similar pair of presidents.

In [15]:
# Overwrite the self-comparison entries in place, then display the matrix
similarity_matrix[np.diag_indices_from(similarity_matrix)] = 0
similarity_matrix

array([[0.        , 0.99640006, 0.99574405, ..., 0.9859761 , 0.99067962,
        0.98598027],
       [0.99640006, 0.        , 0.99003059, ..., 0.98297691, 0.99239999,
        0.98647565],
       [0.99574405, 0.99003059, 0.        , ..., 0.99360591, 0.99029672,
        0.99022341],
       ...,
       [0.9859761 , 0.98297691, 0.99360591, ..., 0.        , 0.99268615,
        0.9971866 ],
       [0.99067962, 0.99239999, 0.99029672, ..., 0.99268615, 0.        ,
        0.99722457],
       [0.98598027, 0.98647565, 0.99022341, ..., 0.9971866 , 0.99722457,
        0.        ]], shape=(43, 43))

In [33]:
# Find the highest score in the matrix and every cell that attains it
max_score = np.max(similarity_matrix)
rows, cols = np.nonzero(similarity_matrix == max_score)
# Pair up the coordinates, then cast the NumPy integers to plain ints so the
# printed tuples are easy to read
pairs = list(zip(rows, cols))
pairs_int = [tuple(int(idx) for idx in pair) for pair in pairs]
print(pairs_int)
print(max_score)

[(22, 23), (23, 22)]
0.9995580911636353


In [35]:
# The first entry of pairs_int identifies the most similar pair of presidents
i = pairs_int[0][0]
j = pairs_int[0][1]

print(f"The most similar speeches were delivered by {names[i]} and {names[j]} with a similarity score of {max_score}")

The most similar speeches were delivered by Grover Cleveland and Benjamin Harrison with a similarity score of 0.9995580911636353


In [37]:
# Display the frame again; it now includes the processed_text column of lemma lists
sotu

Unnamed: 0,President,Year,Text,Word Count,processed_text
0,Joseph R. Biden,2024.0,"\n[Before speaking, the President presented hi...",8003,"[speak, president, present, prepared, remark, ..."
1,Joseph R. Biden,2023.0,\nThe President. Mr. Speaker——\n[At this point...,8978,"[president, speaker, point, president, turn, f..."
2,Joseph R. Biden,2022.0,"\nThe President. Thank you all very, very much...",7539,"[president, thank, thank, thank, madam, speake..."
3,Joseph R. Biden,2021.0,\nThe President. Thank you. Thank you. Thank y...,7734,"[president, thank, thank, thank, good, mitch, ..."
4,Donald J. Trump,2020.0,\nThe President. Thank you very much. Thank yo...,6169,"[president, thank, thank, thank, madam, speake..."
...,...,...,...,...,...
241,George Washington,1791.0,\nFellow-Citizens of the Senate and House of R...,2264,"[fellow, citizen, senate, house, representativ..."
242,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069,"[fellow, citizen, senate, house, representativ..."
243,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069,"[fellow, citizen, senate, house, representativ..."
244,George Washington,1790.0,\nFellow-Citizens of the Senate and House of R...,1069,"[fellow, citizen, senate, house, representativ..."


Now I am interested in how similar Trump's first speech (2017) is to his more recent speech in 2020.

In [56]:
# Restrict the corpus to the addresses delivered by Trump
is_trump = sotu['President'] == "Donald J. Trump"
trump_speeches = sotu[is_trump]

# Year is stored as a float, so each calendar year is selected with a
# half-open range [year, year + 1)
years = trump_speeches['Year']

# 2017 Trump speech
trump_2017 = trump_speeches[years.ge(2017.0) & years.lt(2018.0)]

# 2020 Trump speech
trump_2020 = trump_speeches[years.ge(2020.0) & years.lt(2021.0)]

# Collapse each year's token lists into a single space-separated string
text_2017 = " ".join(" ".join(tokens) for tokens in trump_2017['processed_text'])
text_2020 = " ".join(" ".join(tokens) for tokens in trump_2020['processed_text'])

In [58]:
# Run the spaCy pipeline on each year's combined text (2017 first, then 2020)
nlp_2017, nlp_2020 = (nlp(year_text) for year_text in (text_2017, text_2020))

In [63]:
# Report the similarity between the 2017 and 2020 documents.
# NOTE(review): the visible cross-president scores in the matrix output earlier
# in this notebook are all roughly 0.98 or higher, so a value near 1 here may
# not be strong evidence on its own — confirm whether the loaded pipeline
# ships word vectors before relying on this interpretation.
print(f"The similarity score between the State of the Union address given by Trump in 2017 and 2020 is {nlp_2017.similarity(nlp_2020)}, showing he has maintained his talking points throughout his first term and continues to address the same issues")

The similarity score between the State of the Union address given by Trump in 2017 and 2020 is 0.997931182384491, showing he has maintained his talking points throughout his first term and continues to address the same issues


  print(f"The similarity score between the State of the Union address given by Trump in 2017 and 2020 is {nlp_2017.similarity(nlp_2020)}, showing he has maintained his talking points throughout his first term and continues to address the same issues")
