# Analysis of Presidential Speeches 

### Group Members
<font color="blue">
1. Diego Alonso Delgado Caceres dd2941  <br>
2. Harry Peter Kershaw hpk2114 <br>
3. Martin Feuerstein Mendik maf2250 <br>
4. Subey Dengur sd3231 <br>
</font>

### Speech Data
This project draws heavily on the work by the Miller Center, an affiliate of the University of Virginia. The Miller Center's website contains transcripts of all presidential speeches:
[millercenter.org](https://millercenter.org/the-presidency/presidential-speeches)


#### Packages Used:
1. Beautiful Soup<br>
2. Time<br>
3. Regex<br>
4. Selenium<br>
5. NLTK<br>





In [1]:
# Import all the packages at once 
import time
import re
import urllib3 
import pandas as pd
import string

from datetime import datetime
from bs4 import BeautifulSoup #using beautifulsoup to parse html
from selenium import webdriver #using selenium to deal with the millercenter.org infinite scroll

In [4]:
# Use Selenium to overcome the challenge of scraping a webpage with an infinite scroll feature.
# First: we will set the url and then we will go to Millercenter.org.
# Second: we will select (click on) the presidents we are interested in.
# Third: we will scroll down on the website, allow the website to load (if it can),
# and we'll stop trying to scroll when we hit the bottom of the page.
# Fourth: we'll parse the page with beautiful soup.
# The browser is always closed afterwards, even if one of the steps fails,
# so no Chrome/chromedriver process is left running.

# First: we will set the url and then we will go to Millercenter.org.
url = "http://millercenter.org/the-presidency/presidential-speeches"
browser = webdriver.Chrome()
try:
    browser.get(url)

    # Second: we will select (click on) the presidents we are interested in.
    # The ids are the 1-based positions of the president checkboxes in the filter form.
    first_pres_id = 43
    last_pres_id = 44
    # range() excludes its stop value, so add 1 to include last_pres_id itself.
    for pres_id in range(first_pres_id, last_pres_id + 1):
        absolutepath_president = f'/html/body/div[2]/div/main/div[2]/div/div[2]/article/div/div[2]/div/div/div/form/div[3]/fieldset/div/div/div/div[{pres_id}]/label'
        # find_element("xpath", ...) works on Selenium 3 and 4; the older
        # find_element_by_xpath helper no longer exists in Selenium 4.3+.
        president1 = browser.find_element("xpath", absolutepath_president)
        president1.click()
        time.sleep(2)  # give the filtered result list time to reload

    # Third: we will scroll down on the website, allow the website to load (if it can),
    # and we'll stop trying to scroll when we hit the bottom of the page.
    scroll_pause_time = 2
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)  # Wait to load page
        new_height = browser.execute_script("return document.body.scrollHeight")
        # An unchanged page height after scrolling means nothing more was loaded.
        if new_height == last_height:
            break
        last_height = new_height

    # Fourth: we'll parse the page with beautiful soup.
    soup = BeautifulSoup(browser.page_source, 'lxml')
finally:
    browser.quit()

In [5]:
# Now that we have the html of the Miller Center, we can scrape the speech transcripts.
# First: From the html get the urls of the speeches.
# Second: Use Urllib3 and BeautifulSoup to parse the html data for each speech - Urllib3 is much faster than Selenium.
# Third: Some speeches aren't in the "transcript-inner" div - make sure we get them as well.

# First: From the html get the urls of the speeches.
# href=True already restricts the search to anchors that carry an href attribute.
speech_urls = [
    link['href']
    for link in soup.find_all('a', href=True)
    if "/presidential-speeches/" in link['href']
]

# Second: Use Urllib3 and BeautifulSoup to parse the html data for each speech.
http = urllib3.PoolManager()
raw_speeches = []      # "transcript-inner" matches per speech (may be empty)
raw_speeches_all = []  # stringified transcript markup, one entry per speech url
for speech_path in speech_urls:
    url = "http://millercenter.org" + speech_path
    response = http.request('GET', url)
    speech_soup = BeautifulSoup(response.data, 'lxml')
    transcript = speech_soup.find_all('div', {"class": "transcript-inner"})
    raw_speeches.append(transcript)

    # Third: Some speeches aren't in the "transcript-inner" div. Fall back to the
    # "view-transcript" div of the page we already downloaded instead of fetching it again.
    if not transcript:
        transcript = speech_soup.find_all('div', {"class": "view-transcript"})
    raw_speeches_all.append(str(transcript))


/the-presidency/presidential-speeches/march-19-2018-remarks-combating-opioid-crisis
/the-presidency/presidential-speeches/february-23-2018-remarks-conservative-political-action
/the-presidency/presidential-speeches/february-15-2018-statement-school-shooting-parkland-florida
/the-presidency/presidential-speeches/february-1-2018-remarks-house-and-senate-republican-member
/the-presidency/presidential-speeches/january-30-2018-state-union-address
/the-presidency/presidential-speeches/january-26-2018-address-world-economic-forum
/the-presidency/presidential-speeches/december-18-2017-remarks-national-security-strategy
/the-presidency/presidential-speeches/september-19-2017-address-united-nations-general-assembly
/the-presidency/presidential-speeches/july-24-2017-speech-boy-scout-jamboree
/the-presidency/presidential-speeches/june-29-2017-speech-unleashing-american-energy-event
/the-presidency/presidential-speeches/february-28-2017-address-joint-session-congress
/the-presidency/presidential-sp

In [6]:
# We have the speech data and have some cleaning up to do.

# Matches a single html tag. DOTALL lets "." span newlines, so a tag whose
# attributes wrap onto several lines is matched as well.
_TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)


def tag_cleanr(text):
    """Return *text* with every html tag removed.

    Anything from a "<" up to the next ">" is treated as a tag and dropped;
    the text between tags is kept unchanged.

    Args:
        text: A string of html markup.

    Returns:
        The input string without its tags.
    """
    return _TAG_PATTERN.sub('', text)

# Strip the html tags, the 'Transcript' heading text and all line breaks from each speech.
# NOTE: every occurrence of the word 'Transcript' is removed, and line breaks are
# dropped without inserting a space.
cleaned_speeches = [
    tag_cleanr(raw_speech).replace('Transcript', '').replace('\r\n', '').replace('\n', '')
    for raw_speech in raw_speeches_all
]


# Get all the speech dates into a list.
# Each speech url contains its date as "<month name>-<day>-<year>",
# e.g. ".../march-19-2018-remarks-combating-opioid-crisis".
pattern = r'(\w+-\d+-\d{4})'
speech_dates = []

for speech_url in speech_urls:
    match = re.search(pattern, speech_url)
    if match is None:
        # Fail loudly: silently skipping would misalign dates and speeches below.
        raise ValueError(f"no date found in speech url: {speech_url!r}")
    speech_dates.append(datetime.strptime(match.group(1), '%B-%d-%Y').date())


# One row per speech: the date taken from the url and the cleaned transcript.
speeches = pd.DataFrame({'Date': speech_dates, 'Speeches': cleaned_speeches})

print(speeches)

          Date                                           Speeches
0   2018-03-19  [THE PRESIDENT: Thank you to our First Lady, M...
1   2018-02-23  [THE PRESIDENT: Thank you very much. Thank you...
2   2018-02-15  [THE PRESIDENT: My fellow Americans, today I s...
3   2018-02-01  [THE PRESIDENT: Thank you, Paul and Mitch, for...
4   2018-01-30  [Mr. Speaker, Mr. Vice President, Members of C...
5   2018-01-26  [PRESIDENT TRUMP: Thank you, Klaus, very much....
6   2017-12-18  [THE PRESIDENT: Thank you very much. Thank you...
7   2017-09-19  [PRESIDENT TRUMP: Mr. Secretary General, Mr. P...
8   2017-07-24  [TRUMP: Thank you, everybody. Thank you very m...
9   2017-06-29  [THE PRESIDENT: Thank you, everybody. Thank yo...
10  2017-02-28  [Mr. Speaker, Mr. Vice President, Members of C...
11  2017-01-20  [Chief Justice Roberts, President Carter, Pres...
12  2016-05-15  [Hello Rutgers!  (Applause.)  R-U rah-rah!  (A...
13  2016-03-22  [Thank you.  (Applause.)  Muchas gracias.  Tha...
14  2016-0

In [9]:
# Persist the speeches DataFrame to "speeches_1.pkl" in the working directory.
speeches.to_pickle("speeches_1.pkl")


Add a function to map from a date to a President

In [None]:
from sympy import Piecewise
import numpy
from sympy.utilities.lambdify import lambdify
from sympy.abc import x
from datetime import date
import re
# (first day in office, name, party), sorted by start date. A term runs until
# the next entry's start date; the last entry is open-ended.
_PRESIDENTIAL_TERMS = (
    (date(1901, 9, 14), 'Theodore Roosevelt', 'R'),
    (date(1909, 3, 4), 'William Howard Taft', 'R'),
    (date(1913, 3, 4), 'Woodrow Wilson', 'D'),
    (date(1921, 3, 4), 'Warren G Harding', 'R'),
    (date(1923, 8, 3), 'Calvin Coolidge', 'R'),
    (date(1929, 3, 4), 'Herbert Hoover', 'R'),
    (date(1933, 3, 4), 'Franklin D Roosevelt', 'D'),
    (date(1945, 4, 12), 'Harry S Truman', 'D'),
    (date(1953, 1, 20), 'Dwight D Eisenhower', 'R'),
    (date(1961, 1, 20), 'John F Kennedy', 'D'),
    (date(1963, 11, 22), 'Lyndon B Johnson', 'D'),
    (date(1969, 1, 20), 'Richard M Nixon', 'R'),
    (date(1974, 8, 9), 'Gerald R Ford', 'R'),
    (date(1977, 1, 20), 'Jimmy Carter', 'D'),
    (date(1981, 1, 20), 'Ronald Reagan', 'R'),
    (date(1989, 1, 20), 'George H W Bush', 'R'),
    (date(1993, 1, 20), 'William J Clinton', 'D'),
    (date(2001, 1, 20), 'George W Bush', 'R'),
    (date(2009, 1, 20), 'Barack Obama', 'D'),
    (date(2017, 1, 20), 'Donald J Trump', 'R'),
)
# Start dates as proleptic Gregorian ordinals, so both date and datetime
# arguments can be looked up through their toordinal() value.
_TERM_START_ORDINALS = [start.toordinal() for start, _, _ in _PRESIDENTIAL_TERMS]


def pres_from_date(date_):
    """Return the president in office on *date_* and that president's party.

    A hand-over day counts towards the incoming president (terms are
    start-inclusive, end-exclusive).

    Args:
        date_: A ``datetime.date`` (or ``datetime.datetime``); only its
            ``toordinal()`` value is used.

    Returns:
        A two-item list ``[name, party]``, e.g. ``['Barack Obama', 'D']``,
        where party is ``"R"`` or ``"D"``.

    Raises:
        ValueError: If *date_* is earlier than the first term in the table
            (1901-09-14).

    NOTE: the table ends with the term starting 2017-01-20, so every later
    date maps to that entry.
    """
    from bisect import bisect_right

    # Index of the last term whose start date is on or before date_.
    term_index = bisect_right(_TERM_START_ORDINALS, date_.toordinal()) - 1
    if term_index < 0:
        raise ValueError(
            f"{date_} is before {_PRESIDENTIAL_TERMS[0][0]}, the earliest date covered"
        )
    _, name, party = _PRESIDENTIAL_TERMS[term_index]
    return [name, party]

In [None]:
## set up data frame
from datetime import datetime, date, timedelta
import pandas as pd

# Dow Jones history: rows with a zero Dow_Change are discarded, and the
# 'Close' column is not needed.
df1 = pd.read_csv('Dow_Jones_data_since_1900.csv')
df1 = df1.loc[df1.Dow_Change != 0].drop(columns='Close')

# Nasdaq and S&P histories, also without their 'Close' columns.
df2 = pd.read_csv('Nasdaq_since_1971.csv').drop(columns='Close')
df3 = pd.read_csv('S&P_data_since_1950.csv').drop(columns='Close')

# Left-join the other two indices onto the Dow rows (merge uses the shared columns).
fin_data = df1.merge(df2, how="left").merge(df3, how="left")

# Turn the 'month/day/year' strings into datetime.date objects.
dates = [datetime.strptime(raw_date, '%m/%d/%Y').date() for raw_date in fin_data.Date]
fin_data.Date = dates


def add_speeches_to_fin_data(df, fin_data):
    """Attach each speech to the market row of the day after it was given.

    Every speech is labelled with its president and party (via
    ``pres_from_date``), its date is moved forward by one day, and the result
    is outer-merged with *fin_data* on their shared columns. A speech whose
    shifted date has no market data (``Dow_Change`` is missing) is carried
    over to the next row in date order; rows without market data are then
    dropped.

    Limitation: speeches are carried over one row at a time, so if the next
    row already holds another speech, that speech is overwritten. A speech
    that lands on the very last row has no following row and is dropped.

    Args:
        df: DataFrame with a ``Date`` column of ``datetime.date`` values and a
            ``Speeches`` column. It is not modified.
        fin_data: DataFrame with a ``Date`` column of ``datetime.date`` values
            and a ``Dow_Change`` column.

    Returns:
        A DataFrame with one row per *fin_data* row plus ``Speeches``,
        ``President`` and ``Party`` columns (missing where no speech applies).
    """
    # Work on a copy so the caller's frame keeps its original dates; shifting
    # in place would move the dates again on every call.
    speeches = df.copy()
    labels = [pres_from_date(day) for day in speeches.Date]
    speeches['President'] = [name for name, _ in labels]
    speeches['Party'] = [party for _, party in labels]
    speeches['Date'] = [day + timedelta(days=1) for day in speeches.Date]

    final = fin_data.merge(speeches, how="outer")
    # Reset the index after sorting so that row labels equal row positions,
    # which the "next row" lookup below relies on.
    final = final.sort_values(by=['Date']).reset_index(drop=True)

    # Positions of speech rows that have no market data for their date.
    no_market_rows = [
        position
        for position, is_missing in enumerate(final.Dow_Change.isna())
        if is_missing
    ]
    print(no_market_rows)

    carried_columns = ['Date', 'Speeches', 'President', 'Party']
    last_position = len(final) - 1
    for position in no_market_rows:
        if position == last_position:
            # No later market row exists to receive this speech.
            continue
        for column in carried_columns:
            final.at[position + 1, column] = final.at[position, column]

    final = final[pd.notna(final.Dow_Change)]
    return final

In [None]:
from nltk.tokenize import sent_tokenize
# Split `text` into sentences and show how many there are.
# NOTE(review): `text` is not assigned anywhere above this cell in the file (the only
# assignment is in the last cell) - presumably a speech transcript is meant; confirm
# and define it before running this cell.
sent_tokenize_list = sent_tokenize(text)
len(sent_tokenize_list)
# sent_tokenize_list

In [None]:
import nltk
from statistics import mean

# Fetch the VADER lexicon before the analyzer is created.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Collect the 'compound' score of every sentence, printing all of the
# scores VADER reports for each sentence along the way.
score = []
for sentence in sent_tokenize_list:
    ss = sid.polarity_scores(sentence)
    score.append(ss['compound'])

    for k, value in ss.items():
        print('{0}: {1}, '.format(k, value), end='\n')

# Average compound score over all sentences.
mean(score)

In [None]:
# Quick check of the analyzer on a single, clearly negative sentence.
text = 'this is a shitty sentence'
ss = sid.polarity_scores(text)
for k, value in ss.items():
    print('{0}: {1}, '.format(k, value), end='\n')
# The compound score ranges from -1 to 1.