## Statistics in Presidential Debates

In this notebook, I scrape data from the Presidential Debates from the Commission of Presidential Debates website: https://www.debates.org/voter-education/debate-transcripts/

I analyze the counts of specific values from each of the speeches and sentiment score each of the candidate's sentiment score using the VADAR NLP Sentiment analysis toolkit.

Using `requests` and `BeautifulSoup` to find all links/ URLs on the website and use the links found to get the text from each presidential debate.

This project was inspired by a project I did for my DataX class.



In [1]:
import requests
import nltk
from textblob import TextBlob
import numpy as np
import bs4 as bs
from collections import Counter
import re
import pandas as pd
source = requests.get("https://www.debates.org/voter-education/debate-transcripts/") 
soup = bs.BeautifulSoup(source.content, features='html.parser')

In [2]:
urllist = []
titlelist = []
for a in soup.find_all('a'):
        stringurl = "https://www.debates.org" + a.get('href')
        if stringurl not in urllist and 'debate-transcript' in a.get('href'):
            urllist.append(stringurl)
            titlelist.append(a.text)

titlelist

#deal with first val/last val
#will have to combine
#print(titlelist)

['Debate Transcripts',
 'October 19, 2016 Debate Transcript',
 'October 9, 2016 Debate Transcript',
 'October 4, 2016 Debate Transcript',
 'September 26, 2016 Debate Transcript',
 'October 22, 2012 Debate Transcript',
 'October 16, 2012 Debate Transcript',
 'October 11, 2012 Debate Transcript',
 'October 3, 2012 Debate Transcript',
 'September 26, 2008 Debate Transcript',
 'October 2, 2008 Debate Transcript',
 'October 7, 2008 Debate Transcript',
 'October 15, 2008 Debate Transcript',
 'October 13, 2004 Debate Transcript',
 'October 8, 2004 Debate Transcript',
 'October 5, 2004 Transcript',
 'September 30, 2004 Debate Transcript',
 'October 17, 2000 Debate Transcript',
 'October 11, 2000 Debate Transcript',
 'October 5, 2000 Debate Transcript',
 'October 3, 2000 Transcript',
 'October 16, 1996 Debate Transcript',
 'October 9, 1996 Debate Transcript',
 'October 6, 1996 Debate Transcript',
 'October 19, 1992 Debate Transcript',
 'October 15, 1992 First Half Debate Transcript',
 'October 

In [3]:
#removing first and last speeches manually (scraping isn't perfect)
urllist = urllist[1:-1]
titlelist = titlelist[1:-1]


In [4]:
#checks whether two strings are similar to one another, to deal with typos in the labeling of the speaker's name/ inconsistencies in naming
from difflib import SequenceMatcher
def isSimilar(x,y):
    simthreshhold = 0.5
    if x in y:
        return True
    if y in x:
        return True
    return SequenceMatcher(None, x, y).ratio() > simthreshhold

In [5]:
speechlist = []
for i in urllist:
    source1 = requests.get(i) 
    soup1 = bs.BeautifulSoup(source1.content, features='html.parser')

    speech = ''
    speechdf = pd.DataFrame()

    for p in soup1.find_all('p'):
        currspeaker = ''
        splittext = p.text.split(':',1)
        
        #if there's colon
        if len(splittext) == 2:
            name = splittext[0]
            speechtext =splittext[1]
            #if there's new speaker
            if (name.isupper()):
                currspeaker = name
                #if already in list of speakers
                if any(isSimilar(name,col) for col in speechdf.columns):
                    for cols in speechdf.columns:
                        if isSimilar(name, cols):
                        
                            currspeaker = cols
                    #set currspeaker to similar column
                    speechdf.loc[len(speechdf.index), currspeaker] = speechtext
                else: #if not in list of speakers
                    #create new column for new speaker
                    speechdf[currspeaker] = np.NaN
                    speechdf.loc[len(speechdf.index), currspeaker] = speechtext

   
    #continue paragraph
        elif len(currspeaker) > 1:
            speechdf.loc[len(speechdf.index), currspeaker] = p.text

    for col in speechdf.columns:
        if len(speechdf[col]) - speechdf[col].isna().sum() < 5:
            speechdf = speechdf.drop(col, axis = 1)     
    speechlist.append(speechdf)

    
    

In [6]:
#speechlist contains a list of dataframes, with each DF representing a presidential debate
len(speechlist)

44

In [16]:
#for speech in speechlist:

for speech in speechlist:
    sentdf = pd.DataFrame()
    for col in speech.columns:
        speechstring = ''
        for index,row in speech.iterrows():
            if isinstance(row[col],str):
                speechstring += row[col]
        
        blob = TextBlob(speechstring)
        #angry:-1 to happy: +1
        polarity = blob.sentiment.polarity
        sentdf[col] = np.NaN
        sentdf.loc[0, col] = polarity
    display(sentdf)
        
    

    
        

Unnamed: 0,WALLACE,CLINTON,TRUMP
0,0.137721,0.147258,0.051951


Unnamed: 0,RADDATZ,COOPER,QUESTION,CLINTON,TRUMP
0,0.08171,0.200084,0.329449,0.118943,0.179464


Unnamed: 0,QUIJANO,KAINE,PENCE
0,0.139518,0.140694,0.111796


Unnamed: 0,HOLT,CLINTON,TRUMP
0,0.099839,0.140567,0.150481


Unnamed: 0,[*]SCHIEFFER,ROMNEY,OBAMA
0,0.183612,0.163351,0.185358


Unnamed: 0,QUESTION,ROMNEY,OBAMA
0,0.059999,0.157999,0.19446


Unnamed: 0,"MARTHA RADDATZ, MODERATOR[*]RADDATZ",BIDEN,RYAN
0,0.098877,0.156753,0.131269


Unnamed: 0,LEHRER,OBAMA,ROMNEY
0,0.121303,0.2093,0.16512


Unnamed: 0,[*] LEHRER,OBAMA,MCCAIN
0,0.107725,0.163514,0.063617


Unnamed: 0,[*] BROKAW,QUESTION,OBAMA,MCCAIN
0,0.124046,0.20191,0.169888,0.076794


Unnamed: 0,[*] SCHIEFFER,MCCAIN,OBAMA
0,0.116798,0.103819,0.151659


Unnamed: 0,SCHIEFFER,KERRY,BUSH
0,0.18102,0.111211,0.216786


Unnamed: 0,GIBSON,KERRY,BUSH
0,-0.032372,0.133919,0.168347


Unnamed: 0,SPEAKERS,IFILL,CHENEY
0,0.120839,0.029707,0.075054


Unnamed: 0,LEHRER,KERRY,BUSH
0,0.140383,0.150042,0.129134


Unnamed: 0,MODERATOR,GORE,MEMBER OF AUDIENCE,BUSH
0,0.146839,0.133959,0.12406,0.179683


Unnamed: 0,MODERATOR,BUSH,GORE
0,0.048598,0.156145,0.111565


Unnamed: 0,MODERATOR,LIEBERMAN,CHENEY
0,0.071224,0.159763,0.129839


Unnamed: 0,MODERATOR,GORE,BUSH
0,0.129383,0.120964,0.174886


Unnamed: 0,LEHRER,DOLE,CLINTON,MR. MILLIGAN,MS. SIEFERT
0,0.104149,0.148787,0.158749,-0.013889,0.264497


Unnamed: 0,LEHRER,KEMP,GORE
0,-0.009558,0.108954,0.134937


Unnamed: 0,LEHRER,CLINTON,DOLE
0,0.151273,0.129296,0.10272


Unnamed: 0,JIM LEHRER,GOVERNOR CLINTON,PRESIDENT BUSH,PEROT,SUSAN ROOK (CNN)
0,0.10344,0.109308,0.106713,0.047447,0.091667


Unnamed: 0,CAROLE SIMPSON,PRESIDENT GEORGE BUSH,AUDIENCE QUESTION,ROSS PEROT,GOVERNOR CLINTON
0,0.207636,0.142792,0.140893,0.093295,0.045079


Unnamed: 0,SIMPSON,AUDIENCE QUESTION,PEROT,BUSH,CLINTON
0,0.117883,0.173706,0.163148,0.136011,0.145336


Unnamed: 0,HAL BRUNO,SENATOR GORE,VICE PRESIDENT QUAYLE,ADMIRAL STOCKDALE
0,0.107672,0.114545,0.056491,0.15183


Unnamed: 0,LEHRER,PEROT,CLINTON,PRESIDENT BUSH
0,0.129315,0.127002,0.095978,0.121407


Unnamed: 0,LEHRER,COMPTON,BUSH,PEROT
0,0.12886,0.214516,0.092535,0.149987


Unnamed: 0,SHAW,DUKAKIS,BUSH,COMPTON,WARNER,MITCHELL
0,0.2,0.156935,0.146035,0.062484,0.155823,-0.015666


Unnamed: 0,WOODRUFF,QUAYLE,BENTSEN,MARGOLIS,BROKAW,HUME
0,0.139329,0.103693,0.164637,0.098054,0.023076,0.160119


Unnamed: 0,LEHRER,BUSH,DUKAKIS,MASHEK,JENNINGS
0,0.117583,0.135749,0.120103,0.079609,0.163084


Unnamed: 0,MR. NEWMAN,MS. GEYER,THE PRESIDENT
0,0.039647,0.047799,0.138806


Unnamed: 0,VANOCUR,MASHEK,BUSH,FERRARO,WHITE,QUARLES,BOYD
0,-0.022791,0.056085,0.125409,0.081097,0.105588,0.152646,0.052426


Unnamed: 0,MR. MONDALE,THE PRESIDENT,MS. SAWYER
0,0.088018,0.058585,0.030102


Unnamed: 0,MR. CARTER,BARBARA WALTERS
0,0.085359,0.11158


Unnamed: 0,"MR. MOYERS, HOST AND EXECUTIVE EDITOR, “BILL MOYERS’ JOURNAL,” PUBLIC BROADCASTING SYSTEM",REP. JOHN B. ANDERSON,GOV. RONALD REAGAN
0,0.083503,0.102216,0.152397


Unnamed: 0,MS. WALTERS,MR. KRAFT,MR. MAYNARD,MR. NELSON
0,0.215977,0.101286,0.137044,0.123739


Unnamed: 0,MS. FREDERICK,MR. FRANKEL
0,0.129261,0.10578


Unnamed: 0,MR. REYNOLDS,MR. NEWMAN
0,0.131177,0.090412


Unnamed: 0,MR. NIXON,MR. EDWARDS
0,0.109811,0.129084


Unnamed: 0,MR. KENNEDY,MR. SHADEL,MR. DRUMMOND
0,0.080955,0.073138,0.152258


Unnamed: 0,MR. NIVEN
0,0.117754


Unnamed: 0,MR. SMITH,MR. WARREN
0,0.118435,0.100435


In [12]:
#example of how sentiment analysis works
string = 'hello happy person'
blob = TextBlob(string)
print(blob.sentiment.polarity)


0.8
