# Web scraping debate transcripts
## Author: Oliver Gladfelter
### Date: Jan 2020

In [1]:
import pandas as pd
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import json
import os

# Function for transcript format 1
## June - Sept

In [2]:
def scrapeTranscriptFormat1(url):
    """Download a transcript page and split it into speaker/speech rows.

    Used in this file for the June - Sept debate pages. Every ``<p>``
    element on the page is treated as a candidate speaker turn written as
    ``"Speaker: speech"``; paragraphs whose first line contains no colon
    are skipped.

    Args:
        url: Address of the transcript page to download.

    Returns:
        A DataFrame with one row per recognised paragraph and two columns:
        ``name`` (text before the first colon) and ``speech`` (rest of that
        line, with surrounding spaces stripped).

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server stops responding within the timeout.
    """
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    paragraphs = soup.find_all("p")

    # Remove the first 'u' tag embedded within each paragraph tag, if any
    # (presumably the timestamp markup -- confirm against the page source).
    for paragraph in paragraphs:
        underlined = paragraph.find("u")
        if underlined is not None:
            underlined.decompose()

    bracketed = re.compile(r'\[.*?\]')
    # Split at the FIRST colon only, so a colon inside the speech itself
    # ("Here's the deal: ...") no longer discards the words before it.
    # '.' does not match a newline, so only the first line is considered.
    turn = re.compile(r'^(.*?):(.*)')

    # Retrieve speaker and speech from each paragraph tag
    speaker = []
    speech = []
    for paragraph in paragraphs:
        text = paragraph.text.replace('\xa0', '')  # remove non-breaking spaces
        text = bracketed.sub('', text)  # remove brackets and their contents
        match = turn.match(text)
        if match is None:
            # Not a "Speaker: speech" paragraph.
            continue
        # Append both values together so the two lists always stay aligned.
        speaker.append(match.group(1))
        speech.append(match.group(2).strip(" "))

    # Convert to dataframe
    return pd.DataFrame({'name': speaker, 'speech': speech})

# Function for transcript format 2
## Oct - Jan

In [2]:
def scrapeTranscriptFormat2(url):
    """Download a transcript page and split it into speaker/speech rows.

    Used in this file for the Oct - Jan debate pages and the Trump rally
    pages. Every ``<p>`` element on the page is treated as a candidate
    speaker turn written as ``"Speaker: speech"``; paragraphs whose first
    line contains no colon are skipped.

    Args:
        url: Address of the transcript page to download.

    Returns:
        A DataFrame with one row per recognised paragraph and two columns:
        ``name`` (text before the first colon) and ``speech`` (rest of that
        line, with surrounding spaces stripped).

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server stops responding within the timeout.
    """
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    paragraphs = soup.find_all("p")

    # Remove the first 'a' tag embedded within each paragraph tag, if any
    # (presumably the timestamp link -- confirm against the page source).
    for paragraph in paragraphs:
        link = paragraph.find("a")
        if link is not None:
            link.decompose()

    bracketed = re.compile(r'\[.*?\]')
    # Split at the FIRST colon only, so a colon inside the speech itself
    # ("Here's the deal: ...") no longer discards the words before it.
    # '.' does not match a newline, so only the first line is considered.
    turn = re.compile(r'^(.*?):(.*)')

    # Retrieve speaker and speech from each paragraph tag
    speaker = []
    speech = []
    for paragraph in paragraphs:
        # Remove empty parentheses (presumably what is left once the link
        # inside them has been removed).
        text = paragraph.text.replace('()', '')
        text = bracketed.sub('', text)  # remove brackets and their contents
        match = turn.match(text)
        if match is None:
            # Not a "Speaker: speech" paragraph.
            continue
        # Append both values together so the two lists always stay aligned.
        speaker.append(match.group(1))
        speech.append(match.group(2).strip(" "))

    # Convert to dataframe
    return pd.DataFrame({'name': speaker, 'speech': speech})

# Scrape 'em

In [None]:
# Scrape each debate transcript; the June - Sept pages go through the
# format-1 scraper, the Oct - Jan pages through the format-2 scraper.
june1 = scrapeTranscriptFormat1("https://www.rev.com/blog/transcript-from-first-night-of-democratic-debates")
june2 = scrapeTranscriptFormat1("https://www.rev.com/blog/transcript-from-night-2-of-the-2019-democratic-debates")
july1 = scrapeTranscriptFormat1("https://www.rev.com/blog/transcript-of-july-democratic-debate-night-1-full-transcript-july-30-2019")
july2 = scrapeTranscriptFormat1("https://www.rev.com/blog/transcript-of-july-democratic-debate-2nd-round-night-2-full-transcript-july-31-2019")
sept = scrapeTranscriptFormat1("https://www.rev.com/blog/democratic-debate-transcript-houston-september-12-2019")
october = scrapeTranscriptFormat2("https://www.rev.com/blog/october-democratic-debate-transcript-4th-debate-from-ohio")
nov = scrapeTranscriptFormat2("https://www.rev.com/blog/november-democratic-debate-transcript-atlanta-debate-transcript")
dec = scrapeTranscriptFormat2("https://www.rev.com/blog/december-democratic-debate-transcript-sixth-debate-from-los-angeles")
jan = scrapeTranscriptFormat2("https://www.rev.com/blog/transcripts/january-iowa-democratic-debate-transcript")

# Label every row with the month of the debate it came from, so the rows
# can still be told apart after the frames are combined.
june1['month'] = 'June'
june2['month'] = 'June'
july1['month'] = 'July'
july2['month'] = 'July'
sept['month'] = 'Sept'
october['month'] = 'Oct'
nov['month'] = 'Nov'
dec['month'] = 'Dec'
jan['month'] = 'Jan'

# Combine all debates into one table and save it under <cwd>/Documents.
df = pd.concat([june1, june2, july1, july2, sept, october, nov, dec, jan])
# os.path.join picks the separator of the host OS; the previous hard-coded
# "\\" only formed a real path on Windows.
df.to_csv(os.path.join(os.getcwd(), "Documents", "DemDebateTranscripts.csv"), index=False)

# Trump 2019-2020 Rallies Transcript Scrape

In [21]:
# Transcript pages for the Trump rallies, listed in the order rally1 .. rally15.
rallyUrls = [
    "https://www.rev.com/blog/transcripts/donald-trump-maga-event-speech-transcript-north-carolina-rally",
    "https://www.rev.com/blog/transcripts/donald-trump-ohio-rally-speech-transcript-full-transcript-of-august-1-2019-rally-in-cincinnati",
    "https://www.rev.com/blog/transcripts/donald-trump-new-hampshire-rally-transcript-august-15-2019",
    "https://www.rev.com/blog/transcripts/donald-trump-north-carolina-rally-transcript-in-fayetteville-nc-september-9-2019",
    "https://www.rev.com/blog/transcripts/donald-trump-new-mexico-rally-transcript-full-speech-transcript",
    "https://www.rev.com/blog/transcripts/donald-trump-minnesota-rally-speech-transcript-minneapolis-mn-rally-october-10-2019",
    "https://www.rev.com/blog/transcripts/donald-trump-dallas-rally-speech-transcript-october-17-2019",
    "https://www.rev.com/blog/transcripts/donald-trump-mississippi-rally-speech-transcript-2019-rally-in-tupelo-mississippi",
    "https://www.rev.com/blog/transcripts/donald-trump-kentucky-rally-speech-transcript-lexington-kentucky-rally",
    "https://www.rev.com/blog/transcripts/donald-trump-hershey-pennsylvania-rally-transcript-december-10-2019",
    "https://www.rev.com/blog/transcripts/donald-trump-michigan-rally-transcript-trump-holds-a-rally-in-battle-creek-during-impeachment",
    "https://www.rev.com/blog/transcripts/donald-trump-ohio-campaign-rally-transcript-transcript-of-toledo-ohio-rally",
    "https://www.rev.com/blog/transcripts/donald-trump-milwaukee-rally-transcript-trump-holds-rally-during-iowa-democratic-debate",
    "https://www.rev.com/blog/transcripts/donald-trump-new-jersey-rally-speech-transcript-trump-holds-rally-in-wildwood-nj",
    # NOTE(review): this slug starts with "donal-" rather than "donald-";
    # confirm it is the page's real address and not a typo.
    "https://www.rev.com/blog/transcripts/donal-trump-iowa-rally-transcript-trump-holds-rally-in-des-moines-iowa",
]

# Scrape the pages one after another with the format-2 scraper and bind each
# resulting DataFrame to its own rallyN name.
(rally1, rally2, rally3, rally4, rally5,
 rally6, rally7, rally8, rally9, rally10,
 rally11, rally12, rally13, rally14, rally15) = [
    scrapeTranscriptFormat2(link) for link in rallyUrls
]

In [25]:
# Stack the fifteen rally frames into one table. drop=True renumbers the rows
# 0..n-1 and discards the old per-rally row numbers directly, instead of
# turning them into an 'index' column that then has to be deleted.
data = pd.concat([rally1, rally2, rally3, rally4, rally5, rally6, rally7, rally8, rally9, rally10, rally11, rally12, rally13, rally14, rally15]).reset_index(drop=True)

# os.path.join picks the separator of the host OS; the previous hard-coded
# "\\" only formed a real path on Windows.
data.to_csv(os.path.join(os.getcwd(), "Documents", "TrumpRallies.csv"), index=False)