### I'm using this script to test out webscraping using beautifulsoup package.

Basic setup and site access.

In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

MainURL='https://www.americanrhetoric.com/top100speechesall.html'
MainPage= requests.get(MainURL, headers={'user-agent': 'Mozilla/5.0'})
soup= BeautifulSoup(MainPage.content, 'html.parser')

Obtaining name of the speakers and titles

In [2]:
names = soup.find_all(attrs={"width": "203"})
nameList = [n.get_text(strip=True) for n in names]

def findTitles (x) :
    titles = x.find_all(attrs={"color":"#BA1D01"})
    titleList1 = [n.get_text(strip=True) for n in titles]
    pattern=re.compile("([\r])|([\t])|([\n])|([\ ]{2,})")
    titleListCleaned = [re.sub(pattern, '', str(title)) for title in titleList1]
    titleListCleaned.pop(0)
    return titleListCleaned

Obtaining URL list from main webpage and storing them as list variable.

In [3]:
mainlinks=[a['href'] for a in soup.find_all('a',href=True)   
    if 'off site' in a.text 
    or a['href'].startswith('speeches') and 'PDFFiles' not in a['href']
    or 'Belief and Public Morality' in a.text]

Obtaining where and when the speech is given 

In [4]:
deliveredLists = []
speeches = []
for i, link in enumerate(mainlinks) : 
    URL='https://www.americanrhetoric.com/'+mainlinks[i]
    Page=requests.get(URL, headers={'user-agent': 'Mozilla/5.0'})
    speechsoup=BeautifulSoup(Page.content, 'html.parser')
    findWhere = speechsoup.find_all(attrs={"color":"#CE0A04"})
    deliveredLists.append([n.get_text(strip=True) for n in findWhere])
    findspeeches = speechsoup.find_all("font",{'face':'Verdana', 'size':2})
    speeches.append([n.get_text(strip=True) for n in findspeeches])



In [6]:
deliveredAtCleaned = []

for i, place in enumerate(deliveredLists) : 
    if len(place) == 0 : 
        deliveredAtCleaned.append(None) 
    else : 
        for p in place : 
            if p.startswith('deliver') or p.startswith('Deliver') or p.startswith('Radio') or p.startswith('Broadcast') or p.startswith('broadcast') or p.startswith('presented') or p.startswith('Air') or p.startswith('original') or p.startswith('Paper') : 
                pattern=re.compile("([\r])|([\t])|([\n])|([\ ]{2,})")
                re.sub(pattern, '', str(p))
                deliveredAtCleaned.append(p)  

In [7]:
speechesCleaned = []

for i, speech in enumerate(speeches) :
    if len(speeches[i]) < 3 :
        speechesCleaned.append(None)
    else : 
        script=' '.join([str(line) for line in speech])
        script=' '.join(script.split())
        speechesCleaned.append(script)


In [8]:
# Creating DF
df = pd.DataFrame(list(zip(nameList, findTitles(soup), mainlinks, deliveredAtCleaned, speechesCleaned)),
               columns =['Speaker', 'Title', 'Links', "Delivered", 'Speeches'])

df.head(10)

Unnamed: 0,Speaker,Title,Links,Delivered,Speeches
0,"Martin Luther King, Jr.",I Have A Dream,speeches/mlkihaveadream.htm,"delivered \r\n 28 August 1963, at the Lin...",I am happy to join with you today in what will...
1,John Fitzgerald Kennedy,Inaugural Address,speeches/jfkinaugural.htm,"delivered 20 January 1961, \r\nWashington, D.C.","Vice President Johnson, Mr. Speaker, Mr. Chief..."
2,Franklin Delano Roosevelt,First Inaugural Address,speeches/fdrfirstinaugural.html,Delivered 4 March 1933,"President Hoover, Mr. Chief Justice, my friend..."
3,Franklin Delano Roosevelt,Pearl Harbor Address to the Nation,speeches/fdrpearlharbor.htm,"delivered 8 \r\nDecember 1941, Washington, D.C.",
4,Barbara Charline Jordan,1976 DNC Keynote Address,speeches/barbarajordan1976dnc.html,"delivered 12 July 1976, New York, NY",Thank you ladies and gentlemen for a very warm...
5,Richard Milhous Nixon,Checkers,speeches/richardnixoncheckers.html,delivered and broadcast live on television 23 ...,"My Fellow Americans, I come before you tonight..."
6,Malcolm X,The Ballot or the B,http://americanradioworks.publicradio.org/feat...,,
7,Ronald Wilson Reagan,Shuttle 'Challenger' Disaster Address,speeches/ronaldreaganchallenger.htm,delivered 28 January 1986,"Ladies and Gentlemen, I'd planned to speakto y..."
8,John Fitzgerald Kennedy,Houston Ministerial Association,speeches/jfkhoustonministers.html,delivered 12 September 1960 at the Rice Hotel ...,"Reverend Meza, Reverend Reck, I'm grateful for..."
9,Lyndon Baines Johnson,We Shall Overcome,speeches/lbjweshallovercome.htm,"delivered 15 March 1965, \r\nWashington, D.C.","Mr. Speaker, Mr. President, Members of the Con..."


In [9]:
df[df['Speeches'].isna()]

Unnamed: 0,Speaker,Title,Links,Delivered,Speeches
3,Franklin Delano Roosevelt,Pearl Harbor Address to the Nation,speeches/fdrpearlharbor.htm,"delivered 8 \r\nDecember 1941, Washington, D.C.",
6,Malcolm X,The Ballot or the B,http://americanradioworks.publicradio.org/feat...,,
13,(Gen) Douglas MacArthur,Farewell Address to Congress,speeches/douglasmacarthurfarewelladdress.htm,"delivered 19 April 1951, \r\nWashington, D.C.",
43,William Jennings Bryan,Against Imperialism,speeches/wjbryanimperialism.htm,"delivered 8 August 1900, Indianapolis, IN",
45,John Fitzgerald Kennedy,Civil Rights Address,speeches/jfkcivilrights.htm,"delivered 11 June 1963, White House, \r\nWashi...",
58,Mario Matthew Cuomo,Religious Belief and Public Morality,http://archives.nd.edu/research/texts/cuomo.ht...,,
70,Henry Louis (Lou) Gehrig,Farewell to Baseball Address,speeches/lougehrigfarewelltobaseball.htm,"delivered 4 July 1939, Yankee \r\nStadium, New...",
73,Edward Moore Kennedy,1980 DNC Address,speeches/tedkennedy1980dnc.htm,"delivered 12 August 1980, New \r\nYork, NY",
79,Eugene Victor Debs,The Issue (off site),https://www.marxists.org/archive/debs/works/19...,,
82,Crystal Eastman,Now We Can Begi,https://womenshistory.info/now-can-begin-whats...,,


In [10]:
malcomXURL ='http://americanradioworks.publicradio.org/features/blackspeech/mx.html'
MainPage= requests.get(malcomXURL, headers={'user-agent': 'Mozilla/5.0'})
soup= BeautifulSoup(MainPage.content, 'html.parser')    

malcom = soup.select('blockquote p')
malcomspeech = [n.get_text(strip=True) for n in malcom]


listToStr = ' '.join([str(elem) for elem in malcomspeech ])
df['Speeches'][6] = str(listToStr)
df['Delivered'][6] = ' Delivered King Solomon Baptist Church, Detroit, Michigan - April 12, 1964'

In [15]:
# The link is not working!!!

#speech58 ='http://archives.nd.edu/research/texts/cuomo.htm?DocID=14'
#MainPage58= requests.get(speech58, headers={'user-agent': 'Mozilla/5.0'})
#soup= BeautifulSoup(MainPage58.content, 'html.parser')    

#soup58 = soup.select(".mainbody")

#cuomospeech = [n.get_text(strip=True) for n in soup58]

#print(cuomospeech)

#del cuomospeech[0:2]

#listToStr58 = ' '.join([str(elem) for elem in cuomospeech])
#df['Speeches'][58] = str(listToStr58)
#df['Delivered'][58] = "delivered September 13, 1984, as a John A. O'Brien Lecture in the University of Notre Dame's Department of Theology"

[]


In [None]:
df.to_csv('speeches.csv', encoding='utf-8')