In [1]:
!pip install bs4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime

In [3]:
# Index page that the following cells scrape for debate dates, titles and links.
url = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/presidential-campaigns-debates-and-endorsements-0"
# Always pass a timeout: without one, requests waits indefinitely on a stalled server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
# Pull the date of the first debate listed on the page.
# The date cell is recognised by the column width in its inline style attribute.
date_td = soup.find("td", style=lambda css: css and "width:112pt" in css)

# Fall back to a placeholder string when no such cell exists.
if date_td:
    debate_date = date_td.get_text(strip=True)
else:
    debate_date = "No date found"

print("Debate Date:", debate_date)

Debate Date: September 10, 2024


In [5]:
# Extract the debate document name.
# The title lives in the <td> that follows the date cell. The previous cell
# allows date_td to be None ("No date found"), so guard against that here
# instead of raising AttributeError.
name_td = date_td.find_next("td") if date_td is not None else None
debate_name = name_td.get_text(strip=True) if name_td is not None else "No name found"

print("Debate Name:", debate_name)

Debate Name: Presidential Debate in Philadelphia, Pennsylvania(Harris-Trump)


In [6]:
# Extract the hyperlink to the transcript.
# name_td may be None when no title cell was found, and an <a> tag is not
# guaranteed to carry an href, so both cases fall back to the placeholder
# instead of raising AttributeError / KeyError.
link_tag = name_td.find("a") if name_td is not None else None  # <a> inside the title <td>
hyperlink = link_tag.get("href", "No link found") if link_tag is not None else "No link found"

print("Debate Link:", hyperlink)

Debate Link: https://www.presidency.ucsb.edu/documents/presidential-debate-philadelphia-pennsylvania


In [None]:
# Fetch the debate index page. The timeout keeps the cell from hanging forever
# if the server stops responding.
url = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/presidential-campaigns-debates-and-endorsements-0"
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Date cells longer than this hold footnotes / explanatory text, not a date.
MAX_DATE_LENGTH = 30
# Debates whose title contains any of these words (case-insensitive) are skipped.
invalid_words = ['cancelled']

# Maps debate title -> [debate date, transcript URL].
link_dict = {}
for row in soup.find_all('tr'):
    # The date column is identified by the width in its inline style attribute.
    date_td = row.find('td', style=lambda x: x and "width:112pt" in x)
    if date_td is None:
        # Row without a date cell: nothing to extract.
        continue

    debate_date = date_td.get_text(strip=True)

    if len(debate_date) > MAX_DATE_LENGTH:
        print(f"invalid debate date: {debate_date}")
        continue

    # The title cell is the next <td> in the SAME row. find_next() would keep
    # walking the document and could return a cell from a following row.
    name_td = date_td.find_next_sibling("td")
    if name_td is None:
        print("debugging: No name td found ")
        continue
    debate_name = name_td.get_text(strip=True)

    # Keywords that exclude a row from the output.
    if any(word in debate_name.lower() for word in invalid_words):
        continue

    # Extract the hyperlink; .get() avoids a KeyError on an <a> without href.
    link_tag = name_td.find("a")
    hyperlink = link_tag.get("href") if link_tag is not None else None
    if not hyperlink:
        print("debugging: No url found ")
        continue

    # Store in dictionary. Different debates can share the same title; keying
    # on the title alone would silently overwrite the earlier entry, so a
    # repeated title with different data gets the date appended to its key.
    entry = [debate_date, hyperlink]
    key = debate_name
    if key in link_dict and link_dict[key] != entry:
        key = f"{debate_name} ({debate_date})"
    link_dict[key] = entry


invalid debate date: (1) The second presidential debate, a "town hall" style format, scheduled for October 15, 2020 was cancelled on October 9, 2020.  The Commission on Presidential Debates modified the format of this debate stating that it would be held "virtually" because of concerns about public health due to President Trump's COVID-19 diagnosis.  The President then chose not to participate in this modified format.  Both Donald Trump and Joe Biden held "town-hall" events in lieu of the debate.  These event transcripts are below:President Trump:Remarks in a Town Hall Meeting with Savannah Guthrie of NBC News at the Perez Art Museum in Miami, FloridaFormer Vice President Biden:Remarks in a Town Hall Meeting with George Stephanopoulos of ABC News at the National Constitution Center in Philadelphia, Pennsylvania


In [None]:
# Formatting filename

def format_filename(debate_name, debate_date):
    """Build the transcript filename ``<date>_<suffix>.txt`` for a debate.

    Args:
        debate_name: Debate title; used to choose the suffix.
        debate_date: Date text in the form "September 10, 2024".

    Returns:
        A filename such as "09_10_2024_pb.txt". The date part is formatted as
        MM_DD_YYYY. The suffix is "vp" when the title contains
        "Vice Presidential", "rpd" for "Republican", "dcd" for "Democratic",
        and "pb" otherwise.

        If the date cannot be parsed, a message is printed and the date part
        falls back to the raw text with every run of non-alphanumeric
        characters collapsed to "_" (or "unknown_date" when nothing is left),
        so a filename is still returned instead of the function crashing on
        an unbound variable.
    """
    try:
        date_obj = datetime.strptime(debate_date, "%B %d, %Y")
        formatted_date = date_obj.strftime("%m_%d_%Y")
    except ValueError:
        print(f"invalid date: {debate_date}")
        # Keep only filename-safe characters from the raw text.
        slug = "".join(ch if ch.isalnum() else "_" for ch in debate_date)
        formatted_date = "_".join(part for part in slug.split("_") if part) or "unknown_date"

    # "Vice Presidential" must be tested first: it is the most specific label.
    if "Vice Presidential" in debate_name:
        suffix = "vp"
    elif "Republican" in debate_name:
        suffix = "rpd"
    elif "Democratic" in debate_name:
        suffix = "dcd"
    else:
        suffix = "pb"

    filename = f"{formatted_date}_{suffix}.txt"
    return filename


In [None]:
# Folder, relative to the working directory, used as the output location.
save_dir = "data"
# exist_ok=True makes re-running this cell safe when the folder already exists.
os.makedirs(name=save_dir, exist_ok=True)

In [None]:
# Download every transcript listed in link_dict and save it as a text file.
# The [date, url] pair is unpacked in the loop header so no variable named
# `list` is created; that name would shadow the built-in for the rest of the
# kernel session.
# NOTE: this loop rebinds url / response / soup. Re-run the index-page fetch
# before re-running any cell that expects `soup` to be the index page.
for title, (date, url) in link_dict.items():
    # Fetching (with a timeout so one stalled request cannot hang the run)
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parsing HTML
    soup = BeautifulSoup(response.content, "html.parser")

    # Finding body transcript
    content_div = soup.find("div", class_="field-docs-content")
    if content_div is None:
        # Report the skip so missing files can be traced back to a page.
        print(f"no transcript body found, skipping: {url}")
        continue

    # Extract all paragraphs inside the content div, separated by blank lines
    paragraphs = content_div.find_all("p")
    text_content = "\n\n".join(p.get_text(strip=True) for p in paragraphs)

    # Saving debates as .txt files
    file_name = os.path.join(save_dir, format_filename(title, date))
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text_content)


Filename: 09_10_2024_pb.txt
Filename: 06_27_2024_pb.txt
Filename: 10_01_2024_vp.txt
Filename: 12_10_2011_rpd.txt
Filename: 12_06_2023_rpd.txt
Filename: 12_09_2007_rpd.txt
Filename: 05_03_2007_rpd.txt
Filename: 11_10_2015_rpd.txt
Filename: 10_22_2020_pb.txt
Filename: 09_29_2020_pb.txt
Filename: 10_07_2020_vp.txt
Filename: 03_15_2020_dcd.txt
Filename: 07_23_2007_dcd.txt
Filename: 11_15_2007_dcd.txt
Filename: 01_26_2000_dcd.txt
Filename: 01_08_2000_dcd.txt
Filename: 03_01_2000_dcd.txt
Filename: 11_20_2019_dcd.txt
Filename: 10_15_2019_dcd.txt
Filename: 09_12_2019_dcd.txt
Filename: 07_31_2019_dcd.txt
Filename: 07_30_2019_dcd.txt
Filename: 06_27_2019_dcd.txt
Filename: 06_26_2019_dcd.txt
Filename: 10_19_2016_pb.txt
Filename: 10_09_2016_pb.txt
Filename: 10_16_2012_pb.txt
Filename: 10_04_2016_vp.txt
Filename: 04_14_2016_dcd.txt
Filename: 09_09_2007_dcd.txt
Filename: 03_06_2016_dcd.txt
Filename: 02_11_2016_dcd.txt
Filename: 01_05_2000_dcd.txt
Filename: 01_25_2016_dcd.txt
Filename: 03_03_2016_rpd