In [None]:
import requests
from bs4 import BeautifulSoup
import os
import re
import csv
from datetime import datetime
import time
import random

# Constants
# Index page on presidency.ucsb.edu that fetch_debate_links() downloads and
# scans for debate transcript links.
URL = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/presidential-campaigns-debates-and-endorsements-0"
# Directory that holds the scraper's output.
SAVE_DIR = "debate_data"
# Path of the output CSV inside SAVE_DIR; scrape_and_save_to_csv() writes it.
CSV_FILE = os.path.join(SAVE_DIR, "vp_primary_debates_by_statement.csv")

# Create output directory
# NOTE: this runs when the module/cell is executed, not only from main();
# exist_ok=True keeps it from raising if the directory already exists.
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_debate_links(timeout=30):
    """Fetch VP and primary debate transcript links from the UCSB website.

    Downloads the index page at ``URL`` and walks its table rows. A row is
    kept when it has a date cell (identified by its ``width:112pt`` inline
    style), the following cell names a vice-presidential or party-primary
    debate that is not marked as cancelled, that cell contains a link, and
    the date parses as ``"<Month> <day>, <year>"``.

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A dict mapping debate title to a dict with the keys ``date``,
        ``url``, ``year``, ``debate_id``, ``party`` and ``type``. When two
        debates share a title but have different dates, the later one is
        stored under ``"<title> (<date>)"`` so it does not overwrite the
        first. An empty dict is returned if the index page cannot be fetched.
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import urljoin

    try:
        response = requests.get(URL, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return {}

    link_dict = {}

    for row in soup.find_all('tr'):
        date_td = row.find('td', style=lambda x: x and "width:112pt" in x)
        if not date_td:
            continue

        debate_date = date_td.get_text(strip=True)
        # Overly long "dates" are not dates at all; skip such rows.
        if len(debate_date) > 30:
            continue

        name_td = date_td.find_next("td")
        if not name_td:
            continue

        debate_name = name_td.get_text(strip=True)

        # Filter for VP and primary debates only
        is_vp = "Vice" in debate_name
        is_primary = (
            ("Republican" in debate_name or "Democratic" in debate_name)
            and "Presidential" not in debate_name
        )
        if not (is_vp or is_primary) or 'cancelled' in debate_name.lower():
            continue

        # A row may name a debate without linking to a transcript.
        link_tag = name_td.find("a")
        hyperlink = link_tag.get("href") if link_tag else None
        if not hyperlink:
            continue
        # Resolve relative hrefs; absolute URLs pass through unchanged.
        hyperlink = urljoin(URL, hyperlink)

        # Extract year from the date
        try:
            date_obj = datetime.strptime(debate_date, "%B %d, %Y")
        except ValueError:
            print(f"Skipping debate with invalid date: {debate_date}")
            continue
        year = date_obj.year

        # Create a unique ID for the debate
        if is_vp:
            debate_type = "VP"
        elif "Republican" in debate_name:
            debate_type = "Republican"
        elif "Democratic" in debate_name:
            debate_type = "Democratic"
        else:
            debate_type = "Other"

        debate_id = f"{year}_{debate_type}_{date_obj.strftime('%m%d')}"

        # Titles repeat across election cycles (same city, different year);
        # disambiguate so a later debate does not replace an earlier one.
        key = debate_name
        existing = link_dict.get(key)
        if existing is not None and existing['date'] != debate_date:
            key = f"{debate_name} ({debate_date})"

        link_dict[key] = {
            'date': debate_date,
            'url': hyperlink,
            'year': year,
            'debate_id': debate_id,
            'party': debate_type,
            'type': "Primary" if debate_type in ["Republican", "Democratic"] else "VP"
        }

    return link_dict

def parse_debate_transcript(soup, debate_info):
    """Parse the debate transcript into statements with speaker information.

    Each ``<p>`` inside the ``field-docs-content`` div is treated as one
    statement. A paragraph that opens with an upper-case label followed by
    ``:`` or ``.`` (e.g. ``"SPEAKER: text"``) starts a new speaker; any other
    paragraph is attributed to the most recent speaker, or to ``"MODERATOR"``
    if none has been seen yet.

    Args:
        soup: Parsed transcript page (anything offering BeautifulSoup's
            ``find`` / ``find_all`` / ``get_text`` interface).
        debate_info: Dict providing ``debate_id``, ``year`` and ``party``,
            which are copied onto every returned row.

    Returns:
        A list of dicts with the keys ``speaker``, ``statement``,
        ``debate_id``, ``year`` and ``party``; empty if the content div is
        missing.
    """
    content_div = soup.find("div", class_="field-docs-content")
    if not content_div:
        return []

    # Common patterns: "SPEAKER:", "SPEAKER."
    speaker_pattern = re.compile(r'^([A-Z][A-Z\s\.\-\']+)(?::|\.)\s*(.*)')

    def make_row(speaker, statement):
        # Single place that defines the row layout for both branches below.
        return {
            'speaker': speaker,
            'statement': statement,
            'debate_id': debate_info['debate_id'],
            'year': debate_info['year'],
            'party': debate_info['party']
        }

    statements = []
    current_speaker = "MODERATOR"  # Default speaker

    for p in content_div.find_all("p"):
        # get_text(strip=True) would strip every text node and join them with
        # no separator, gluing words across inline tags. Take the raw text
        # and collapse whitespace instead; this also removes embedded
        # newlines, which the pattern's ``.*`` would otherwise stop at.
        text = " ".join(p.get_text().split())
        if not text:
            continue

        speaker_match = speaker_pattern.match(text)
        if speaker_match:
            current_speaker = speaker_match.group(1).strip()
            statement_text = speaker_match.group(2).strip()
            # A bare speaker label only switches the current speaker.
            if statement_text:
                statements.append(make_row(current_speaker, statement_text))
        elif len(text) > 10:  # Avoid very short fragments
            # Some debate formats don't specify the speaker for each paragraph
            statements.append(make_row(current_speaker, text))

    return statements

def extract_participants(soup):
    """Collect name-like strings from a transcript's participant header.

    Only the first five paragraphs of the ``field-docs-content`` div are
    inspected. For each one that mentions "participants" (case-insensitive),
    the text after the first colon - or the whole paragraph if there is no
    colon - is searched for runs of two to four capitalised words.

    Returns:
        The matches in document order (possibly with repeats); an empty list
        if the content div is missing or nothing matches. This is a
        heuristic, so non-name phrases can be picked up as well.
    """
    body = soup.find("div", class_="field-docs-content")
    if not body:
        return []

    found = []
    # Participant lists, when present, sit at the top of the transcript.
    for paragraph in body.find_all("p")[:5]:
        raw = paragraph.get_text(strip=True)
        if "PARTICIPANTS" not in raw.upper():
            continue

        _, colon, after_colon = raw.partition(":")
        section = after_colon if colon else raw
        # Simplistic name matcher: a capitalised word followed by 1-3 more.
        found += re.findall(r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})', section)

    return found

def scrape_and_save_to_csv(link_dict, timeout=30):
    """Scrape debate transcripts and save statements to CSV file.

    ``CSV_FILE`` is created (or overwritten) with a header row, then each
    debate in ``link_dict`` is downloaded, parsed with
    ``parse_debate_transcript`` and its statements written out. A debate
    whose page cannot be fetched is reported and skipped.

    Args:
        link_dict: Mapping of debate title to an info dict; ``url`` and
            ``date`` are read here and the dict is passed on to
            ``parse_debate_transcript``.
        timeout: Seconds to wait for each transcript page before giving up.
    """
    fieldnames = ['speaker', 'statement', 'party', 'debate_id', 'year']
    total_statements = 0

    # Create or overwrite the CSV file with headers
    with open(CSV_FILE, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for index, (title, debate_info) in enumerate(link_dict.items()):
            # Pause between requests to avoid overloading the server. Doing
            # it before every request but the first keeps the delay in place
            # after a failed fetch and avoids a useless wait at the end.
            if index:
                time.sleep(random.uniform(1, 3))

            print(f"Processing: {title} ({debate_info['date']})")

            try:
                response = requests.get(debate_info['url'], timeout=timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")
            except requests.RequestException as e:
                print(f"Error fetching transcript: {e}")
                continue

            # Extract statements from the transcript
            statements = parse_debate_transcript(soup, debate_info)

            # Save statements to CSV; flush so finished debates are on disk
            # even if a later one fails.
            writer.writerows(statements)
            csvfile.flush()

            debate_statements = len(statements)
            total_statements += debate_statements
            print(f"Added {debate_statements} statements from debate: {title}")

    print(f"Scraping complete. Total statements: {total_statements}")
    print(f"CSV file saved to: {CSV_FILE}")

def main():
    """Run the scraper: collect debate links, then write their statements to CSV."""
    print("Fetching VP and primary debate links...")
    links = fetch_debate_links()

    if links:
        print(f"Found {len(links)} VP and primary debates. Scraping transcripts...")
        scrape_and_save_to_csv(links)
    else:
        # Nothing to scrape (fetch failed or no rows matched the filters).
        print("No relevant debate links found. Exiting...")


if __name__ == "__main__":
    main()

Fetching VP and primary debate links...
Found 96 VP and primary debates. Scraping transcripts...
Processing: Vice Presidential Debate in New York City (October 1, 2024)
Added 184 statements from debate: Vice Presidential Debate in New York City
Processing: Republican Candidates Debate in Des Moines, Iowa (December 10, 2011)
Added 346 statements from debate: Republican Candidates Debate in Des Moines, Iowa
Processing: Republican Candidates Debate in Tuscaloosa, Alabama (December 6, 2023)
Added 423 statements from debate: Republican Candidates Debate in Tuscaloosa, Alabama
Processing: Republican Candidates Debate in Miami, Florida (December 9, 2007)
Added 469 statements from debate: Republican Candidates Debate in Miami, Florida
Processing: Republican Candidates Debate in Simi Valley, California (May 3, 2007)
Added 768 statements from debate: Republican Candidates Debate in Simi Valley, California
Processing: Republican Candidates Debate in Milwaukee, Wisconsin (November 10, 2015)
Added 