<a href="https://colab.research.google.com/github/Shreyash-prog/U.S-Presidential-Speech-Analysis/blob/main/Miller%20Center/Miller_Center_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Code to scrape data from specific page intervals

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
from google.colab import drive
from google.colab import files

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Function to return the text that follows a regex match (used to capture the title of the speech)
def capture_text_after_pattern(text, pattern):
    """Return the part of ``text`` that follows the first match of ``pattern``.

    Args:
        text: String to search.
        pattern: Regular expression handed to ``re.compile``.

    Returns:
        The substring that starts right after the end of the first match, or
        ``None`` when ``pattern`` does not occur in ``text``.
    """
    found = re.compile(pattern).search(text)

    # Guard clause: there is nothing to slice when the pattern is absent.
    if found is None:
        return None

    return text[found.end():]

# Function to scrape content from individual speech pages
def scrape_content_website(url, timeout=30):
    """Fetch one speech page and split its text into transcript and summary.

    The text of every ``<p>`` tag on the page is joined with newlines and then
    split by position (see ``_split_speech_and_summary``).

    Args:
        url: Address of the speech page to download.
        timeout: Seconds ``requests`` waits for the server. Without a timeout
            a single stalled connection would block the whole scrape.

    Returns:
        ``(speech, speech_summary)`` when the page was fetched. If the page
        text has fewer line breaks or markers than the positional split
        expects, both elements are the raw, uncleaned paragraph text.
        ``None`` when the response status code is not 200.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            on connection failures or when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    data = '\n'.join([p.get_text() for p in soup.find_all('p')])
    return _split_speech_and_summary(data)


def _clean_scraped_text(text):
    """Drop non-ASCII characters, newlines and control characters from ``text``."""
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\n', '', text)
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+', '', text)


def _split_speech_and_summary(data):
    """Split joined paragraph text into ``(speech, speech_summary)`` by position.

    The summary starts right after the third newline. If a non-breaking space
    followed by a newline occurs in ``data`` (ignoring the last 160
    characters), the summary ends at the second such marker and the speech
    starts 6 characters after it; otherwise the summary ends at the fourth
    newline and the speech starts right after that newline. Both parts are
    cleaned with ``_clean_scraped_text``.

    Returns ``(data, data)`` unchanged when ``data`` does not contain enough
    newlines or markers for the split.
    """
    newline_idx = [m.start() for m in re.finditer('\n', data)]

    try:
        # NOTE(review): the final 160 characters are excluded from the marker
        # check, presumably to ignore trailing page text -- confirm.
        if '\xa0\n' in data[:-160]:
            marker_idx = [m.start() for m in re.finditer('\xa0\n', data)]
            speech_summary = data[newline_idx[2] + 1:marker_idx[1]]
            # NOTE(review): the 6-character skip is carried over unchanged;
            # its exact purpose is not visible here -- confirm.
            speech = data[marker_idx[1] + 6:]
        else:
            speech_summary = data[newline_idx[2] + 1:newline_idx[3]]
            speech = data[newline_idx[3] + 1:]
    except IndexError:
        # Too few newlines/markers for the positional split: hand back the
        # raw text for both fields rather than losing the page.
        return data, data

    return _clean_scraped_text(speech), _clean_scraped_text(speech_summary)

# Function to scrape the main website for speech links and dates
def scrape_main_website(president_name, url, speech_link_lst, speech_date_lst, speech_title_lst, speech_lst, speech_summary_lst, president_lst, timeout=30):
    """Scrape one speech-listing page and every speech it links to.

    Each ``<span>`` whose markup contains ``"field-content"><a href=`` is
    treated as one speech entry; its link, date and title are pulled out of
    the markup with regular expressions. Every collected link is then fetched
    with ``scrape_content_website``.

    Args:
        president_name: Name stored once per collected speech title.
        url: Address of the listing page.
        speech_link_lst: List that receives the speech links (appended in place).
        speech_date_lst: List that receives the date strings (appended in place).
        speech_title_lst: List that receives the titles (appended in place).
        speech_lst: List that receives the transcripts (appended in place).
        speech_summary_lst: List that receives the summaries (appended in place).
        president_lst: Not read; a new list is built and returned in its place.
        timeout: Seconds to wait for the listing page request only.

    Returns:
        ``(speech_link_lst, speech_date_lst, speech_title_lst, speech_lst,
        speech_summary_lst, president_lst)``. If the listing page does not
        answer with status 200, no entries are added. Speeches whose page
        could not be fetched get ``None`` for transcript and summary so all
        lists stay aligned.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``.
    """
    # Patterns are loop-invariant, so define them once.
    url_pattern = r'https?://\S+'
    date_pattern = r'((January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4})'

    response = requests.get(url, timeout=timeout)

    # Start empty so a failed request no longer leaves the name unbound.
    filtered_para_list = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        filtered_para_list = [
            markup
            for markup in map(str, soup.find_all('span'))
            if '"field-content"><a href=' in markup
        ]
    else:
        print(f"Failed to fetch the page. Status code: {response.status_code}")

    for markup in filtered_para_list:
        found_urls = re.findall(url_pattern, markup)
        found_dates = re.findall(date_pattern, markup)
        if not found_urls or not found_dates:
            # An entry without an absolute link or a recognisable date cannot
            # be recorded consistently; skip it instead of raising IndexError.
            print(f"Skipping entry without a link or date: {markup}")
            continue

        speech_link_lst.append(found_urls[0])
        # findall yields one tuple per match; element 0 is the full date.
        speech_date_lst.append(found_dates[0][0])

        # The title is what follows the date, minus the first 2 and the last
        # 11 characters. NOTE(review): presumably ": " and "</a></span>" --
        # confirm against the live markup.
        captured_text = capture_text_after_pattern(markup, date_pattern)
        speech_title_lst.append(captured_text[2:-11])

    # Fetch the transcript and summary for every collected link.
    for link in speech_link_lst:
        # NOTE(review): the last character of the stored link is dropped
        # before fetching, presumably a closing quote matched by \S+ -- confirm.
        result = scrape_content_website(link[:-1])
        # scrape_content_website returns None on a failed request.
        speech, speech_summary = result if result is not None else (None, None)
        speech_lst.append(speech)
        speech_summary_lst.append(speech_summary)

    president_lst = [president_name] * len(speech_title_lst)

    return speech_link_lst, speech_date_lst, speech_title_lst, speech_lst, speech_summary_lst, president_lst

def scraping(output_path='/content/drive/My Drive/scraped_data_w2.csv'):
    """Scrape the speeches of every listed president and save them as CSV.

    For each president id a listing URL is built and passed to
    ``scrape_main_website``; the per-president results are concatenated into
    one pandas DataFrame, written to ``output_path`` and offered for download
    through ``google.colab.files``.

    Args:
        output_path: File the CSV is written to and downloaded from. The
            default is the Google Drive location the download call already
            used; using one path for both steps ensures the downloaded file
            is the one that was just written.

    Returns:
        ``(speech_link_lst, speech_date_lst, speech_title_lst, speech_lst,
        speech_summary_lst, president_lst, scraped_df)``.
    """
    # The two lists are parallel: the id at position i belongs to the name at
    # position i.
    president_id_list = [44,45,3,4,141,6,7,8,9,10,11,12,13,14,15,16,18,19,22,20,21,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,8396,30721]
    president_name_list = ['George Washington', 'John Adams', 'Thomas Jefferson', 'James Madison', 'James Monroe', 'John Quincy Adams', 'Andrew Jackson', 'Martin Van Buren', 'William Harrison', 'John Tyler', 'James K. Polk', 'Zachary Taylor', 'Millard Fillmore', 'Franklin Pierce', 'James Buchanan', 'Abraham Lincoln', 'Ulysses S. Grant', 'Rutherford B. Hayes', 'Grover Cleveland', 'James A. Garfield', 'Chester A. Arthur', 'Benjamin Harrison', 'William McKinley', 'Theodore Roosevelt', 'William Taft', 'Woodrow Wilson', 'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover', 'Franklin D. Roosevelt', 'Harry S. Truman', 'Dwight D. Eisenhower', 'John F. Kennedy', 'Lyndon B. Johnson', 'Richard M. Nixon', 'Gerald Ford', 'Jimmy Carter', 'Ronald Reagan', 'George H. W. Bush', 'Bill Clinton', 'George W. Bush', 'Barack Obama', 'Donald Trump', 'Joe Biden']

    speech_link_lst, speech_date_lst, speech_title_lst, speech_lst, speech_summary_lst, president_lst = [], [], [], [], [], []

    # Scrape the presidents one by one and accumulate their results.
    for president_id, president_name in zip(president_id_list, president_name_list):
        url_to_scrape = f'https://millercenter.org/the-presidency/presidential-speeches?field_president_target_id[{president_id}]={president_id}'
        # Fresh lists per call: scrape_main_website fetches every link in the
        # list it is given, so passing the accumulated lists would re-scrape
        # earlier presidents.
        sl, sd, st, s, ss, p = scrape_main_website(president_name, url_to_scrape, [], [], [], [], [], [])

        speech_link_lst.extend(sl)
        speech_date_lst.extend(sd)
        speech_title_lst.extend(st)
        speech_lst.extend(s)
        speech_summary_lst.extend(ss)
        president_lst.extend(p)

    scraped_df = pd.DataFrame({
        'Speech Link': speech_link_lst,
        'Date of Speech': speech_date_lst,
        'Speech Title': speech_title_lst,
        'Related Person': president_lst,
        'Speech Summary': speech_summary_lst,
        'Speech Content': speech_lst,
    })

    # Write and download the same file.
    scraped_df.to_csv(output_path, index=False)
    files.download(output_path)

    return speech_link_lst, speech_date_lst, speech_title_lst, speech_lst, speech_summary_lst, president_lst, scraped_df