<a href="https://colab.research.google.com/github/Shreyash-prog/U.S-Presidential-Speech-Analysis/blob/main/Presidency/Presidency_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Code to scrape the first (n) pages

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import drive
from google.colab import files

# Mount Google Drive at /content/drive; the download step at the end of this
# cell reads its file from a path under this mount point.
drive.mount('/content/drive')

# Function to scrape the main website for speech links and dates
def scrape_main_website(url, timeout=30):
    """Fetch a listing page and extract its paragraph text, speech links and dates.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``(data, url_list, date_list)`` tuple on success, where ``data`` is
        the text of every ``<p>`` tag joined with newlines, ``url_list`` holds
        the absolute URLs of the speech links and ``date_list`` the text of
        every ``span.date-display-single`` element. Returns ``None`` (after
        printing the reason) when the page cannot be fetched.
    """
    from urllib.parse import urljoin

    site_root = 'https://www.presidency.ucsb.edu'
    # Links under /documents/ that are navigation or category pages rather than speeches.
    excluded_fragments = (
        'presidential-documents-archive-guidebook',
        'category-attributes',
        'app-categories',
    )

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures follow the same "print and return None" contract
        # as a non-200 response.
        print(f"Failed to fetch the page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of every paragraph on the page, one paragraph per line.
    data = '\n'.join(p.get_text() for p in soup.find_all('p'))

    url_list = []  # Absolute URLs of the speech pages linked from this listing
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/documents/' not in href:
            continue
        if any(fragment in href for fragment in excluded_fragments):
            continue
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (plain concatenation would not).
        url_list.append(urljoin(site_root, href))

    # Dates are rendered in <span class="date-display-single"> elements.
    date_list = [
        date.text
        for date in soup.find_all('span', class_='date-display-single')
    ]

    return data, url_list, date_list

# Function to write data to a file
def write_to_file(data, filename):
    """Overwrite *filename* with the text *data* (UTF-8) and report it on stdout."""
    with open(filename, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(data)

    # Progress message so the user can follow the scraping run.
    print(f"Data written to {filename}")

# Function to scrape content from individual speech pages
def scrape_content_website(url, timeout=30):
    """Download one speech page and return the text of its paragraphs.

    Args:
        url: Address of the speech page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The text of every ``<p>`` tag joined with newlines, or ``None`` (after
        printing the reason) when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures follow the same "print and return None" contract
        # as a non-200 response.
        print(f"Failed to fetch the page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return '\n'.join(p.get_text() for p in soup.find_all('p'))

# Function to perform scraping across multiple pages
def _items_per_page(base_url):
    """Return the page size encoded in *base_url* as an int, or None if absent."""
    import re
    from urllib.parse import parse_qs, urlsplit

    values = parse_qs(urlsplit(base_url).query).get('items_per_page')
    candidate = values[-1] if values else None
    if candidate is None:
        # Fall back to the older convention: the URL simply ends in the page size.
        trailing = re.search(r'\d+\s*$', base_url)
        candidate = trailing.group() if trailing else None
    try:
        return int(candidate)
    except (TypeError, ValueError):
        return None


def _page_url(base_url, page_index):
    """Return the URL of the zero-based listing page *page_index*."""
    if page_index == 0:
        # The first listing page is the base URL itself.
        return base_url
    separator = '&' if '?' in base_url else '?'
    return f'{base_url}{separator}page={page_index}'


def scraping(n, base_url):
    """Scrape the first *n* listing pages reachable from *base_url*.

    Args:
        n: Number of listing pages to scrape; must be at least 1.
        base_url: URL of the first listing page, normally carrying an
            ``items_per_page`` query parameter.

    Returns:
        A tuple of five lists ``(dates, related persons, titles, urls,
        contents)`` accumulated over all scraped pages. All five are empty
        when *n* is smaller than 1.
    """
    results = ([], [], [], [], [])

    # Handle the case in which a number lesser than 1 is given by the user
    if n < 1:
        print('Error: Please provide a number greater than or equal to 1')
        return results

    # None means "page size unknown"; slicing with None keeps every result.
    num_results_per_page = _items_per_page(base_url)

    # Drupal-style pagers number pages from zero, so the first n pages are
    # indices 0..n-1 (requesting page=1..n would skip the first page).
    for page_index in range(n):
        page_results = scrape(_page_url(base_url, page_index), num_results_per_page)
        for collected, page_values in zip(results, page_results):
            collected.extend(page_values)

    return results

def _read_scratch_lines(filename):
    """Return the lines of *filename* (newline endings kept), or [] if unreadable."""
    try:
        with open(filename, 'r', encoding='utf-8') as read_file:
            return list(read_file)
    except FileNotFoundError:
        print(f"The file '{filename}' does not exist.")
    except (OSError, UnicodeError) as e:
        print(f"An error occurred: {e}")
    return []


def scrape(url_to_scrape, num_results_per_page):
    """Scrape one listing page and the speech pages it links to.

    Args:
        url_to_scrape: URL of the listing page.
        num_results_per_page: Maximum number of dates, URLs and speech
            contents to keep; ``None`` keeps all of them.

    Returns:
        Five lists in the order ``(dates, related persons, titles, urls,
        contents)``. All five are empty when the listing page cannot be
        fetched. A speech page that cannot be fetched contributes an empty
        string so that contents stay aligned with urls.
    """
    # Scratch file: scraped text is written here and parsed back line by line.
    output_filename = 'output.txt'

    listing = scrape_main_website(url_to_scrape)
    if listing is None:
        # The listing fetch failed (already reported); nothing to parse.
        return [], [], [], [], []
    scraped_data, url_list, date_list = listing

    lst = []
    if scraped_data:
        write_to_file(scraped_data, output_filename)
        lst = [line.strip() for line in _read_scratch_lines(output_filename)]
    # Only parse the scratch file when it was just written; otherwise it would
    # still hold text left over from an earlier page or speech.

    # Drop the first two and last three lines -- presumably page boilerplate
    # around the result list; TODO confirm against the live page layout.
    lst = lst[2:len(lst)-3]
    title_lst = lst[::2]  # List of all titles on the page (even positions)
    related_lst = lst[1::2]  # List of the related persons (odd positions)
    # NOTE(review): titles/related persons are not truncated to
    # num_results_per_page, unlike the other three lists.

    # Only fetch the speeches that are actually returned.
    url_list = url_list[:num_results_per_page]

    content_lst = []  # List of speech content for the page - for all kept links
    for speech_url in url_list:
        scraped_speech_data = scrape_content_website(speech_url)

        if not scraped_speech_data:
            # Failed or empty fetch: keep the position with an empty entry
            # instead of writing None or re-reading the previous speech.
            content_lst.append('')
            continue

        write_to_file(scraped_speech_data, output_filename)
        content_lst.append(''.join(
            line
            for line in _read_scratch_lines(output_filename)
            if 'The American Presidency Project' not in line
            and 'Twitter Facebook' not in line
        ))

    return date_list[:num_results_per_page], related_lst, title_lst, url_list, content_lst


#User input
url = input('Enter the base url of the website:').strip()
num_of_pages = int(input('Enter the number of pages for which you want to extract the data:'))
date_list, related_lst, title_lst, url_list, content_lst = scraping(num_of_pages, url)

# One column per scraped attribute; pandas requires all lists to have the same length.
data = {
    'Speech Link': url_list,
    'Date of Speech': date_list,
    'Speech Title': title_lst,
    'Related Person': related_lst,
    'Speech Content': content_lst
}

scraped_data_df = pd.DataFrame(data)

# Store the DataFrame as a CSV file on the mounted Drive and download that same
# file (writing to one path and downloading another fails with a missing file).
csv_file_path = '/content/drive/My Drive/scraped_data.csv'
scraped_data_df.to_csv(csv_file_path, index=False)
files.download(csv_file_path)

#Uncomment to store the DataFrame to a Feather file
#A Feather file is more efficient than a CSV file in terms of memory and time
#feather_file_path = 'scraped_data.feather'
#scraped_data_df.to_feather(feather_file_path)
# files.download('/content/drive/My Drive/scraped_data.feather')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter the base url of the website:https://www.presidency.ucsb.edu/documents/app-categories/elections-and-transitions/campaign-documents?items_per_page=60
Enter the number of pages for which you want to extract the data:1
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Data written to output.txt
Da

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Code to scrape data from specific page intervals

# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import drive
from google.colab import files

# Mount Google Drive at /content/drive; the download step at the end of this
# cell reads its file from a path under this mount point.
drive.mount('/content/drive')

# Function to scrape the main website for speech links and dates
def scrape_main_website(url, timeout=30):
    """Fetch a listing page and extract its paragraph text, speech links and dates.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``(data, url_list, date_list)`` tuple on success, where ``data`` is
        the text of every ``<p>`` tag joined with newlines, ``url_list`` holds
        the absolute URLs of the speech links and ``date_list`` the text of
        every ``span.date-display-single`` element. Returns ``None`` (after
        printing the reason) when the page cannot be fetched.
    """
    from urllib.parse import urljoin

    site_root = 'https://www.presidency.ucsb.edu'
    # Links under /documents/ that are navigation or category pages rather than speeches.
    excluded_fragments = (
        'presidential-documents-archive-guidebook',
        'category-attributes',
        'app-categories',
    )

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures follow the same "print and return None" contract
        # as a non-200 response.
        print(f"Failed to fetch the page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of every paragraph on the page, one paragraph per line.
    data = '\n'.join(p.get_text() for p in soup.find_all('p'))

    url_list = []  # Absolute URLs of the speech pages linked from this listing
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/documents/' not in href:
            continue
        if any(fragment in href for fragment in excluded_fragments):
            continue
        # urljoin resolves relative hrefs against the site root and leaves
        # already-absolute hrefs untouched (plain concatenation would not).
        url_list.append(urljoin(site_root, href))

    # Dates are rendered in <span class="date-display-single"> elements.
    date_list = [
        date.text
        for date in soup.find_all('span', class_='date-display-single')
    ]

    return data, url_list, date_list

# Function to write data to a file
def write_to_file(data, filename):
    """Overwrite *filename* with the text *data* (UTF-8) and report it on stdout."""
    with open(filename, mode='w', encoding='utf-8') as out_handle:
        out_handle.write(data)

    # Progress message so the user can follow the scraping run.
    print(f"Data written to {filename}")

# Function to scrape content from individual speech pages
def scrape_content_website(url, timeout=30):
    """Download one speech page and return the text of its paragraphs.

    Args:
        url: Address of the speech page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The text of every ``<p>`` tag joined with newlines, or ``None`` (after
        printing the reason) when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network failures follow the same "print and return None" contract
        # as a non-200 response.
        print(f"Failed to fetch the page. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return '\n'.join(p.get_text() for p in soup.find_all('p'))

# Function to perform scraping across multiple pages
def _items_per_page(base_url):
    """Return the page size encoded in *base_url* as an int, or None if absent."""
    import re
    from urllib.parse import parse_qs, urlsplit

    values = parse_qs(urlsplit(base_url).query).get('items_per_page')
    candidate = values[-1] if values else None
    if candidate is None:
        # Fall back to the older convention: the URL simply ends in the page size.
        trailing = re.search(r'\d+\s*$', base_url)
        candidate = trailing.group() if trailing else None
    try:
        return int(candidate)
    except (TypeError, ValueError):
        return None


def _page_url(base_url, page_index):
    """Return the URL of the zero-based listing page *page_index*."""
    if page_index == 0:
        # The first listing page is the base URL itself.
        return base_url
    separator = '&' if '?' in base_url else '?'
    return f'{base_url}{separator}page={page_index}'


def scraping(start_page, end_page, base_url):
    """Scrape the listing pages *start_page* through *end_page* (inclusive).

    Page numbers are 1-based (page 1 is *base_url* itself) and are translated
    to the zero-based ``page`` query parameter used by Drupal-style pagers.

    Args:
        start_page: First page to scrape; must be at least 1.
        end_page: Last page to scrape; must not be smaller than *start_page*.
        base_url: URL of the first listing page, normally carrying an
            ``items_per_page`` query parameter.

    Returns:
        A tuple of five lists ``(dates, related persons, titles, urls,
        contents)`` accumulated over all scraped pages. All five are empty
        when the page range is invalid.
    """
    results = ([], [], [], [], [])

    # Handle the case where end_page is less than start_page
    if end_page < start_page:
        print('Error: The end_page is lesser than the start_page')
        return results
    if start_page < 1:
        print('Error: The start_page must be greater than or equal to 1')
        return results

    # None means "page size unknown"; slicing with None keeps every result.
    num_results_per_page = _items_per_page(base_url)

    # A single loop also covers start_page == end_page, so a one-page range
    # fetches the requested page rather than always the first one.
    for page_num in range(start_page, end_page + 1):
        page_results = scrape(_page_url(base_url, page_num - 1), num_results_per_page)
        for collected, page_values in zip(results, page_results):
            collected.extend(page_values)

    # Return the final scraped data
    return results

def _read_scratch_lines(filename):
    """Return the lines of *filename* (newline endings kept), or [] if unreadable."""
    try:
        with open(filename, 'r', encoding='utf-8') as read_file:
            return list(read_file)
    except FileNotFoundError:
        print(f"The file '{filename}' does not exist.")
    except (OSError, UnicodeError) as e:
        print(f"An error occurred: {e}")
    return []


def scrape(url_to_scrape, num_results_per_page):
    """Scrape one listing page and the speech pages it links to.

    Args:
        url_to_scrape: URL of the listing page.
        num_results_per_page: Maximum number of dates, URLs and speech
            contents to keep; ``None`` keeps all of them.

    Returns:
        Five lists in the order ``(dates, related persons, titles, urls,
        contents)``. All five are empty when the listing page cannot be
        fetched. A speech page that cannot be fetched contributes an empty
        string so that contents stay aligned with urls.
    """
    # Scratch file: scraped text is written here and parsed back line by line.
    output_filename = 'output.txt'

    listing = scrape_main_website(url_to_scrape)
    if listing is None:
        # The listing fetch failed (already reported); nothing to parse.
        return [], [], [], [], []
    scraped_data, url_list, date_list = listing

    lst = []
    if scraped_data:
        write_to_file(scraped_data, output_filename)
        lst = [line.strip() for line in _read_scratch_lines(output_filename)]
    # Only parse the scratch file when it was just written; otherwise it would
    # still hold text left over from an earlier page or speech.

    # Drop the first two and last three lines -- presumably page boilerplate
    # around the result list; TODO confirm against the live page layout.
    lst = lst[2:len(lst)-3]
    title_lst = lst[::2]  # List of all titles on the page (even positions)
    related_lst = lst[1::2]  # List of the related persons (odd positions)
    # NOTE(review): titles/related persons are not truncated to
    # num_results_per_page, unlike the other three lists.

    # Only fetch the speeches that are actually returned.
    url_list = url_list[:num_results_per_page]

    content_lst = []  # List of speech content for the page - for all kept links
    for speech_url in url_list:
        scraped_speech_data = scrape_content_website(speech_url)

        if not scraped_speech_data:
            # Failed or empty fetch: keep the position with an empty entry
            # instead of writing None or re-reading the previous speech.
            content_lst.append('')
            continue

        write_to_file(scraped_speech_data, output_filename)
        content_lst.append(''.join(
            line
            for line in _read_scratch_lines(output_filename)
            if 'The American Presidency Project' not in line
            and 'Twitter Facebook' not in line
        ))

    return date_list[:num_results_per_page], related_lst, title_lst, url_list, content_lst


#User input
url = input('Enter the base url of the website:').strip()
start_page = int(input('Enter the start page from which you want to extract the data:'))
end_page = int(input('Enter the end page till which you want to extract the data:'))
date_list, related_lst, title_lst, url_list, content_lst = scraping(start_page, end_page, url)

# One column per scraped attribute; pandas requires all lists to have the same length.
data = {
    'Speech Link': url_list,
    'Date of Speech': date_list,
    'Speech Title': title_lst,
    'Related Person': related_lst,
    'Speech Content': content_lst
}

scraped_data_df = pd.DataFrame(data)

# Store the DataFrame as a CSV file on the mounted Drive and download that same
# file (writing to one path and downloading another fails with a missing file).
csv_file_path = '/content/drive/My Drive/scraped_data.csv'
scraped_data_df.to_csv(csv_file_path, index=False)
files.download(csv_file_path)

#Uncomment to store the DataFrame to a Feather file
#A Feather file is more efficient than a CSV file in terms of memory and time
#feather_file_path = 'scraped_data.feather'
#scraped_data_df.to_feather(feather_file_path)
# files.download('/content/drive/My Drive/scraped_data.feather')