#  Web Scraping Exercises:

# Question 1:
Scrape the following website and store the data as a JSON file (url = 'http://www.bu.edu/president/boston-university-facts-stats/').

In [4]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the Boston University facts & stats page and save every
# title/value pair found as a list of {"title", "value"} objects in a JSON file.

# URL of the webpage to scrape
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Send a GET request to fetch the HTML content of the page
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
else:
    print("Page fetched successfully!")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each fact is looked up as a <div class="fact-list-item"> wrapping a
    # title div and a value div.
    # NOTE(review): these class names are assumed, not confirmed against the
    # live page; if they do not match, nothing is extracted.
    facts_section = soup.find_all('div', class_='fact-list-item')

    # Collected {"title": ..., "value": ...} records
    facts_data = []

    for fact in facts_section:
        # Title element (e.g., "Founded", "Number of students", etc.)
        fact_title = fact.find('div', class_='fact-list-item-title')
        # Value element (e.g., "1839", "32,000", etc.)
        fact_value = fact.find('div', class_='fact-list-item-value')

        # Skip items that are missing either half of the pair
        if fact_title and fact_value:
            facts_data.append({
                "title": fact_title.get_text(strip=True),
                "value": fact_value.get_text(strip=True),
            })

    # Store the facts data as a JSON file (explicit encoding so the result
    # does not depend on the platform's default locale)
    json_file_path = 'boston_university_facts.json'

    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(facts_data, json_file, indent=4)

    # Only claim success when something was actually extracted; an empty
    # result almost certainly means the selectors above did not match.
    if facts_data:
        print(f"Data successfully scraped and stored in {json_file_path}")
    else:
        print(f"Warning: no facts matched the expected selectors; {json_file_path} contains an empty list")


Page fetched successfully!
Data successfully scraped and stored in boston_university_facts.json


# Question 2:
Extract the table at this URL (https://archive.ics.uci.edu/ml/datasets.php) and convert it to a JSON file.

In [3]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the dataset listing table from the UCI Machine Learning Repository
# and save one {"name", "link", "info"} object per table row to a JSON file.

# Imported here so this cell stays self-contained; urljoin resolves hrefs
# (relative or absolute) against the page URL.
from urllib.parse import urljoin

# URL for the UCI Machine Learning Repository dataset page
# NOTE(review): the recorded run of this cell got HTTP 404 for this URL, so
# the page may have moved; confirm the current location before relying on it.
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Send a GET request to fetch the HTML content of the page
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
else:
    print("Page fetched successfully!")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table containing the datasets
    table = soup.find('table', {'cellspacing': '2'})

    if table is None:
        # soup.find() returns None when nothing matches; report it instead of
        # crashing with AttributeError on table.find_all below.
        print(f"No table with cellspacing='2' found at {url}; nothing was written.")
    else:
        # Extract all the rows of the table (skipping the header row)
        rows = table.find_all('tr')[1:]

        # Collected {"name", "link", "info"} records
        datasets = []

        for row in rows:
            cols = row.find_all('td')

            # Only rows with at least two cells carry a name and a description
            if len(cols) > 1:
                anchor = cols[0].find('a')
                # .get() avoids a KeyError on an <a> tag without an href
                href = anchor.get('href') if anchor else None

                datasets.append({
                    'name': cols[0].get_text(strip=True),
                    # urljoin handles relative, root-relative and absolute
                    # hrefs; plain string concatenation only worked for
                    # root-relative ones.
                    'link': urljoin(url, href) if href else None,
                    'info': cols[1].get_text(strip=True),
                })

        # Store the datasets information in a JSON file (explicit encoding so
        # the result does not depend on the platform's default locale)
        json_file_path = 'uci_datasets.json'

        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(datasets, json_file, indent=4)

        print(f"Data successfully scraped and stored in {json_file_path}")


Failed to retrieve data from https://archive.ics.uci.edu/ml/datasets.php. Status code: 404


# Question 3:
Scrape the presidents table and store the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very well structured, and the scraping may take a very long time.

In [2]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the first "wikitable" on Wikipedia's list of U.S. presidents and
# save one {"name", "term", "party"} object per qualifying row to a JSON file.

# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT_SECONDS = 30

# Send a GET request to fetch the HTML content of the page
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

# Check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
else:
    print("Page fetched successfully!")

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find() returns the first table carrying the 'wikitable' class
    table = soup.find('table', {'class': 'wikitable'})

    if table is None:
        # soup.find() returns None when nothing matches; report it instead of
        # crashing with AttributeError on table.find_all below.
        print(f"No table with class 'wikitable' found at {url}; nothing was written.")
    else:
        # Extract all rows of the table (excluding the header row)
        rows = table.find_all('tr')[1:]

        # Collected {"name", "term", "party"} records
        presidents_data = []

        for row in rows:
            cols = row.find_all('td')

            # Requiring more than three <td> cells guarantees indices 1-3
            # exist, so no IndexError handling is needed for the lookups.
            # NOTE(review): the index-to-field mapping (1=name, 2=term,
            # 3=party) is taken from the original cell; verify it against
            # the live table layout.
            if len(cols) > 3:
                presidents_data.append({
                    'name': cols[1].get_text(strip=True),
                    'term': cols[2].get_text(strip=True),
                    'party': cols[3].get_text(strip=True),
                })

        # Save the data as a JSON file (explicit encoding so the result does
        # not depend on the platform's default locale)
        json_file_path = 'us_presidents.json'
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(presidents_data, json_file, indent=4)

        print(f"Data successfully scraped and stored in {json_file_path}")


Page fetched successfully!
Data successfully scraped and stored in us_presidents.json
