# EXERCISES: Day 22

In [19]:
import requests
from bs4 import BeautifulSoup
import json

In [20]:
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Make a GET request to the URL
response = requests.get(url)
# check if the request was successful
status = response.status_code
print(status)

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the relevant data from the webpage
data = {}
for fact in soup.find_all('div', class_='info-panel'):
    category = fact.find('div', class_='category').text.strip()
    value = fact.find('div', class_='value').text.strip()
    data[category] = value

# Store the data as a JSON file
if status == 200:
    with open('bu_facts.json', 'w') as json_file:
        json.dump(data, json_file, indent=2)

    print('Data scraped and saved as bu_facts.json')
else:
    print(f'Failed to retrieve data. Status Code: {status}')


200
Data scraped and saved as bu_facts.json


In [21]:
# Specify the filename
filename = 'bu_facts.json'

# Open and read the JSON file
with open(filename, 'r') as json_file:
    data = json.load(json_file)


# print the data
print(data)


{}


In [22]:
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Make a GET request to the URL
response = requests.get(url)

# check if the request was successful
status = response.status_code
print(status)

if status == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the table data
    table = soup.find('table')

    # Process table data into a list of dictionaries
    table_data = []
    headers = [header.text.strip() for header in table.find_all('th')]
    for row in table.find_all('tr')[1:]:
        row_data = {}
        for idx, cell in enumerate(row.find_all('td')):
            row_data[headers[idx]] = cell.text.strip()
        table_data.append(row_data)

    # Store the data as a JSON file
    with open('uci_datasets.json', 'w') as json_file:
        json.dump(table_data, json_file, indent=2)

    print('Table data scraped and saved as uci_datasets.json')
else:
    print(f'Failed to retrieve data. Status Code: {status}')


404
Failed to retrieve data. Status Code: 404


In [23]:
def scrape_presidents_table(url):
    # Specify a user-agent in the headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    
    # Fetch page content using requests with specified headers
    response = requests.get(url, headers=headers)

    # check if the request was successful
    status = response.status_code

    if status != 200:
        print(f'Failed to retrieve data. Status Code: {status}')
        return None

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the Presidents table
    presidents_table = soup.find('table', {'class': 'wikitable'})

    if not presidents_table:
        print("Presidents table not found.")
        return None

    # Extract the table data
    table_data = []
    headers = None
    for row_num, row in enumerate(presidents_table.find_all('tr')):
        if row_num == 0:
            # Extract headers from the first row
            headers = [header.text.strip() for header in row.find_all(['th', 'td'])]
        else:
            # Extract data from subsequent rows
            row_data = {}
            for idx, cell in enumerate(row.find_all(['th', 'td'])):
                if idx < len(headers):
                    row_data[headers[idx]] = cell.text.strip()
            table_data.append(row_data)

    return table_data

def save_as_json(data, filename):
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=2)

url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
presidents_data = scrape_presidents_table(url)

if presidents_data:
    save_as_json(presidents_data, 'presidents_data.json')
    print('Data scraped and saved as presidents_data.json')
else:
    print('Failed to retrieve and process data.')


Data scraped and saved as presidents_data.json


In [24]:

# Specify the filename
filename = 'presidents_data.json'

# Open and read the JSON file
with open(filename, 'r') as json_file:
    data = json.load(json_file)


# print the data
print(data)


[{'No.[a]': '1', 'Portrait': '', 'Name(Birth–Death)': 'George Washington(1732–1799)[17]', 'Term[14]': 'April 30, 1789–March 4, 1797', 'Party[b][15]': '', 'Election': 'Unaffiliated', 'Vice President[16]': '1788–1789\n\n1792'}, {'No.[a]': '2', 'Portrait': '', 'Name(Birth–Death)': 'John Adams(1735–1826)[19]', 'Term[14]': 'March 4, 1797–March 4, 1801', 'Party[b][15]': '', 'Election': 'Federalist', 'Vice President[16]': '1796'}, {'No.[a]': '3', 'Portrait': '', 'Name(Birth–Death)': 'Thomas Jefferson(1743–1826)[21]', 'Term[14]': 'March 4, 1801–March 4, 1809', 'Party[b][15]': '', 'Election': 'Democratic-Republican', 'Vice President[16]': '1800\n\n1804'}, {'No.[a]': '4', 'Portrait': '', 'Name(Birth–Death)': 'James Madison(1751–1836)[22]', 'Term[14]': 'March 4, 1809–March 4, 1817', 'Party[b][15]': '', 'Election': 'Democratic-Republican', 'Vice President[16]': '1808\n\n1812'}, {'No.[a]': '5', 'Portrait': '', 'Name(Birth–Death)': 'James Monroe(1758–1831)[24]', 'Term[14]': 'March 4, 1817–March 4, 1