## Exercises: Day 22

### Python Web Scraping

In [11]:
#1 
import requests
from bs4 import BeautifulSoup
# Boston University "facts & stats" page used for the first scraping exercise.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'
# requests never times out by default; an explicit timeout keeps this cell
# from hanging indefinitely if the server stops responding.
response = requests.get(url, timeout=10)
# 200 means the page was served successfully.
status = response.status_code
print(status)


200


In [13]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_bu_website(url, output_file, timeout=10):
    """Download a web page and save the text of its ``<p>`` elements as JSON.

    Args:
        url: Address of the page to fetch.
        output_file: Path of the JSON file to write. An existing file is
            overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        None. A success or failure message is printed instead.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Send a request to the website, bounded by the timeout
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and nothing is written
    if response.status_code != 200:
        print(f'Failed to retrieve data. Status code: {response.status_code}')
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the text of every paragraph on the page, in document order
    paragraphs = [p.get_text() for p in soup.find_all('p')]

    # Store the extracted data as a JSON array of strings;
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(paragraphs, json_file, ensure_ascii=False, indent=2)

    print(f'Data has been successfully scraped and stored in {output_file}')

# Page to scrape and the JSON file that will receive the extracted paragraph text.
bu_url = 'http://www.bu.edu/president/boston-university-facts-stats/'
output_json_file = 'bu_data.json'

# Run the scrape; the function prints a success or failure message and returns nothing.
scrape_bu_website(bu_url, output_json_file)


Data has been successfully scraped and stored in bu_data.json


In [7]:
#2
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def extract_table_to_json(url, output_json_file, timeout=10):
    """Fetch a page, take its first HTML table and save it as JSON Lines.

    Args:
        url: Address of the page to fetch.
        output_json_file: Path of the file to write. An existing file is
            overwritten. The content is one JSON object per table row, one
            row per line (``orient='records', lines=True``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        None. A success or failure message is printed instead.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Imported locally so the function does not rely on another cell's imports.
    from io import StringIO

    # Send a request to the website, bounded by the timeout
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and nothing is written
    if response.status_code != 200:
        print(f'Failed to retrieve data. Status code: {response.status_code}')
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Two pandas details matter here:
    #  * passing literal HTML to read_html is deprecated (pandas >= 2.1), so
    #    the markup is wrapped in a file-like StringIO object;
    #  * read_html raises ValueError instead of returning an empty list when
    #    the page has no <table>, so the error is caught to make the
    #    "no tables" message below actually reachable.
    try:
        tables = pd.read_html(StringIO(str(soup)))
    except ValueError:
        tables = []

    if not tables:
        print('No tables found on the webpage.')
        return

    # Assuming the first table is the one you want (modify if needed)
    data_table = tables[0]

    # Convert the table to JSON with each record on a new line
    json_data = data_table.to_json(orient='records', lines=True)

    # Store the JSON data in a file
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

    print(f'Table data has been successfully extracted and stored in {output_json_file}')

# UCI dataset page to read and the file that will receive its first table.
# NOTE: output_json_file is rebound here; the earlier BU cell set it to 'bu_data.json'.
uci_url = 'https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik'
output_json_file = 'uci_dataset.json'

# Run the extraction; the file is written as one JSON record per line (JSON Lines),
# so it must be read line by line rather than with a single json.load().
extract_table_to_json(uci_url, output_json_file)


Table data has been successfully extracted and stored in uci_dataset.json


In [5]:
#3
import requests
from bs4 import BeautifulSoup
import json

def scrape_presidents_table(url, output_json_file, timeout=10):
    """Save the rows of the first ``wikitable`` on a page as a JSON list.

    Args:
        url: Address of the page to fetch.
        output_json_file: Path of the JSON file to write. An existing file is
            overwritten. The content is a list of rows, each row being a list
            of the stripped text of its ``<td>``/``<th>`` cells.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive host.

    Returns:
        None. A success or failure message is printed instead.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # Send a request to the page, bounded by the timeout
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and nothing is written
    if response.status_code != 200:
        print(f'Failed to retrieve data. Status code: {response.status_code}')
        return

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first table carrying the "wikitable" class is considered
    presidents_table = soup.find('table', {'class': 'wikitable'})

    if not presidents_table:
        print('Presidents table not found on the webpage.')
        return

    # One list of cell texts per row; [1:] skips the header row.
    # Cells are taken as-is, so rows may differ in length.
    presidents_data = [
        [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
        for row in presidents_table.find_all('tr')[1:]
    ]

    # Store the extracted data as a JSON file;
    # ensure_ascii=False keeps non-ASCII characters readable in the file
    with open(output_json_file, 'w', encoding='utf-8') as json_file:
        json.dump(presidents_data, json_file, ensure_ascii=False, indent=2)

    print(f'Data has been successfully scraped and stored in {output_json_file}')

# Wikipedia page to scrape and the JSON file that will receive the table rows.
# NOTE: output_json_file is rebound again here; earlier cells used the same name
# for 'bu_data.json' and 'uci_dataset.json'.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'
output_json_file = 'presidents_data.json'

# Run the scrape; each table row is stored as a list of cell texts (header row skipped).
scrape_presidents_table(wiki_url, output_json_file)


Data has been successfully scraped and stored in presidents_data.json
