In [1]:
#Installing Libraries
!pip install beautifulsoup4
!pip install requests
!pip install pandas
!pip install lxml
!pip install matplotlib
!pip install numpy



In [2]:
# 'requests' library is used to make HTTP requests to a web page, allowing you to retrieve the HTML content of the page.
import requests

# 'BeautifulSoup' from the 'bs4' library is used to parse the HTML content retrieved from a web page. It provides methods to search and navigate the HTML tree, making it easier to extract the data you need.
from bs4 import BeautifulSoup

# 'pandas' is a powerful library for data manipulation and analysis. It's commonly used in web scraping to store and manipulate the scraped data in a tabular format (DataFrame), making it easier to analyze, filter, and export the data.
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np


In [3]:
# Fleet pages to scrape: the five listed rally years, newest first.
# Note that the list contains no 2020 or 2021 page.
urls = [
    'http://pacificpuddlejump.com/fleet.html', # 2023
    'http://pacificpuddlejump.com/Alumni/2022fleet.html', # 2022 
    'http://pacificpuddlejump.com/Alumni/2019fleet.html', # 2019
    'http://www.pacificpuddlejump.com/Alumni/2018fleet.html', # 2018
    'http://www.pacificpuddlejump.com/Alumni/2017fleet.html', # 2017
]

# Connectivity check: confirm every page can be fetched before scraping it.
for url in urls:
    try:
        # The timeout stops a stalled server from hanging the notebook forever.
        page = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Unreachable host, DNS failure, timeout, malformed URL, ... - report it
        # and carry on so the remaining URLs are still checked.
        print("Failed to fetch data from " + url + " (" + str(exc) + ")")
        continue

    if page.status_code == 200:  # 200 means the server returned the page
        print("Success")
    else:
        # The server answered, but not with the page (e.g. 404 for a wrong URL).
        print("Failed to fetch data from " + url)


Success
Success
Success
Success
Success


In [9]:
def get_year(url, default_year='2023'):
    """Return the four-digit year encoded in a fleet page URL.

    The archived pages are named like ``2022fleet.html``, so the year is the
    first four characters of the last path segment. The current page is just
    ``fleet.html`` and carries no year; for that page, and for any other file
    name that does not start with four digits, ``default_year`` is returned.

    Parameters
    ----------
    url : str
        Address of a fleet page.
    default_year : str, optional
        Year to report when the file name has no leading year ('2023' by default).

    Returns
    -------
    str
        The year as a string, e.g. '2019'.
    """
    filename = url.split('/')[-1]  # last path segment, e.g. '2019fleet.html'
    year_part = filename[:4]
    # Accept the prefix only if it really is a four-digit year; otherwise
    # (e.g. 'flee' from 'fleet.html') fall back to the default.
    if len(year_part) == 4 and year_part.isascii() and year_part.isdigit():
        return year_part
    return default_year

def process_row(row):
    """Collect the text of every <td> cell in one table row.

    ``row`` is expected to offer ``find_all('td')``; each returned cell's
    ``.text`` has its leading and trailing whitespace stripped.
    Returns the cleaned strings as a list, in the order the cells were found.
    """
    values = []
    for cell in row.find_all('td'):
        values.append(cell.text.strip())
    return values

def scrape_url(url, timeout=30):
    """Scrape the fleet table from a single page.

    Parameters
    ----------
    url : str
        Address of the fleet page.
    timeout : float, optional
        Seconds to wait for the server before giving up (30 by default).

    Returns
    -------
    list of list of str
        One record per table row: the row's cell texts followed by the year
        derived from the URL. The first row of the table is skipped as the
        header, and rows without any <td> cells are ignored.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched or the server returns an error status.
    ValueError
        If the page contains no table with id 'fleet'.
    """
    # Fetch the raw HTML; the timeout prevents an unresponsive server from
    # blocking the notebook indefinitely.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page

    soup = BeautifulSoup(page.text, 'lxml')  # parse the HTML into a searchable tree
    table = soup.find('table', id='fleet')   # the table holding the fleet list
    if table is None:
        raise ValueError(f"No <table id='fleet'> found at {url}")

    year = get_year(url)
    records = []
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cells = process_row(row)
        if cells:  # a row with no <td> cells would yield a bogus [year]-only record
            records.append([*cells, year])  # year goes last, after the cell values
    return records



In [11]:
# Gather the rows from every fleet page into one flat list of records.
final_data = []
for url in urls:
    # += on a list adds each scraped row individually (same effect as extend),
    # rather than nesting the whole per-page list as a single element.
    final_data += scrape_url(url)

# Column names, in the same order as the values inside each record.
headers = ['Boat Name', 'Boat Model', 'Homeport', "Owner's Name", 'Departing From', 'When', 'Year']

# Build the table and index it by boat name. set_index returns a new frame,
# which is bound to df directly instead of modifying an existing one in place.
df = pd.DataFrame(final_data, columns=headers).set_index("Boat Name")

print(df)  # Display the final DataFrame
#PANDAS.APPLY

                      Boat Model                Homeport  \
Boat Name                                                  
Aloha Toa        Wharram Tiki 30                  La Paz   
Ana Maria    Pacific Seacraft 34             Seattle, WA   
Arctic Tern     Island Packet 40              Kailua, HI   
Beleza                  Hylas 46           Hermosa Beach   
Best Life             Leopard 47      Salt Lake City, UT   
...                          ...                     ...   
Windsong                   CT 47            Portland, OR   
Wiz                      Swan 44             Seattle, WA   
Xirene            X-Yachts X-482         Muiderzand, NED   
Y2K          Beneteau Oceanis 50  Marina di Carrara, ITA   
Zatara       Beneteau Oceanis 55            Montana, USA   

                            Owner's Name   Departing From            When  \
Boat Name                                                                   
Aloha Toa               Tymoteo Shaddock             Baja        

This code block performs the following tasks to scrape fleet information from five different URLs:
1.	Initialization: Creates an empty list final_data to store the scraped data.
2.	URL Iteration: Iterates through the provided URLs, sending a GET request and parsing the HTML content.
3.	Year Extraction: Extracts the year from the URL.
4.	Table Identification: Finds the table containing the fleet information.
5.	Row Iteration: Iterates through the table rows, extracting text from each cell and appending it to final_data along with the year.
6.	DataFrame Creation: Constructs a DataFrame from the combined data with specific column headers.
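7.	Year Replacement: The first URL (fleet.html) has no year in its file name, so its first four characters are 'flee'; get_year maps this to '2023' while extracting the year (step 3), before the DataFrame is built, so the 'Year' column never contains 'flee'.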
8.	Index Setting: Sets the DataFrame index to 'Boat Name'.
9.	Output: Prints the DataFrame, showing the combined data from all five tables, including the additional year column.


In [6]:
# Write the combined fleet table to disk as CSV.
# index=True (the default) keeps the 'Boat Name' index as the first column of the file.
df.to_csv(path_or_buf='combinedSauce.csv', index=True)
