In [1]:
import sys
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

# --- Helper functions for processing web scraped HTML table data ---

def date_time(table_cells):
    """
    Return the first two text fragments of a table cell, whitespace-stripped.

    ``table_cells`` must expose a ``strings`` iterable of text fragments.
    The caller in this script reads the result as ``[date, time]``. When the
    cell holds fewer than two fragments the returned list is correspondingly
    shorter.
    """
    fragments = []
    for text in table_cells.strings:
        fragments.append(text.strip())
    return fragments[:2]

def booster_version(table_cells):
    """
    Return the booster version text assembled from a table cell.

    ``table_cells`` must expose a ``strings`` iterable. Only the fragments at
    even positions (0, 2, 4, ...) are considered, the last of those is
    discarded, and the rest are concatenated. The result is an empty string
    when nothing is left, which the caller treats as "not found".
    """
    fragments = list(table_cells.strings)
    even_fragments = fragments[::2]
    return ''.join(even_fragments[:-1])

def landing_status(table_cells):
    """
    Return the landing status from the HTML table cell.

    ``table_cells`` must expose a ``strings`` iterable. The first text
    fragment is returned unchanged (no stripping). If the cell contains no
    text at all, ``None`` is returned instead of raising ``IndexError``, so a
    single empty cell does not abort the whole scrape.
    """
    # Only the first fragment is needed, so stop iterating as soon as it is found.
    return next(iter(table_cells.strings), None)

def get_mass(table_cells):
    """
    Extract the payload mass text from a table cell.

    The cell's ``text`` is NFKD-normalised and stripped. If the result
    contains "kg", everything up to and including the first "kg" is returned
    (for example "4,700 kg"); otherwise the whole cleaned text is returned.

    Note the mixed return type: an empty cell yields the integer 0, every
    other case yields a string.
    """
    cleaned = unicodedata.normalize("NFKD", table_cells.text).strip()
    if not cleaned:
        return 0
    # partition() leaves `unit` empty when "kg" is absent, so this yields the
    # full text in that case and the text up to and including "kg" otherwise.
    amount, unit, _ = cleaned.partition("kg")
    return amount + unit

def extract_column_from_header(row):
    """
    Build a column name from an HTML table header cell.

    The cell's ``br``, ``a`` and ``sup`` elements (one of each, as resolved by
    attribute access) are extracted from the cell when present, so ``row`` is
    modified in place. The remaining ``contents`` are then joined with single
    spaces and stripped.

    Returns the cleaned name, or ``None`` when the name consists solely of
    digits. An empty name is returned as ``''``; it is up to the caller to
    discard it.
    """
    for tag_name in ("br", "a", "sup"):
        child = getattr(row, tag_name)
        if child:
            child.extract()

    column_name = ' '.join(row.contents).strip()
    if column_name.isdigit():
        return None
    return column_name

# --- Main Assignment Code ---

# Define the static URL for the Wikipedia page snapshot (pinned revision via oldid)
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

# TASK 1: Request the Falcon9 Launch Wiki page from its URL
print(f"Fetching HTML content from: {static_url}")
# requests has no default timeout; without one an unresponsive server would
# hang the script indefinitely.
response = requests.get(static_url, timeout=30)
print(f"HTTP Response Status Code: {response.status_code}")
# Fail fast on 4xx/5xx responses instead of parsing an error page and hitting
# a confusing failure later when the expected tables are missing.
response.raise_for_status()

# Create a BeautifulSoup object from the HTML response
soup = BeautifulSoup(response.text, 'html.parser')
print("BeautifulSoup object 'soup' created.")

# Print the page title to verify; tolerate a document without a <title>
page_title = soup.title.string if soup.title else None
print(f"Page Title: {page_title}")

# TASK 2: Extract all column/variable names from the HTML table header

# Collect every <table> element on the page
html_tables = soup.find_all('table')
print(f"Found {len(html_tables)} tables on the page.")

# The third table (index 2) is the one containing the launch data
first_launch_table = html_tables[2]

# The first <tr> of that table is its header row
header_row = first_launch_table.find('tr')

# Clean every header cell and keep only the usable names: the helper returns
# None for digit-only headers and '' for empty ones, and filter(None, ...)
# drops both.
column_names = list(
    filter(None, map(extract_column_from_header, header_row.find_all('th')))
)
print("\nExtracted Column Names:", column_names)


# TASK 3: Create a data frame by parsing the launch HTML tables

# Start from the extracted header names (every value is None at this point)
launch_dict = dict.fromkeys(column_names)

# The combined date/time header is dropped; separate 'Date' and 'Time'
# columns are added below instead
del launch_dict['Date and time ( )']

# Give every column that will be populated its own empty list. The first
# seven keys already exist and keep their position; the last four are the
# additional columns specified in the assignment and are appended in this order.
launch_dict.update({
    column: []
    for column in (
        'Flight No.',
        'Launch site',
        'Payload',
        'Payload mass',
        'Orbit',
        'Customer',
        'Launch outcome',
        'Version Booster',
        'Booster landing',
        'Date',
        'Time',
    )
})
print("\nInitialized launch_dict for data population.")

def _first_string(cell):
    """Return the first text fragment of *cell*, stripped, or None if the cell has no text."""
    first = next(iter(cell.strings), None)
    return first.strip() if first is not None else None


def _link_text(cell):
    """Return the ``string`` of the first <a> inside *cell*, or None if the cell has no link."""
    link = cell.a
    return link.string if link else None


extracted_row = 0
# Walk every launch table on the page
for table in soup.find_all('table', "wikitable plainrowheaders collapsible"):
    for row in table.find_all("tr"):
        # A launch record is a row whose first header cell holds a plain
        # number (the flight number); every other row is skipped.
        header_cell = row.th
        if not header_cell or not header_cell.string:
            continue
        flight_number = header_cell.string.strip()
        if not flight_number.isdigit():
            continue

        # Data cells of the row, in the table's column order
        cells = row.find_all('td')
        extracted_row += 1

        # Flight Number value
        launch_dict['Flight No.'].append(flight_number)

        # Date and time share the first cell; the date carries a trailing comma
        date_and_time = date_time(cells[0])
        launch_dict['Date'].append(date_and_time[0].strip(','))
        launch_dict['Time'].append(date_and_time[1])

        # Booster version: fall back to the cell's link text when the helper
        # finds nothing
        bv = booster_version(cells[1])
        if not bv:
            bv = _link_text(cells[1])
        launch_dict['Version Booster'].append(bv)

        # Launch Site
        launch_dict['Launch site'].append(_link_text(cells[2]))

        # Payload
        launch_dict['Payload'].append(_link_text(cells[3]))

        # Payload Mass
        launch_dict['Payload mass'].append(get_mass(cells[4]))

        # Orbit
        launch_dict['Orbit'].append(_link_text(cells[5]))

        # Customer: prefer the first link anywhere in the cell, otherwise the
        # first piece of plain text. Attribute access such as `cell.a` searches
        # all descendants, so a link nested in div > ul > li is already covered
        # and needs no separate branch.
        customer_cell = cells[6]
        if customer_cell.a:
            customer = customer_cell.a.string
        else:
            customer = _first_string(customer_cell)
        launch_dict['Customer'].append(customer)

        # Launch outcome
        launch_dict['Launch outcome'].append(_first_string(cells[7]))

        # Booster landing
        launch_dict['Booster landing'].append(landing_status(cells[8]))

print(f"\nCompleted filling launch_dict with {extracted_row} launch records.")

# Create DataFrame from launch_dict. Each column is wrapped in a Series first,
# so columns of unequal length are aligned on their index rather than
# rejected.
series_by_column = {
    column: pd.Series(values) for column, values in launch_dict.items()
}
df = pd.DataFrame(series_by_column)

print("\nFirst 5 rows of the created DataFrame:")
preview = df.head()
print(preview)
print("\nDataFrame shape:", df.shape)

# Export the DataFrame to CSV.
# The path is relative, so the file lands in the current working directory
# of the notebook; the index is not written.
output_filename = 'spacex_launch_data.csv'
df.to_csv(output_filename, index=False)
print(f"\nDataFrame successfully exported to '{output_filename}' in the current working directory.")


Fetching HTML content from: https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922
HTTP Response Status Code: 200
BeautifulSoup object 'soup' created.
Page Title: List of Falcon 9 and Falcon Heavy launches - Wikipedia
Found 25 tables on the page.

Extracted Column Names: ['Flight No.', 'Date and time ( )', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome']

Initialized launch_dict for data population.

Completed filling launch_dict with 121 launch records.

First 5 rows of the created DataFrame:
  Flight No. Launch site                               Payload Payload mass  \
0          1       CCAFS  Dragon Spacecraft Qualification Unit            0   
1          2       CCAFS                                Dragon            0   
2          3       CCAFS                                Dragon       525 kg   
3          4       CCAFS                          SpaceX CRS-1     4,700 kg   
4          5       CCAFS  