In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import unicodedata

# Function to extract the date and time from the HTML table cell
def date_time(table_cells):
    """Return the first two text fragments of a table cell, stripped.

    Args:
        table_cells: An object exposing an iterable ``strings`` attribute
            (for example a BeautifulSoup tag).

    Returns:
        A list holding at most two whitespace-stripped strings; the caller
        treats them as the date and the time.
    """
    fragments = []
    for fragment in table_cells.strings:
        fragments.append(fragment.strip())
    return fragments[:2]

# Function to extract the booster version from the HTML table cell
def booster_version(table_cells):
    """Build the booster version text from a table cell.

    The even-indexed text fragments of the cell are collected, the last of
    those is discarded, and the remainder is concatenated.
    (Presumably the skipped fragments are footnote markers and the dropped
    tail is trailing whitespace; verify against the page markup.)

    Args:
        table_cells: An object exposing an iterable ``strings`` attribute.

    Returns:
        The concatenated string, or ``''`` when fewer than two
        even-indexed fragments exist.
    """
    even_fragments = list(table_cells.strings)[::2]
    kept = even_fragments[:-1]
    return ''.join(kept)

# Function to extract the landing status from the HTML table cell
def landing_status(table_cells):
    """Return the first text fragment of a table cell.

    Args:
        table_cells: An object exposing an iterable ``strings`` attribute
            (for example a BeautifulSoup tag).

    Returns:
        The first string yielded by ``table_cells.strings``, or ``''`` when
        the cell contains no text at all.
    """
    # next() with a default reads only the first fragment and returns ''
    # for an empty cell instead of raising IndexError, so a single blank
    # cell cannot abort the whole scrape.
    return next(iter(table_cells.strings), '')

# Function to extract the mass of the payload from the HTML table cell
def get_mass(table_cells):
    """Extract the payload mass text (up to and including "kg") from a cell.

    The cell text is NFKD-normalized and stripped before searching.

    Args:
        table_cells: An object exposing a ``text`` string attribute.

    Returns:
        The normalized text from its start through the first "kg", or the
        integer ``0`` when the text is empty or contains no "kg".
    """
    normalized = unicodedata.normalize("NFKD", table_cells.text).strip()
    # partition() leaves the separator empty when "kg" is absent, which also
    # covers the empty-text case.
    before, unit, _ = normalized.partition("kg")
    if not unit:
        return 0
    return before + unit

# Function to clean up the column names
def extract_column_from_header(row):
    """Return a cleaned column name taken from a table header cell.

    The first ``<br>``, ``<a>`` and ``<sup>`` found under ``row`` are each
    removed before the text is read. Note that this modifies ``row`` in
    place.

    Args:
        row: A header cell exposing ``br``, ``a``, ``sup`` and
            ``stripped_strings`` (for example a BeautifulSoup tag).

    Returns:
        The remaining text fragments joined by single spaces, or ``None``
        when that text is empty or consists only of digits.
    """
    # Look up each tag only after the previous one has been removed, so a
    # tag nested inside an already-removed element is not picked up.
    for tag_name in ("br", "a", "sup"):
        tag = getattr(row, tag_name)
        if tag:
            tag.extract()

    column_name = ' '.join(row.stripped_strings).strip()

    # ''.isdigit() is False, so the empty name needs its own check to be
    # filtered out together with the purely numeric ones.
    if not column_name or column_name.isdigit():
        return None
    return column_name

# URL of the Wikipedia page to scrape
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# Fetch the HTML content using requests. A timeout is passed because
# requests otherwise waits indefinitely on an unresponsive server.
response = requests.get(static_url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # One list per output column; every extracted row appends to all of them
    launch_dict = {
        'Flight No.': [],
        'Launch site': [],
        'Payload': [],
        'Payload mass': [],
        'Orbit': [],
        'Customer': [],
        'Launch outcome': [],
        'Version Booster': [],
        'Booster landing': [],
        'Date': [],
        'Time': []
    }

    print("Parsing the page...")
    # A CSS selector matches tables that carry all three classes in any
    # order and alongside extra classes. Passing the multi-word class string
    # to find_all() only matches that exact attribute value.
    # NOTE(review): if this still reports 0 tables, the live page markup has
    # probably changed - confirm against the page source.
    tables = soup.select("table.wikitable.plainrowheaders.collapsible")
    print(f"Found {len(tables)} tables.")

    # Walk every row of every matching table
    extracted_row = 0
    for table_number, table in enumerate(tables):
        print(f"Processing table {table_number + 1}...")
        for rows in table.find_all("tr"):
            # A data row is recognised by a purely numeric flight number in
            # its header cell; anything else (headings, notes) is skipped.
            header_text = rows.th.string if rows.th else None
            flight_number = header_text.strip() if header_text else ''
            if not flight_number.isdigit():
                continue

            cells = rows.find_all('td')
            # Cells 0-7 are indexed unconditionally below, so skip rows that
            # are too short instead of failing with IndexError.
            if len(cells) < 8:
                print(f"Skipping flight {flight_number}: expected at least 8 cells, found {len(cells)}")
                continue

            extracted_row += 1

            # Print debug output for row data
            print(f"Row {extracted_row}: {flight_number} - {cells}")

            # Extract the Date and Time
            datatimelist = date_time(cells[0])
            date = datatimelist[0].strip(',') if len(datatimelist) > 0 else ''
            time = datatimelist[1] if len(datatimelist) > 1 else ''

            # Extract other fields
            bv = booster_version(cells[1])
            if not bv:  # If booster version is missing, use the anchor text
                bv = cells[1].a.string if cells[1].a else ''

            launch_site = cells[2].a.string if cells[2].a else ''
            payload = cells[3].a.string if cells[3].a else ''
            payload_mass = get_mass(cells[4])
            orbit = cells[5].a.string if cells[5].a else ''
            customer = cells[6].a.string if cells[6].a else ''
            # .strings is a generator and has no len(); take its first item
            # with a default instead.
            launch_outcome = next(iter(cells[7].strings), '')
            booster_landing = landing_status(cells[8]) if len(cells) > 8 else ''

            # Append data to the dictionary
            launch_dict['Flight No.'].append(flight_number)
            launch_dict['Launch site'].append(launch_site)
            launch_dict['Payload'].append(payload)
            launch_dict['Payload mass'].append(payload_mass)
            launch_dict['Orbit'].append(orbit)
            launch_dict['Customer'].append(customer)
            launch_dict['Launch outcome'].append(launch_outcome)
            launch_dict['Version Booster'].append(bv)
            launch_dict['Booster landing'].append(booster_landing)
            launch_dict['Date'].append(date)
            launch_dict['Time'].append(time)

    # After processing all rows, check how many rows were extracted
    print(f"Total extracted rows: {extracted_row}")

    # Convert the dictionary into a pandas DataFrame
    df = pd.DataFrame({key: pd.Series(value) for key, value in launch_dict.items()})

    # Display the DataFrame
    print("Generated DataFrame:")
    print(df.head())
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

         

Parsing the page...
Found 0 tables.
Total extracted rows: 0
Generated DataFrame:
Empty DataFrame
Columns: [Flight No., Launch site, Payload, Payload mass, Orbit, Customer, Launch outcome, Version Booster, Booster landing, Date, Time]
Index: []
