<a href="https://colab.research.google.com/github/ThrishaJawahar/Falcon-9-Launch-Prediction/blob/main/WebScrapingDataCollection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Objectives


1.   Extract a Falcon 9 launch records HTML table from Wikipedia
2.   Parse the table and convert it into a Pandas data frame

In [1]:
!pip3 install beautifulsoup4
!pip3 install requests



In [2]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [7]:
import unicodedata

def landing_status(table_cells):
  landing = ''.join([status.strip() for status in table_cells.strings]).strip()
  return landing if landing else 'None'

def date_time(table_cells):
  return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
  out = ''.join([booster_version.strip() for i, booster_version in enumerate(table_cells.strings) if i % 2 == 0][0:-1])
  return out.strip()

def get_mass(table_cells):
  mass = unicodedata.normalize("NFKD", table_cells.text).strip()
  if mass:

        mass_index = mass.find("kg")
        if mass_index != -1:
            new_mass = mass[0:mass_index + 2]
        else:
            new_mass = '0'
  else:
      new_mass = '0'
  return new_mass

def extract_column_from_header(row):

    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    column_name = ' '.join(row.contents).strip()


    if not column_name.isdigit() and column_name.strip():
        return column_name
    else:
        return None



In [8]:
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"

TASK 1: Request the Falcon9 Launch Wiki page from its URL

In [10]:

response = requests.get(static_url)
response_status = response.status_code
print(response_status)


200


In [12]:
soup = BeautifulSoup(response.content, 'html.parser')
page_title = soup.title.string
print("Page Title:", page_title)


Page Title: List of Falcon 9 and Falcon Heavy launches - Wikipedia


TASK 2: Extract all column/variable names from the HTML table header

In [19]:

tables = soup.find_all('table')


table = tables[0]
header_cells = table.find_all('th')
if not header_cells:
    header_cells = table.find_all('td')

html_tables = [cell.get_text(strip=True) for cell in header_cells]
print("Column Names:", html_tables)



Column Names: ["Rocket configurations51015202530'11'12'13'14'15'16'17'18'19'20'21Falcon 9 v1.0Falcon 9 v1.1Falcon 9 Full ThrustFalcon 9 FT (reused)Falcon 9 Block 5Falcon 9 B5 (reused)Falcon Heavy", "Launch sites51015202530'10'11'12'13'14'15'16'17'18'19'20'21CCSFS,SLC-40KSC,LC-39AVAFB,SLC-4E"]


In [15]:

if len(html_tables) > 2:
    first_launch_table = html_tables[2]
    print(first_launch_table)
else:
    print("html_tables does not have an element at index 2. Length of html_tables:", len(html_tables))
    print("html_tables content:", html_tables)

html_tables does not have an element at index 2. Length of html_tables: 2
html_tables content: ["Rocket configurations51015202530'10'11'12'13'14'15'16'17'18'19'20'21Falcon 9 v1.0Falcon 9 v1.1Falcon 9 Full ThrustFalcon 9 FT (reused)Falcon 9 Block 5Falcon 9 B5 (reused)Falcon Heavy", "Launch sites51015202530'10'11'12'13'14'15'16'17'18'19'20'21CCSFS,SLC-40KSC,LC-39AVAFB,SLC-4E"]


Task 3: Create a data frame by parsing the launch HTML tables

In [20]:
def extract_column_from_header(row):

    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()


    text_elements = [text for text in row.stripped_strings]


    column_name = ' '.join(text_elements).strip()

    if not column_name.isdigit() and column_name.strip():
        return column_name
    else:
        return None

In [22]:

column_names = ['Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', 'Version Booster', 'Booster landing', 'Date', 'Time', 'Date and time ( )']
launch_dict = dict.fromkeys(column_names)
del launch_dict['Date and time ( )']
launch_dict['Flight No.'] = []
launch_dict['Launch site'] = []
launch_dict['Payload'] = []
launch_dict['Payload mass'] = []
launch_dict['Orbit'] = []
launch_dict['Customer'] = []
launch_dict['Launch outcome'] = []
launch_dict['Version Booster'] = []
launch_dict['Booster landing'] = []
launch_dict['Date'] = []
launch_dict['Time'] = []

In [24]:

launch_dict = {
    "Flight No.": [],
    "Date": [],
    "Time": [],
    "Version Booster": [],
    "Launch Site": [],
    "Payload": [],
    "Payload mass": [],
    "Orbit": [],
    "Customer": [],
    "Launch outcome": [],
    "Booster landing": []
}

extracted_row = 0


for table_number, table in enumerate(soup.find_all('table', "wikitable plainrowheaders collapsible")):

    for rows in table.find_all("tr"):

        if rows.th:
            if rows.th.string:
                flight_number = rows.th.string.strip()
                flag = flight_number.isdigit()
        else:
            flag = False


        row = rows.find_all('td')


        if flag:
            extracted_row += 1


            launch_dict["Flight No."].append(flight_number)


            datatimelist = date_time(row[0]) if row[0] else ["", ""]
            date = datatimelist[0].strip(',') if datatimelist[0] else "N/A"
            time = datatimelist[1] if datatimelist[1] else "N/A"
            launch_dict["Date"].append(date)
            launch_dict["Time"].append(time)


            bv = booster_version(row[1]) if row[1] else None
            bv = bv or (row[1].a.string if row[1] and row[1].a else "N/A")
            launch_dict["Version Booster"].append(bv)


            launch_site = row[2].a.string if row[2] and row[2].a else "N/A"
            launch_dict["Launch Site"].append(launch_site)


            payload = row[3].a.string if row[3] and row[3].a else "N/A"
            launch_dict["Payload"].append(payload)


            payload_mass = get_mass(row[4]) if row[4] else "N/A"
            launch_dict["Payload mass"].append(payload_mass)


            orbit = row[5].a.string if row[5] and row[5].a else "N/A"
            launch_dict["Orbit"].append(orbit)


            customer = row[6].a.string if row[6] and row[6].a else "N/A"
            launch_dict["Customer"].append(customer)


            launch_outcome = list(row[7].strings)[0] if row[7] else "N/A"
            launch_dict["Launch outcome"].append(launch_outcome)


            booster_landing = landing_status(row[8]) if row[8] else "N/A"
            launch_dict["Booster landing"].append(booster_landing)

print(f"Total rows extracted: {extracted_row}")


Total rows extracted: 121


In [25]:
df= pd.DataFrame({ key:pd.Series(value) for key, value in launch_dict.items() })

In [26]:
df.to_csv('spacex_web_scraped.csv', index=False)