In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel
%pip install beautifulsoup4 requests



In [2]:
import datetime
import re
import sys
import unicodedata

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def date_time(table_cells):
    """
    Return the first two text fragments of an HTML table cell, whitespace-stripped
    (presumably the launch date followed by the launch time).
    Input: a table data cell element exposing a ``.strings`` iterable
    Output: a list holding at most two stripped strings
    """
    cleaned = []
    for fragment in table_cells.strings:
        cleaned.append(fragment.strip())
    # Only the two leading fragments are kept; anything after them is discarded
    return cleaned[:2]

def booster_version(table_cells):
    """
    Build the booster version text from an HTML table cell.
    Input: a table data cell element exposing a ``.strings`` iterable
    Output: the text fragments at even positions (0, 2, 4, ...), minus the
            last of those, concatenated without a separator
    """
    fragments = list(table_cells.strings)
    even_positioned = fragments[::2]
    # The final even-positioned fragment is deliberately left out of the result
    return ''.join(even_positioned[:-1])

def landing_status(table_cells):
    """
    Return the landing status, taken as the first text fragment of an HTML table cell.
    Input: a table data cell element exposing a ``.strings`` iterable
    Raises: IndexError when the cell contains no text fragments at all
    """
    fragments = list(table_cells.strings)
    return fragments[0]


def get_mass(table_cells):
    """
    Extract the payload mass text, up to and including "kg", from an HTML table cell.
    Input: a table data cell element exposing a ``.text`` attribute
    Output: the normalized, stripped text cut right after the first "kg"
            (e.g. "4,700 kg"), or 0 when the cell is empty or contains no "kg"
    """
    # NFKD folds compatibility characters (e.g. non-breaking spaces) into plain ones
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    kg_at = mass.find("kg")
    if kg_at == -1:
        # Also covers the empty cell. Slicing with find()'s -1 would otherwise
        # return just the first character of the text instead of a mass.
        return 0
    return mass[:kg_at + 2]


def extract_column_from_header(row):
    """
    Build a column name from an HTML table header cell.
    The first <br>, <a> and <sup> element found in the cell (if any) is removed
    from ``row`` in place; the remaining contents are then joined with spaces.
    Input: the header cell element
    Output: the stripped column name, or None when the name consists only of digits
    """
    # Each lookup happens after the previous removal, so a tag nested inside an
    # already-removed element is no longer found.
    for tag_name in ("br", "a", "sup"):
        element = getattr(row, tag_name)
        if element:
            element.extract()

    column_name = ' '.join(row.contents).strip()

    # Purely numeric headers are rejected; an empty name is still returned as ''
    if column_name.isdigit():
        return None
    return column_name

In [4]:
# Wikipedia launch list, addressed by a fixed revision id (the `oldid` query parameter)
static_url = (
    "https://en.wikipedia.org/w/index.php"
    "?title=List_of_Falcon_9_and_Falcon_Heavy_launches"
    "&oldid=1027686922"
)

In [6]:
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# TASK 1: Request & Normalize the SpaceX Launch Data
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

# 1.1 Fetch the static JSON and verify status
static_json_url = (
  "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/"
  "IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json"
)
# requests applies no timeout by default; without one a stalled connection
# would hang this cell indefinitely.
resp = requests.get(static_json_url, timeout=30)
print(resp)
# raise_for_status() only rejects 4xx/5xx responses; the assert additionally
# pins the status to exactly 200.
resp.raise_for_status()
assert resp.status_code == 200, f"Got {resp.status_code}"

<Response [200]>


In [7]:
# 1.2 Decode the response body and flatten it into a DataFrame
#     (nested objects become dotted column names such as 'links.patch.small')
raw = resp.json()
df = pd.json_normalize(raw)

print("All columns:", list(df.columns))
print("Raw launch data (first 5 rows):")
display(df.head(5))

All columns: ['static_fire_date_utc', 'static_fire_date_unix', 'tbd', 'net', 'window', 'rocket', 'success', 'details', 'crew', 'ships', 'capsules', 'payloads', 'launchpad', 'auto_update', 'failures', 'flight_number', 'name', 'date_utc', 'date_unix', 'date_local', 'date_precision', 'upcoming', 'cores', 'id', 'fairings.reused', 'fairings.recovery_attempt', 'fairings.recovered', 'fairings.ships', 'links.patch.small', 'links.patch.large', 'links.reddit.campaign', 'links.reddit.launch', 'links.reddit.media', 'links.reddit.recovery', 'links.flickr.small', 'links.flickr.original', 'links.presskit', 'links.webcast', 'links.youtube_id', 'links.article', 'links.wikipedia', 'fairings']
Raw launch data (first 5 rows):


Unnamed: 0,static_fire_date_utc,static_fire_date_unix,tbd,net,window,rocket,success,details,crew,ships,...,links.reddit.media,links.reddit.recovery,links.flickr.small,links.flickr.original,links.presskit,links.webcast,links.youtube_id,links.article,links.wikipedia,fairings
0,2006-03-17T00:00:00.000Z,1142554000.0,False,False,0.0,5e9d0d95eda69955f709d1eb,False,Engine failure at 33 seconds and loss of vehicle,[],[],...,,,[],[],,https://www.youtube.com/watch?v=0a_00nJ_Y88,0a_00nJ_Y88,https://www.space.com/2196-spacex-inaugural-fa...,https://en.wikipedia.org/wiki/DemoSat,
1,,,False,False,0.0,5e9d0d95eda69955f709d1eb,False,Successful first stage burn and transition to ...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=Lk4zQ2wP-Nc,Lk4zQ2wP-Nc,https://www.space.com/3590-spacex-falcon-1-roc...,https://en.wikipedia.org/wiki/DemoSat,
2,,,False,False,0.0,5e9d0d95eda69955f709d1eb,False,Residual stage 1 thrust led to collision betwe...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=v0w9p3U8860,v0w9p3U8860,http://www.spacex.com/news/2013/02/11/falcon-1...,https://en.wikipedia.org/wiki/Trailblazer_(sat...,
3,2008-09-20T00:00:00.000Z,1221869000.0,False,False,0.0,5e9d0d95eda69955f709d1eb,True,Ratsat was carried to orbit on the first succe...,[],[],...,,,[],[],,https://www.youtube.com/watch?v=dLQ2tZEH6G0,dLQ2tZEH6G0,https://en.wikipedia.org/wiki/Ratsat,https://en.wikipedia.org/wiki/Ratsat,
4,,,False,False,0.0,5e9d0d95eda69955f709d1eb,True,,[],[],...,,,[],[],http://www.spacex.com/press/2012/12/19/spacexs...,https://www.youtube.com/watch?v=yTaIDooc8Og,yTaIDooc8Og,http://www.spacex.com/news/2013/02/12/falcon-1...,https://en.wikipedia.org/wiki/RazakSAT,


In [10]:
# 1.3 Select & wrangle the subset we need
data = df[['rocket','payloads','launchpad','cores','flight_number','date_utc']].copy()

# — keep only launches with exactly one payload and exactly one core
#   (this drops multi-payload / multi-core launches and also rows with empty lists)
data = data[data['payloads'].map(len)==1]
data = data[data['cores'].map(len)==1]

# — unwrap the single‐element lists
data['payloads'] = data['payloads'].map(lambda x: x[0])
data['cores']    = data['cores'].map(lambda x: x[0])

# — convert and filter by date (plain date objects, so the time of day is ignored
#   and launches on the cutoff day itself are kept)
data['date'] = pd.to_datetime(data['date_utc']).dt.date
cutoff = datetime.date(2020, 11, 13)
data = data[data['date'] <= cutoff]

print(f"\nFiltered down to {len(data)} launches up to {cutoff}:")
display(data.head())


Filtered down to 94 launches up to 2020-11-13:


Unnamed: 0,rocket,payloads,launchpad,cores,flight_number,date_utc,date
0,5e9d0d95eda69955f709d1eb,5eb0e4b5b6c3bb0006eeb1e1,5e9e4502f5090995de566f86,"{'core': '5e9e289df35918033d3b2623', 'flight':...",1,2006-03-24T22:30:00.000Z,2006-03-24
1,5e9d0d95eda69955f709d1eb,5eb0e4b6b6c3bb0006eeb1e2,5e9e4502f5090995de566f86,"{'core': '5e9e289ef35918416a3b2624', 'flight':...",2,2007-03-21T01:10:00.000Z,2007-03-21
3,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e5,5e9e4502f5090995de566f86,"{'core': '5e9e289ef3591855dc3b2626', 'flight':...",4,2008-09-28T23:15:00.000Z,2008-09-28
4,5e9d0d95eda69955f709d1eb,5eb0e4b7b6c3bb0006eeb1e6,5e9e4502f5090995de566f86,"{'core': '5e9e289ef359184f103b2627', 'flight':...",5,2009-07-13T03:35:00.000Z,2009-07-13
5,5e9d0d95eda69973a809d1ec,5eb0e4b7b6c3bb0006eeb1e7,5e9e4501f509094ba4566f84,"{'core': '5e9e289ef359185f2b3b2628', 'flight':...",6,2010-06-04T18:45:00.000Z,2010-06-04


In [11]:
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# TASK 1 (continued): Enrich with Rocket / Payload / LaunchPad / Core details
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

# Global accumulators: the get* helpers below append to these in place,
# one entry per processed launch row.

# Filled by getBoosterVersion / getPayloadData
BoosterVersion = []
PayloadMass = []
Orbit = []

# Filled by getLaunchSite
LaunchSite = []
Longitude = []
Latitude = []

# Filled by getCoreData
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []

def getBoosterVersion(df):
    """
    Append the rocket name of every row in ``df`` to the global ``BoosterVersion`` list.

    Input: a DataFrame whose 'rocket' column holds SpaceX API rocket ids
    Raises: requests.HTTPError when the API answers with a 4xx/5xx status
    """
    names_by_id = {}  # the same rocket id recurs across launches; look each one up once
    for rid in df['rocket']:
        if rid not in names_by_id:
            # requests has no default timeout; set one so a stalled call cannot hang the run
            resp = requests.get(f"https://api.spacexdata.com/v4/rockets/{rid}", timeout=30)
            resp.raise_for_status()
            names_by_id[rid] = resp.json()['name']
        BoosterVersion.append(names_by_id[rid])

def getLaunchSite(df):
    """
    Append the launch site name, longitude and latitude of every row in ``df``
    to the global ``LaunchSite``, ``Longitude`` and ``Latitude`` lists.

    Input: a DataFrame whose 'launchpad' column holds SpaceX API launchpad ids
    Raises: requests.HTTPError when the API answers with a 4xx/5xx status
    """
    site_by_id = {}  # the same launchpad id recurs across launches; look each one up once
    for lid in df['launchpad']:
        if lid not in site_by_id:
            # requests has no default timeout; set one so a stalled call cannot hang the run
            resp = requests.get(f"https://api.spacexdata.com/v4/launchpads/{lid}", timeout=30)
            resp.raise_for_status()
            info = resp.json()
            site_by_id[lid] = (info['name'], info['longitude'], info['latitude'])
        name, longitude, latitude = site_by_id[lid]
        LaunchSite.append(name)
        Longitude.append(longitude)
        Latitude.append(latitude)

def getPayloadData(df):
    """
    Append the payload mass (kg) and orbit of every row in ``df`` to the global
    ``PayloadMass`` and ``Orbit`` lists.

    Input: a DataFrame whose 'payloads' column holds one SpaceX API payload id per row
    Raises: requests.HTTPError when the API answers with a 4xx/5xx status
    """
    for pid in df['payloads']:
        # requests has no default timeout; set one so a stalled call cannot hang the run
        resp = requests.get(f"https://api.spacexdata.com/v4/payloads/{pid}", timeout=30)
        # Fail on an error response instead of reading fields from an error body
        resp.raise_for_status()
        info = resp.json()
        PayloadMass.append(info['mass_kg'])
        Orbit.append(info['orbit'])

def getCoreData(df):
    """
    Assumes df['cores'] is already a dict per row containing at least:
      {'core': <core_id>,
       'flight': int, 'gridfins': bool, 'legs': bool,
       'reused': bool, 'landing_success': bool,
       'landing_type': str, 'landpad': <pad_id> or None}
    Populates the global lists for core metadata + landing info.

    Block, ReusedCount and Serial come from the cores API (None when the row has
    no core id); the remaining lists are read straight from the per-row dict.
    Raises requests.HTTPError when the API answers with a 4xx/5xx status.
    """
    for core_dict in df['cores']:
        core_id = core_dict.get('core')
        if core_id:
            # requests has no default timeout; set one so a stalled call cannot hang the run
            resp = requests.get(f"https://api.spacexdata.com/v4/cores/{core_id}", timeout=30)
            resp.raise_for_status()
            info = resp.json()
        else:
            # No core id for this launch: an empty dict makes every .get() below
            # yield None, which keeps all lists the same length.
            info = {}

        Block.append(info.get('block'))
        ReusedCount.append(info.get('reuse_count'))
        Serial.append(info.get('serial'))

        Outcome.append(f"{core_dict.get('landing_success')} {core_dict.get('landing_type')}")
        Flights.append(core_dict.get('flight'))
        GridFins.append(core_dict.get('gridfins'))
        Reused.append(core_dict.get('reused'))
        Legs.append(core_dict.get('legs'))
        LandingPad.append(core_dict.get('landpad'))

# Run enrichment: every helper walks `data` and appends to the global lists
getBoosterVersion(data)
getLaunchSite(data)
getPayloadData(data)
getCoreData(data)

# Assemble into final DataFrame (keyword order fixes the column order)
launch_dict = dict(
    FlightNumber=data['flight_number'].tolist(),
    Date=data['date'].tolist(),
    BoosterVersion=BoosterVersion,
    PayloadMass=PayloadMass,
    Orbit=Orbit,
    LaunchSite=LaunchSite,
    Longitude=Longitude,
    Latitude=Latitude,
    Outcome=Outcome,
    Flights=Flights,
    GridFins=GridFins,
    Reused=Reused,
    Legs=Legs,
    LandingPad=LandingPad,
    Block=Block,
    ReusedCount=ReusedCount,
    Serial=Serial,
)
launch_df = pd.DataFrame(launch_dict)

print("\nEnriched launches (first 5 rows):")
display(launch_df.head())


Enriched launches (first 5 rows):


Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Longitude,Latitude,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial
0,1,2006-03-24,Falcon 1,20.0,LEO,Kwajalein Atoll,167.743129,9.047721,None None,1,False,False,False,,,0,Merlin1A
1,2,2007-03-21,Falcon 1,,LEO,Kwajalein Atoll,167.743129,9.047721,None None,1,False,False,False,,,0,Merlin2A
2,4,2008-09-28,Falcon 1,165.0,LEO,Kwajalein Atoll,167.743129,9.047721,None None,1,False,False,False,,,0,Merlin2C
3,5,2009-07-13,Falcon 1,200.0,LEO,Kwajalein Atoll,167.743129,9.047721,None None,1,False,False,False,,,0,Merlin3C
4,6,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,-80.577366,28.561857,None None,1,False,False,False,,1.0,0,B0003


In [12]:
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# TASK 2: Filter to Falcon 9 launches & reset flight numbers
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

# Select Falcon 9 rows by name. Merely excluding 'Falcon 1' would let any other
# rocket name through, contradicting the "Falcon 9 only" intent stated below.
data_falcon9 = launch_df[launch_df['BoosterVersion'] == 'Falcon 9'].copy()

# Renumber the remaining flights consecutively from 1 (the row index is left as is)
data_falcon9['FlightNumber'] = range(1, len(data_falcon9)+1)

print(f"\nAfter filtering for Falcon 9 only: {len(data_falcon9)} rows")
display(data_falcon9.head())


After filtering for Falcon 9 only: 90 rows


Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Longitude,Latitude,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial
4,1,2010-06-04,Falcon 9,,LEO,CCSFS SLC 40,-80.577366,28.561857,None None,1,False,False,False,,1.0,0,B0003
5,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,-80.577366,28.561857,None None,1,False,False,False,,1.0,0,B0005
6,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,-80.577366,28.561857,None None,1,False,False,False,,1.0,0,B0007
7,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,-120.610829,34.632093,False Ocean,1,False,False,False,,1.0,0,B1003
8,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,-80.577366,28.561857,None None,1,False,False,False,,1.0,0,B1004


In [13]:
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# TASK 3: Handle Missing PayloadMass values
#––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

# Average of the known payload masses (Series.mean() skips missing values by default)
mean_mass = data_falcon9['PayloadMass'].mean()
print(f"\nMean payload mass (Falcon 9): {mean_mass:.2f} kg")

# Replace every missing mass with that average, then confirm none are left
data_falcon9['PayloadMass'] = data_falcon9['PayloadMass'].fillna(value=mean_mass)
print("Nulls in PayloadMass after fill:", data_falcon9['PayloadMass'].isna().sum())

# (Optional) Export to CSV, leaving out the DataFrame index
data_falcon9.to_csv('dataset_part_1.csv', index=False)
print("\nExported cleaned Falcon 9 dataset to dataset_part_1.csv")


Mean payload mass (Falcon 9): 6123.55 kg
Nulls in PayloadMass after fill: 0

Exported cleaned Falcon 9 dataset to dataset_part_1.csv
