In [None]:
#Data fetch
# data fetch from API 
# task 1
# Import Libraries
import requests
import pandas as pd
import numpy as np
import datetime

# Setting pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Step 1: Request the static JSON file (consistent dataset)
static_json_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/API_call_spacex_api.json'

response = requests.get(static_json_url)
print("Status Code:", response.status_code)  # should be 200

# Step 2: Convert response to JSON and normalize into DataFrame
data = response.json()
data = pd.json_normalize(data)

# Display first 5 rows
print("\nOriginal DataFrame:")
print(data.head())

# Step 3: Select only useful columns
data = data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]

# Step 4: Remove rows with multiple cores/payloads (keep only 1 each)
data = data[data['cores'].map(len) == 1]
data = data[data['payloads'].map(len) == 1]

# Step 5: Extract single values from list (since each row has list of size 1)
data['cores'] = data['cores'].map(lambda x: x[0])
data['payloads'] = data['payloads'].map(lambda x: x[0])

# Step 6: Convert date_utc to datetime and keep only date
data['date'] = pd.to_datetime(data['date_utc']).dt.date

# Step 7: Restrict dates before 2020-11-13
data = data[data['date'] <= datetime.date(2020, 11, 13)]

# ----------------------------
# Global Variables (lists to hold extracted data)
# ----------------------------
BoosterVersion = []
PayloadMass = []
Orbit = []
LaunchSite = []
Outcome = []
Flights = []
GridFins = []
Reused = []
Legs = []
LandingPad = []
Block = []
ReusedCount = []
Serial = []
Longitude = []
Latitude = []

# ----------------------------
# Helper Functions
# ----------------------------

# 1. Booster Version
def getBoosterVersion(data):
    for x in data['rocket']:
        if x:
            response = requests.get("https://api.spacexdata.com/v4/rockets/" + str(x)).json()
            BoosterVersion.append(response['name'])

# 2. Launch Site
def getLaunchSite(data):
    for x in data['launchpad']:
        if x:
            response = requests.get("https://api.spacexdata.com/v4/launchpads/" + str(x)).json()
            Longitude.append(response['longitude'])
            Latitude.append(response['latitude'])
            LaunchSite.append(response['name'])

# 3. Payload Data
def getPayloadData(data):
    for load in data['payloads']:
        if load:
            response = requests.get("https://api.spacexdata.com/v4/payloads/" + load).json()
            PayloadMass.append(response['mass_kg'])
            Orbit.append(response['orbit'])

# 4. Core Data
def getCoreData(data):
    for core in data['cores']:
        if core['core'] != None:
            response = requests.get("https://api.spacexdata.com/v4/cores/" + core['core']).json()
            Block.append(response['block'])
            ReusedCount.append(response['reuse_count'])
            Serial.append(response['serial'])
        else:
            Block.append(None)
            ReusedCount.append(None)
            Serial.append(None)

        Outcome.append(str(core['landing_success']) + ' ' + str(core['landing_type']))
        Flights.append(core['flight'])
        GridFins.append(core['gridfins'])
        Reused.append(core['reused'])
        Legs.append(core['legs'])
        LandingPad.append(core['landpad'])

# ----------------------------
# Apply helper functions
# ----------------------------
getBoosterVersion(data)
getLaunchSite(data)
getPayloadData(data)
getCoreData(data)

# ----------------------------
# Build Final DataFrame
# ----------------------------
launch_dict = {
    'FlightNumber': list(data['flight_number']),
    'Date': list(data['date']),
    'BoosterVersion': BoosterVersion,
    'PayloadMass': PayloadMass,
    'Orbit': Orbit,
    'LaunchSite': LaunchSite,
    'Outcome': Outcome,
    'Flights': Flights,
    'GridFins': GridFins,
    'Reused': Reused,
    'Legs': Legs,
    'LandingPad': LandingPad,
    'Block': Block,
    'ReusedCount': ReusedCount,
    'Serial': Serial,
    'Longitude': Longitude,
    'Latitude': Latitude
}

final_df = pd.DataFrame(launch_dict)

# Show summary
print("\nFinal Cleaned DataFrame:")
print(final_df.head())


Status Code: 200

Original DataFrame:
       static_fire_date_utc  static_fire_date_unix    tbd    net  window  \
0  2006-03-17T00:00:00.000Z           1.142554e+09  False  False     0.0   
1                      None                    NaN  False  False     0.0   
2                      None                    NaN  False  False     0.0   
3  2008-09-20T00:00:00.000Z           1.221869e+09  False  False     0.0   
4                      None                    NaN  False  False     0.0   

                     rocket  success  \
0  5e9d0d95eda69955f709d1eb    False   
1  5e9d0d95eda69955f709d1eb    False   
2  5e9d0d95eda69955f709d1eb    False   
3  5e9d0d95eda69955f709d1eb     True   
4  5e9d0d95eda69955f709d1eb     True   

                                                                                                                                                                                details  \
0                                                                            

In [6]:
# Task 2: Filter the dataframe to only include Falcon 9 launches

# Step 1: Filter out Falcon 1 launches -> Keep only Falcon 9
data_falcon9 = data[data['rocket'].str.contains("Falcon 9", case=False, na=False)].copy()

# Step 2: Reset the flight_number column (1,2,3,...)
data_falcon9.loc[:, 'flight_number'] = list(range(1, data_falcon9.shape[0] + 1))

# Step 3: Check missing values
print("Missing values in each column:\n")
print(data_falcon9.isnull().sum())

# Step 4: Show first few rows of Falcon 9 launches
print("\nFirst 5 rows of Falcon 9 dataset:\n")
print(data_falcon9.head())

# Optional: Dataset size check
print("\nShape of Falcon 9 dataset:", data_falcon9.shape)


Missing values in each column:

rocket           0
payloads         0
launchpad        0
cores            0
flight_number    0
date_utc         0
date             0
dtype: int64

First 5 rows of Falcon 9 dataset:

Empty DataFrame
Columns: [rocket, payloads, launchpad, cores, flight_number, date_utc, date]
Index: []

Shape of Falcon 9 dataset: (0, 7)


In [7]:
# task 3
import numpy as np

# Step 1: Extract Payload Mass (in kg) from the 'payloads' dictionary
# Sometimes payloads column has nested dict/list, so handle safely
data_falcon9.loc[:, "PayloadMass"] = data_falcon9["payloads"].apply(
    lambda x: x[0]["mass_kg"] if isinstance(x, list) and len(x) > 0 and "mass_kg" in x[0] else np.nan
)

# Step 2: Calculate mean of PayloadMass
payload_mean = data_falcon9["PayloadMass"].mean()
print("Mean Payload Mass:", payload_mean)

# Step 3: Replace NaN values with mean
data_falcon9["PayloadMass"].replace(np.nan, payload_mean, inplace=True)

# Step 4: Verify missing values
print("\nMissing values after replacement:\n")
print(data_falcon9.isnull().sum())

# Step 5: Export cleaned dataset to CSV
data_falcon9.to_csv("dataset_part_1.csv", index=False)
print("\n✅ Cleaned dataset exported as 'dataset_part_1.csv'")


Mean Payload Mass: nan

Missing values after replacement:

rocket           0
payloads         0
launchpad        0
cores            0
flight_number    0
date_utc         0
date             0
PayloadMass      0
dtype: int64

✅ Cleaned dataset exported as 'dataset_part_1.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_falcon9["PayloadMass"].replace(np.nan, payload_mean, inplace=True)


In [8]:
# data fetch through web scarping
# Task 1: Request the Falcon9 Launch Wiki page and create BeautifulSoup object

import requests
from bs4 import BeautifulSoup

# Step 1: Falcon 9 Launch Wiki page ka URL
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"

# Step 2: GET request bhejna
response = requests.get(static_url)

# Step 3: Response se BeautifulSoup object banana
soup = BeautifulSoup(response.text, "html.parser")

# Step 4: Page ka title print karke verify karna
print(soup.title)
print(soup.title.string)



<title>List of Falcon 9 and Falcon Heavy launches - Wikipedia</title>
List of Falcon 9 and Falcon Heavy launches - Wikipedia


In [10]:
# task 2
# Import libraries
import requests
from bs4 import BeautifulSoup

# Wikipedia Falcon9 page
static_url = "https://en.wikipedia.org/wiki/List_of_Falcon_9_and_Falcon_Heavy_launches"
response = requests.get(static_url)
soup = BeautifulSoup(response.text, "html.parser")

# Function to clean column header text
def extract_column_from_header(th):
    if th.br:
        th.br.extract()
    if th.sup:
        th.sup.extract()
    # don't remove <a>, keep its text
    col_name = ' '.join(th.stripped_strings)
    if not col_name.isdigit() and len(col_name) > 0:
        return col_name
    return None

# Find all tables
html_tables = soup.find_all('table')

# Target the 3rd table (index 2)
first_launch_table = html_tables[2]

# Extract column names
column_names = []
for th in first_launch_table.find_all('th'):
    name = extract_column_from_header(th)
    if name is not None:
        column_names.append(name)

# Print column names
print("Extracted Column Names:")
print(column_names)



Extracted Column Names:
['Flight No.', 'Date and time ( UTC )', 'Version, booster', 'Launch site', 'Payload', 'Payload mass', 'Orbit', 'Customer', 'Launch outcome', 'Booster landing']


In [None]:
#task 3
import re
import unicodedata
import pandas as pd
import numpy as np

# ---------- Helper functions (robust) ----------
def date_time(td):
    parts = [s.strip() for s in td.stripped_strings if s.strip()]
    # Often looks like: ['2010-06-04', '18:45', 'UTC'] or with commas/notes
    # Try to return [date, time] best-effort
    if len(parts) >= 2:
        return [parts[0].rstrip(','), parts[1]]
    elif len(parts) == 1:
        # fallback: split by space or '·'
        tmp = re.split(r'[·\s]+', parts[0])
        if len(tmp) >= 2:
            return [tmp[0].rstrip(','), tmp[1]]
        return [parts[0], ""]
    return ["", ""]

def booster_version(td):
    # Keep visible text, drop references
    if td.sup: 
        for s in td.find_all('sup'):
            s.extract()
    text = ' '.join(td.stripped_strings)
    return text

def landing_status(td):
    # First visible text sans references
    if td.sup: 
        for s in td.find_all('sup'):
            s.extract()
    return td.get_text(" ", strip=True)

def get_mass(td):
    # Normalize and pull "... kg" part
    text = unicodedata.normalize("NFKD", td.get_text(" ", strip=True))
    m = re.search(r'([\d,]+)\s*kg', text)
    if m:
        # return numeric kg
        return float(m.group(1).replace(',', ''))
    return np.nan

# ---------- Build dict with stable columns (independent of header quirks) ----------
cols = [
    'Flight No.', 'Date', 'Time', 'Version Booster', 'Launch site',
    'Payload', 'Payload mass', 'Orbit', 'Customer',
    'Launch outcome', 'Booster landing'
]
launch_dict = {k: [] for k in cols}

# ---------- Find relevant tables ----------
# Tables usually carry these classes; include both to be safe
tables = soup.select('table.wikitable.plainrowheaders.collapsible, table.wikitable.plainrowheaders')

extracted_row = 0

for table in tables:
    for tr in table.find_all('tr'):
        th = tr.find('th')
        if not th:
            continue
        # Flight number often pure digits in the row header
        flight_text = th.get_text(" ", strip=True)
        if not re.fullmatch(r'\d+', flight_text):
            continue  # skip header/sub-rows

        tds = tr.find_all('td')
        if len(tds) < 9:
            # Not a full data row
            continue

        # Extract cells safely
        dt_date, dt_time = date_time(tds[0])
        bv = booster_version(tds[1])
        launch_site = tds[2].get_text(" ", strip=True)
        payload = tds[3].get_text(" ", strip=True)
        payload_mass = get_mass(tds[4])
        orbit = tds[5].get_text(" ", strip=True)
        customer = tds[6].get_text(" ", strip=True)

        launch_outcome = tds[7].get_text(" ", strip=True)
        # drop any trailing reference bracket text like "... [e]"
        launch_outcome = launch_outcome.split('[', 1)[0].strip()

        booster_landing = landing_status(tds[8])
        booster_landing = booster_landing.split('[', 1)[0].strip()

        # Append to dict
        launch_dict['Flight No.'].append(flight_text)
        launch_dict['Date'].append(dt_date)
        launch_dict['Time'].append(dt_time)
        launch_dict['Version Booster'].append(bv)
        launch_dict['Launch site'].append(launch_site)
        launch_dict['Payload'].append(payload)
        launch_dict['Payload mass'].append(payload_mass)
        launch_dict['Orbit'].append(orbit)
        launch_dict['Customer'].append(customer)
        launch_dict['Launch outcome'].append(launch_outcome)
        launch_dict['Booster landing'].append(booster_landing)

        extracted_row += 1

print("Rows extracted:", extracted_row)

# ---------- DataFrame + CSV ----------
df = pd.DataFrame(launch_dict)
print(df.head())
df.to_csv('spacex_web_scraped.csv', index=False)
print("Saved to spacex_web_scraped.csv")


Rows extracted: 234
  Flight No.              Date   Time Version Booster  \
0        286   January 3, 2024  03:44   F9 B5 B1082‑1   
1        287   January 3, 2024  23:04  F9 B5 B1076‑10   
2        288   January 7, 2024  22:35  F9 B5 B1067‑16   
3        289  January 14, 2024  08:59  F9 B5 B1061‑18   
4        290  January 15, 2024  01:52  F9 B5 B1073‑12   

               Launch site                                Payload  \
0      Vandenberg , SLC‑4E   Starlink : Group 7-9 (22 satellites)   
1  Cape Canaveral , SLC‑40                                Ovzon-3   
2  Cape Canaveral , SLC‑40  Starlink : Group 6-35 (23 satellites)   
3      Vandenberg , SLC‑4E  Starlink : Group 7-10 (22 satellites)   
4  Cape Canaveral , SLC‑40  Starlink : Group 6-37 (23 satellites)   

   Payload mass Orbit Customer Launch outcome     Booster landing  
0       16800.0   LEO   SpaceX        Success  Success ( OCISLY )  
1        1800.0   GTO    Ovzon        Success    Success ( LZ‑1 )  
2       17100.0   