# Raw Data Filtering Notebook

In this notebook we carry out the initial exploration of the data to locate and select the specific variables that we wish to include in our future cleaned dataset.



## Imports and Installs

In [2]:
import json
from pprint import pprint
import zipfile
import pandas as pd
import numpy as np

## Loading the Data

In [3]:
# Location of the zipped raw data and the name of the JSON member inside it
zip_path = "../raw data/raw_launch_data.json.zip"
launch_data_filename = 'raw_launch_data.json'

# Open the archive and parse the JSON member in a single `with` statement
with zipfile.ZipFile(zip_path, "r") as archive, archive.open(launch_data_filename) as json_file:
    raw_launch_data = json.load(json_file)

# Show the type of the top-level object so we know how to navigate it
type(raw_launch_data)

dict

The first level of the data is a dictionary, so to figure out what nested levels exist within it we will take a look at its keys.

In [4]:
# List the top-level keys of the raw_launch_data dictionary
raw_launch_data.keys()

dict_keys(['collector', 'total_launches', 'collection_date', 'launches'])

In [5]:
# Inspect the value stored under the 'collection_date' key
raw_launch_data['collection_date']

'2025-11-07 05:20:58'

In [6]:
# Check what kind of container is stored under the 'launches' key
type(raw_launch_data['launches'])

list

In [7]:
# Number of entries in the raw launches list
len(raw_launch_data['launches'])

7336

OK, so we can treat every entry in the list as an individual launch.

In [8]:
# Display a single launch record to see its full nested structure.
# Index -1 selects the LAST entry of the list (not the first); the exploration
# was switched to this entry so that we look at the most recent launch.
raw_launch_data['launches'][-1]

{'id': '6602c88f-cbff-4495-b417-a184ddb0a426',
 'url': 'https://ll.thespacedevs.com/2.3.0/launches/6602c88f-cbff-4495-b417-a184ddb0a426/',
 'name': 'Falcon 9 Block 5 | Starlink Group 11-14',
 'response_mode': 'detailed',
 'slug': 'falcon-9-block-5-starlink-group-11-14',
 'launch_designator': '2025-254',
 'status': {'id': 3,
  'name': 'Launch Successful',
  'abbrev': 'Success',
  'description': 'The launch vehicle successfully inserted its payload(s) into the target orbit(s).'},
 'last_updated': '2025-11-07T03:05:58Z',
 'net': '2025-11-06T21:13:50Z',
 'net_precision': {'id': 0,
  'name': 'Second',
  'abbrev': 'SEC',
  'description': 'The T-0 is accurate to the second.'},
 'window_end': '2025-11-07T00:56:00Z',
 'window_start': '2025-11-06T20:56:00Z',
 'image': {'id': 1296,
  'name': 'Starlink night fairing',
  'image_url': 'https://thespacedevs-prod.nyc3.digitaloceanspaces.com/media/images/falcon2520925_image_20221009234147.png',
  'thumbnail_url': 'https://thespacedevs-prod.nyc3.digital

In [9]:
# Shorthand for the list of launch records stored under the 'launches' key
launch_data = raw_launch_data['launches']

In [10]:
# Fields available on the rocket configuration of the last launch record
launch_data[-1]['rocket']['configuration'].keys()

dict_keys(['response_mode', 'id', 'url', 'name', 'families', 'full_name', 'variant', 'active', 'is_placeholder', 'manufacturer', 'program', 'reusable', 'image', 'info_url', 'wiki_url', 'description', 'alias', 'min_stage', 'max_stage', 'length', 'diameter', 'maiden_flight', 'launch_cost', 'launch_mass', 'leo_capacity', 'gto_capacity', 'geo_capacity', 'sso_capacity', 'to_thrust', 'apogee', 'total_launch_count', 'consecutive_successful_launches', 'successful_launches', 'failed_launches', 'pending_launches', 'attempted_landings', 'successful_landings', 'failed_landings', 'consecutive_successful_landings', 'fastest_turnaround'])

## Initial Data Capture

**In this section we will be capturing the variables we wish to use for our data set.**

We will start this process by following the brief guideline on the data we planned to collect in our project proposal.



In [11]:
### Rocket Data

# Walk down to the nested records once instead of repeating the full path
latest_launch = launch_data[-1]
rocket_config = latest_launch['rocket']['configuration']
manufacturer = rocket_config['families'][0]['manufacturer'][0]

# Identifier, rocket name and manufacturer details.
# NOTE: the identifier printed first is the 'id' field of the launch record itself.
print("Unique Rocket identifier : ", latest_launch['id'])  # Will be useful if we plan to break the data up
print("Rocket Name : ", rocket_config['name'])
print("Company Name : ", manufacturer['name'])
print("Company Founding Year : ", manufacturer['founding_year'])
print("Country Affiliation : ", manufacturer['country'][0]['name'])
print("Company Type : ", manufacturer['type']['name'])

Unique Rocket identifier :  6602c88f-cbff-4495-b417-a184ddb0a426
Rocket Name :  Falcon 9
Company Name :  SpaceX
Company Founding Year :  2002
Country Affiliation :  United States of America
Company Type :  Commercial


In [12]:
# Rocket related parameters of the last launch record, printed as "label value" lines
print("Rocket related parameters")
print("---------------")

rocket_specs = launch_data[-1]['rocket']['configuration']

# (printed label, configuration key) pairs, in display order
spec_labels = [
    ("Is the Rocket Reusable : ", 'reusable'),
    ("Min. No. Stages : ", 'min_stage'),
    ("Max No. Stages : ", 'max_stage'),
    ("Rocket Length (meters): ", 'length'),
    ("Rocket Diameter (meters): ", 'diameter'),
    ("Rocket Launch Cost : ", 'launch_cost'),
    ("Rocket Liftoff Mass (Tons): ", 'launch_mass'),
    ("Rocket Payload Mass to LEO (kg): ", 'leo_capacity'),
    ("Rocket Payload Mass to GTO (kg): ", 'gto_capacity'),
    ("Rocket Payload Mass to GEO (kg): ", 'geo_capacity'),  # Could be removed if all null
    ("Rocket Payload Mass to SSO (kg): ", 'sso_capacity'),  # Could be removed if all null
    ("Rocket Liftoff Thrust (kN): ", 'to_thrust'),
    ("Rocket Apogee: ", 'apogee'),  # TODO: figure out what unit is used for apogee
]

for spec_label, spec_key in spec_labels:
    print(spec_label, rocket_specs[spec_key])

Rocket related parameters
---------------
Is the Rocket Reusable :  True
Min. No. Stages :  1
Max No. Stages :  2
Rocket Length (meters):  70.0
Rocket Diameter (meters):  3.65
Rocket Launch Cost :  52000000
Rocket Liftoff Mass (Tons):  549.0
Rocket Payload Mass to LEO (kg):  22800.0
Rocket Payload Mass to GTO (kg):  8300.0
Rocket Payload Mass to GEO (kg):  None
Rocket Payload Mass to SSO (kg):  None
Rocket Liftoff Thrust (kN):  7607.0
Rocket Apogee:  200.0


In [13]:
# Launch related parameters of the last launch record
print("Launch related parameters")
print("---------------")

# Look up the nested records once instead of repeating the full path
last_launch = launch_data[-1]
launch_pad = last_launch['pad']

print("Date of Launch : ", last_launch['net'])
print("Lauch Status : ", last_launch['status']['name'])
print("Short Form Status:", last_launch['status']['abbrev'])
print("Launch pad location name:", launch_pad['location']['name'])
print("Launch pad name:", launch_pad['name'])
print("Launch pad country name:", launch_pad['country']['name'])
# Fixed label: this line prints the country's 'id', not its name
print("Launch pad country id:", launch_pad['country']['id'])
print("Launch pad lattitude:", launch_pad['latitude'])
print("Launch pad longitude:", launch_pad['longitude'])

# Trailing blank line after the block of output
print()

Launch related parameters
---------------
Date of Launch :  2025-11-06T21:13:50Z
Lauch Status :  Launch Successful
Short Form Status: Success
Launch pad location name: Vandenberg SFB, CA, USA
Launch pad name: Space Launch Complex 4E
Launch pad country name: United States of America
Launch pad country name: 2
Launch pad lattitude: 34.632
Launch pad longitude: -120.611



## Rocket Info Data Frame

In [14]:
# (output column name, key inside launch['rocket']['configuration']) pairs.
# Keeping the column name next to its source key means the header and the
# row values cannot drift out of order.
config_fields = [
    ("rocket_name", 'name'),
    ("rocket_full_name", 'full_name'),
    ("reusable", 'reusable'),
    ("min_no_stages", 'min_stage'),
    ("max_no_stages", 'max_stage'),
    ("rocket_length_in_meters", 'length'),
    ("rocket_diameter_in_meters", 'diameter'),
    ("launch_cost", 'launch_cost'),
    ("liftoff_mass_in_tons", 'launch_mass'),
    ("payload_mass_to_leo", 'leo_capacity'),
    ("payload_mass_to_gto", 'gto_capacity'),
    ("payload_mass_to_geo", 'geo_capacity'),  # Could be removed if all null
    ("payload_mass_to_sso", 'sso_capacity'),
    ("liftoff_thrust_in_kN", 'to_thrust'),
]

# The first two columns come from the launch record itself, the rest from its rocket configuration
header = ["ID", "launch_date"] + [column_name for column_name, config_key in config_fields]

# One row per launch, with values in the same order as `header`
rocket_table = []
for launch in launch_data:
    configuration = launch['rocket']['configuration']
    row = [launch['id'], launch['net']]
    row.extend(configuration[config_key] for column_name, config_key in config_fields)
    rocket_table.append(row)

In [15]:
# Build the DataFrame from the collected rows, labelling the columns with `header`
rocket_dataframe = pd.DataFrame(rocket_table, columns = header)

In [16]:
# Preview the first 10 rows
rocket_dataframe.head(10)

Unnamed: 0,ID,launch_date,rocket_name,rocket_full_name,reusable,min_no_stages,max_no_stages,rocket_length_in_meters,rocket_diameter_in_meters,launch_cost,liftoff_mass_in_tons,payload_mass_to_leo,payload_mass_to_gto,payload_mass_to_geo,payload_mass_to_sso,liftoff_thrust_in_kN
0,e3df2ecd-c239-472f-95e4-2b89b4f75800,1957-10-04T19:28:34Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
1,f8c9f344-a6df-4f30-873a-90fe3a7840b3,1957-11-03T02:30:00Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
2,535c1a09-97c8-4f96-bb64-6336d4bcb1fb,1957-12-06T16:44:35Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0
3,1b9e28d0-c531-44b0-9b37-244e62a6d3f4,1958-02-01T03:47:56Z,Juno-I,Juno-I,False,4.0,4.0,21.2,1.78,,29.0,11.0,,,,416.0
4,48bc7deb-b2e1-46c2-ab63-0ce00fbd192b,1958-02-05T07:33:00Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0
5,896e8af6-d256-4a5b-ab15-2f25c84e90e3,1958-03-05T18:27:57Z,Juno-I,Juno-I,False,4.0,4.0,21.2,1.78,,29.0,11.0,,,,416.0
6,74d39bb8-34a6-4a8b-8554-d2d3ec22aee6,1958-03-17T12:15:41Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0
7,b4e501ff-083c-47d6-9ff0-63ec1bf035c3,1958-03-26T17:38:01Z,Juno-I,Juno-I,False,4.0,4.0,21.2,1.78,,29.0,11.0,,,,416.0
8,59d2de37-4c22-495f-8718-4b22f5f34ab7,1958-04-27T07:00:35Z,Sputnik 8A91,Sputnik 8A91,False,1.0,1.0,,,,,,,,,
9,de282e74-e03b-411e-9633-2d1497629893,1958-04-29T02:53:00Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0


In [17]:
# Preview the last 10 rows
rocket_dataframe.tail(10)

Unnamed: 0,ID,launch_date,rocket_name,rocket_full_name,reusable,min_no_stages,max_no_stages,rocket_length_in_meters,rocket_diameter_in_meters,launch_cost,liftoff_mass_in_tons,payload_mass_to_leo,payload_mass_to_gto,payload_mass_to_geo,payload_mass_to_sso,liftoff_thrust_in_kN
7326,94ec1605-bd9c-47fa-9f36-69148ee4e370,2025-10-29T16:35:20Z,Falcon 9,Falcon 9 Block 5,True,1.0,2.0,70.0,3.65,52000000.0,549.0,22800.0,8300.0,,,7607.0
7327,5ca83f30-9e3d-439c-ac1d-b57953943355,2025-10-31T15:44:46Z,Long March 2,Long March 2F/G,False,2.0,2.0,62.0,3.35,,464.0,8399.0,,,,
7328,e92f090c-cb6a-4e1a-8e07-5a97dbf1c450,2025-10-31T20:41:10Z,Falcon 9,Falcon 9 Block 5,True,1.0,2.0,70.0,3.65,52000000.0,549.0,22800.0,8300.0,,,7607.0
7329,41d2ac1b-a506-47ae-8e44-8b65f5d6f239,2025-11-02T05:09:59Z,Falcon 9,Falcon 9 Block 5,True,1.0,2.0,70.0,3.65,52000000.0,549.0,22800.0,8300.0,,,7607.0
7330,05c00251-c8db-4922-9244-d1e407427962,2025-11-02T11:56:00Z,LVM-3 (GSLV Mk III),Launch Vehicle Mark-3 (GSLV Mk III),False,0.0,3.0,43.4,4.0,46000000.0,629.0,10000.0,5000.0,,,11898.0
7331,5fc609f3-4aee-4ffd-968d-b4cddcd9e381,2025-11-03T03:47:00Z,Long March 7A,Long March 7A,False,,,,,,,,,,,
7332,7afcacb9-32aa-41ee-a000-0c7158f324c2,2025-11-04T21:02:17Z,Ariane 62,Ariane 62,False,2.0,2.0,63.0,5.4,85000000.0,530.0,10350.0,5000.0,,6450.0,10370.0
7333,5d816773-89cb-48b9-9bf0-a9c4d8236785,2025-11-05T19:51:00Z,Electron,Electron,False,2.0,3.0,18.0,1.2,6000000.0,13.0,300.0,,,225.0,162.0
7334,9dd2d2b7-302b-4e9e-804c-aa176f606b6f,2025-11-06T01:31:10Z,Falcon 9,Falcon 9 Block 5,True,1.0,2.0,70.0,3.65,52000000.0,549.0,22800.0,8300.0,,,7607.0
7335,6602c88f-cbff-4495-b417-a184ddb0a426,2025-11-06T21:13:50Z,Falcon 9,Falcon 9 Block 5,True,1.0,2.0,70.0,3.65,52000000.0,549.0,22800.0,8300.0,,,7607.0


In [18]:
# Summarise column dtypes and non-null counts to see how much data is missing
rocket_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7336 entries, 0 to 7335
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         7336 non-null   object 
 1   launch_date                7336 non-null   object 
 2   rocket_name                7336 non-null   object 
 3   rocket_full_name           7336 non-null   object 
 4   reusable                   7336 non-null   bool   
 5   min_no_stages              5914 non-null   float64
 6   max_no_stages              5914 non-null   float64
 7   rocket_length_in_meters    5800 non-null   float64
 8   rocket_diameter_in_meters  5799 non-null   float64
 9   launch_cost                2101 non-null   float64
 10  liftoff_mass_in_tons       5702 non-null   float64
 11  payload_mass_to_leo        4993 non-null   float64
 12  payload_mass_to_gto        2323 non-null   float64
 13  payload_mass_to_geo        381 non-null    float

In [19]:
# Keep only the rows where at least one column is missing
null_rows_mask = rocket_dataframe.isna().any(axis="columns")
null_df = rocket_dataframe.loc[null_rows_mask]
null_df.head()

Unnamed: 0,ID,launch_date,rocket_name,rocket_full_name,reusable,min_no_stages,max_no_stages,rocket_length_in_meters,rocket_diameter_in_meters,launch_cost,liftoff_mass_in_tons,payload_mass_to_leo,payload_mass_to_gto,payload_mass_to_geo,payload_mass_to_sso,liftoff_thrust_in_kN
0,e3df2ecd-c239-472f-95e4-2b89b4f75800,1957-10-04T19:28:34Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
1,f8c9f344-a6df-4f30-873a-90fe3a7840b3,1957-11-03T02:30:00Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
2,535c1a09-97c8-4f96-bb64-6336d4bcb1fb,1957-12-06T16:44:35Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0
3,1b9e28d0-c531-44b0-9b37-244e62a6d3f4,1958-02-01T03:47:56Z,Juno-I,Juno-I,False,4.0,4.0,21.2,1.78,,29.0,11.0,,,,416.0
4,48bc7deb-b2e1-46c2-ab63-0ce00fbd192b,1958-02-05T07:33:00Z,Vanguard,Vanguard,False,3.0,3.0,23.0,1.14,,10.0,9.0,,,,123.0


In [20]:
# Rows where the rocket length or the rocket diameter is missing
dimension_columns = ['rocket_length_in_meters', 'rocket_diameter_in_meters']
null_mask_2 = rocket_dataframe[dimension_columns].isna().any(axis="columns")
null_df_2 = rocket_dataframe.loc[null_mask_2]
null_df_2.head()

Unnamed: 0,ID,launch_date,rocket_name,rocket_full_name,reusable,min_no_stages,max_no_stages,rocket_length_in_meters,rocket_diameter_in_meters,launch_cost,liftoff_mass_in_tons,payload_mass_to_leo,payload_mass_to_gto,payload_mass_to_geo,payload_mass_to_sso,liftoff_thrust_in_kN
0,e3df2ecd-c239-472f-95e4-2b89b4f75800,1957-10-04T19:28:34Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
1,f8c9f344-a6df-4f30-873a-90fe3a7840b3,1957-11-03T02:30:00Z,Sputnik 8K74PS,Sputnik 8K74PS,False,1.0,1.0,,,,,,,,,
8,59d2de37-4c22-495f-8718-4b22f5f34ab7,1958-04-27T07:00:35Z,Sputnik 8A91,Sputnik 8A91,False,1.0,1.0,,,,,,,,,
10,6a7f56f9-2565-4b6c-b0c5-23e9c74a2368,1958-05-15T07:00:35Z,Sputnik 8A91,Sputnik 8A91,False,1.0,1.0,,,,,,,,,
46,fdb56feb-e46f-41ba-80c8-c84be17c1888,1959-08-21T12:00:00Z,Little Joe,Little Joe,False,1.0,1.0,17.0,,,,,,,,


In [21]:
# Non-null counts per column for the null_df_2 subset
null_df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1546 entries, 0 to 7331
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         1546 non-null   object 
 1   launch_date                1546 non-null   object 
 2   rocket_name                1546 non-null   object 
 3   rocket_full_name           1546 non-null   object 
 4   reusable                   1546 non-null   bool   
 5   min_no_stages              165 non-null    float64
 6   max_no_stages              165 non-null    float64
 7   rocket_length_in_meters    10 non-null     float64
 8   rocket_diameter_in_meters  9 non-null      float64
 9   launch_cost                0 non-null      float64
 10  liftoff_mass_in_tons       6 non-null      float64
 11  payload_mass_to_leo        38 non-null     float64
 12  payload_mass_to_gto        0 non-null      float64
 13  payload_mass_to_geo        0 non-null      float64
 1

In [23]:
# Distinct rocket names that appear in null_df
null_df["rocket_name"].unique()

array(['Sputnik 8K74PS', 'Vanguard', 'Juno-I', 'Sputnik 8A91',
       'Project Pilot', 'Thor Able I', 'Vostok-L 8K72', 'Juno II',
       'Atlas B', 'Thor Agena A', 'Thor Able III', 'Little Joe',
       'Atlas LV-3B', 'Thor Able II', 'Atlas-Able', 'Atlas Agena A',
       'Thor Able IV', 'Thor DM-21 Ablestar', 'Thor Delta', 'Vostok 8K72',
       'Blue Scout Jr', 'Scout X-1', 'Molniya 8K78', 'Thor DM-21 Agena-B',
       'Redstone', 'Vostok', 'Atlas Agena B', 'Delta DM-19',
       'Trailblazer 1', 'Saturn I', 'Kosmos-2I 63S1', 'Blue Scout II',
       'Scout X-2', 'Trailblazer 2', 'Scout X-2M', 'Vostok 8A92',
       'Thor Agena D', 'North American X-15', 'Scout X-3A', 'Scout X-3',
       'Scout X-3M', 'Thor SLV-2A Agena D', 'Scout X-4',
       'Thor SLV-2A Agena B', 'Atlas Agena D', 'Little Joe II',
       'Thor SLV-2 Agena D', 'Scout X-2B', 'Sputnik 11A59', 'Voskhod',
       'Thor Delta C', 'Atlas Centaur', 'Thor SLV-2 Agena B',
       'Nike Cajun', 'Titan II', 'Atlas D', 'Atlas SLV-3 Agen