# Data Extraction - Rocket

Here, we extract the parameters related to the rocket for each launch from the raw data. We first read in the JSON data that we saved from the full API call, then extract the key parameters, examine them as a dataframe, and save the result as a TSV.

* This notebook draws from testing notebooks 'DSCI511 data extraction test_v2', 'DSCI511 data extraction_rocket and launch', and 'Raw Data Filtering' 

### Imports and Installs

In [1]:
import json
from pprint import pprint
import zipfile
import pandas as pd

### Read in JSON data from zip file

In [3]:
#Location of the zipped raw data and the name of the JSON file stored inside it
zip_path = "../raw data/raw_launch_data.json.zip"
launch_data_filename = 'raw_launch_data.json'

#Open the archive and parse the JSON member straight from it
with zipfile.ZipFile(zip_path, "r") as archive, archive.open(launch_data_filename) as json_file:
    raw_launch_data = json.load(json_file)

In [4]:
#Check data type is correct, what are the keys, look at the metadata
print(type(raw_launch_data))
pprint(raw_launch_data.keys())
print()
#Each piece of an implicitly concatenated string needs its own f prefix; without it the
#collection date placeholder is printed literally instead of being filled in
print(f"Collector: {raw_launch_data['collector']}, Total launches: {raw_launch_data['total_launches']}, "
    f"Collection date: {raw_launch_data['collection_date']}")
print("Total number of launches:", len(raw_launch_data['launches']))

<class 'dict'>
dict_keys(['collector', 'total_launches', 'collection_date', 'launches'])

Collector: RyanPtest, Total launches: 7336, Collection date: {raw_launch_data['collection_date']}
Total number of launches: 7336


### Select data to use 
In the code below, select whether to use a smaller sample of the data or the full data. Each item in the list within `raw_launch_data["launches"]` is an individual launch.

In [5]:
#Number of launches to keep while testing, counted from the start of the list (e.g. 100).
#Leave as None to use the full data.
SAMPLE_SIZE = None

all_launches = raw_launch_data["launches"]

#Use the full list unless a sample size was requested above
data_sample = all_launches if SAMPLE_SIZE is None else all_launches[:SAMPLE_SIZE]

### Create a table for rocket data in a data loop

For each launch dictionary, grab the specific information that we want and add it to a table (a list of lists). Make sure to include the launch `ID`, so that this table can be linked to the other dataframes.

In [6]:
#Headers for the data we will gather for each launch:
rocket_header = ["ID", "Rocket_name", "Manufacturer_name", "Manufacturer_country", "Manufacturer_company_type", 
                 "Reusability", "Min_no_stages", "Max_no_stages", "Rocket_length", "Rocket_diameter", "Launch_cost", "Liftoff_mass_tons", 
                 "Liftoff_thrust_kN", "Rocket_apogee", "Payload_mass"]

#Orbit label and the configuration key that stores the rocket payload mass to that orbit (kg)
payload_fields = [("LEO", "leo_capacity"), ("GTO", "gto_capacity"),
                  ("GEO", "geo_capacity"), ("SSO", "sso_capacity")]


def extract_rocket_row(launch):
    """Build one rocket_table row from a single launch dictionary.

    Parameters
    ----------
    launch : dict
        One item of the launches list, containing at least the 'id' and
        'rocket' -> 'configuration' entries.

    Returns
    -------
    list
        The values in the same order as rocket_header. The manufacturer fields are
        None when the corresponding nested entry is empty or missing a value, and
        the payload field is None when no payload capacity is available.
    """
    config = launch['rocket']['configuration']

    #The families, manufacturer and country entries are lists that can be empty, which
    #caused 'list index out of range' errors - fill with None in those cases
    families = config['families']
    family = families[0] if families else None
    manufacturers = family['manufacturer'] if family else None
    manufacturer = manufacturers[0] if manufacturers else None

    manuf_name = manuf_country = manuf_type = None
    if manufacturer:
        manuf_name = manufacturer['name']
        if manufacturer['country']:
            manuf_country = manufacturer['country'][0]['name']
        if manufacturer['type']:
            manuf_type = manufacturer['type']['name']

    #Combine the capacities that are available into a single string for the 'payload'
    #column, e.g. "LEO 300.0 SSO 225.0 "; an empty result means nothing was available
    payload = "".join(f"{label} {config[key]} "
                      for label, key in payload_fields if config[key]) or None

    #The launch service provider is not included here - it is part of the Mission dataset
    return [launch['id'], config['full_name'], manuf_name, manuf_country, manuf_type,
            config['reusable'], config['min_stage'], config['max_stage'],
            config['length'], config['diameter'], config['launch_cost'],
            config['launch_mass'], config['to_thrust'], config['apogee'], payload]


#One row per launch, ordered like rocket_header
rocket_table = [extract_rocket_row(launch) for launch in data_sample]

### Create Pandas dataframe

In [7]:
#Build the dataframe from the list of rows, using rocket_header as the column names
rocket_dataframe = pd.DataFrame(rocket_table, columns = rocket_header)
#Display the last few rows as a quick check of the result
rocket_dataframe.tail()

Unnamed: 0,ID,Rocket_name,Manufacturer_name,Manufacturer_country,Manufacturer_company_type,Reusability,Min_no_stages,Max_no_stages,Rocket_length,Rocket_diameter,Launch_cost,Liftoff_mass_tons,Liftoff_thrust_kN,Rocket_apogee,Payload_mass
7331,5fc609f3-4aee-4ffd-968d-b4cddcd9e381,Long March 7A,China Aerospace Science and Technology Corpora...,China,Government,False,,,,,,,,,
7332,7afcacb9-32aa-41ee-a000-0c7158f324c2,Ariane 62,Arianespace,France,Commercial,False,2.0,2.0,63.0,5.4,85000000.0,530.0,10370.0,,LEO 10350.0 GTO 5000.0 SSO 6450.0
7333,5d816773-89cb-48b9-9bf0-a9c4d8236785,Electron,,,,False,2.0,3.0,18.0,1.2,6000000.0,13.0,162.0,,LEO 300.0 SSO 225.0
7334,9dd2d2b7-302b-4e9e-804c-aa176f606b6f,Falcon 9 Block 5,SpaceX,United States of America,Commercial,True,1.0,2.0,70.0,3.65,52000000.0,549.0,7607.0,200.0,LEO 22800.0 GTO 8300.0
7335,6602c88f-cbff-4495-b417-a184ddb0a426,Falcon 9 Block 5,SpaceX,United States of America,Commercial,True,1.0,2.0,70.0,3.65,52000000.0,549.0,7607.0,200.0,LEO 22800.0 GTO 8300.0


### Examine some content in the dataframe

In [8]:
#Summarize the columns: non-null counts and dtypes
rocket_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7336 entries, 0 to 7335
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         7336 non-null   object 
 1   Rocket_name                7336 non-null   object 
 2   Manufacturer_name          6677 non-null   object 
 3   Manufacturer_country       6677 non-null   object 
 4   Manufacturer_company_type  6677 non-null   object 
 5   Reusability                7336 non-null   bool   
 6   Min_no_stages              5914 non-null   float64
 7   Max_no_stages              5914 non-null   float64
 8   Rocket_length              5800 non-null   float64
 9   Rocket_diameter            5799 non-null   float64
 10  Launch_cost                2101 non-null   float64
 11  Liftoff_mass_tons          5702 non-null   float64
 12  Liftoff_thrust_kN          4160 non-null   float64
 13  Rocket_apogee              1348 non-null   float

In [9]:
#Count how many rows (launches) fall under each manufacturer name
rocket_dataframe['Manufacturer_name'].value_counts()

Manufacturer_name
Progress Rocket Space Center                              1029
Lockheed Martin                                            665
Soviet Space Program                                       632
Strategic Rocket Forces                                    621
China Aerospace Science and Technology Corporation         609
SpaceX                                                     596
Khrunichev State Research and Production Space Center      440
Yuzhnoye Design Bureau                                     342
McDonnell Douglas                                          330
Boeing                                                     301
Arianespace                                                265
Energia                                                    158
Vought                                                     118
Indian Space Research Organization                          93
Northrop Grumman Space Systems                              93
Mitsubishi Heavy Industries          

In [10]:
#Count how many rows (launches) fall under each manufacturer company type
rocket_dataframe['Manufacturer_company_type'].value_counts()

Manufacturer_company_type
Commercial    3986
Government    2689
Private          2
Name: count, dtype: int64

In [11]:
#List the dtype of each column
rocket_dataframe.dtypes

ID                            object
Rocket_name                   object
Manufacturer_name             object
Manufacturer_country          object
Manufacturer_company_type     object
Reusability                     bool
Min_no_stages                float64
Max_no_stages                float64
Rocket_length                float64
Rocket_diameter              float64
Launch_cost                  float64
Liftoff_mass_tons            float64
Liftoff_thrust_kN            float64
Rocket_apogee                float64
Payload_mass                  object
dtype: object

### Examine which columns have null values 

In [12]:
#For each column, flag whether it contains at least one null value
rocket_dataframe.isnull().any()

ID                           False
Rocket_name                  False
Manufacturer_name             True
Manufacturer_country          True
Manufacturer_company_type     True
Reusability                  False
Min_no_stages                 True
Max_no_stages                 True
Rocket_length                 True
Rocket_diameter               True
Launch_cost                   True
Liftoff_mass_tons             True
Liftoff_thrust_kN             True
Rocket_apogee                 True
Payload_mass                  True
dtype: bool

### Save data as a TSV

In [13]:
#Since some fields have commas, not ideal to save as a CSV - saving it as a TSV instead
#(index=False leaves the dataframe's row index out of the file)

rocket_dataframe.to_csv("clean_rocket_data.tsv", sep='\t', index=False)

-----

### Below - Test Data Extraction
Here, we tested how to find the parameters that we ultimately saved in our tables

In [9]:
#Top-level keys of the first launch in the sample
data_sample[0].keys()

dict_keys(['id', 'url', 'name', 'response_mode', 'slug', 'launch_designator', 'status', 'last_updated', 'net', 'net_precision', 'window_end', 'window_start', 'image', 'infographic', 'probability', 'weather_concerns', 'failreason', 'hashtag', 'launch_service_provider', 'rocket', 'mission', 'pad', 'webcast_live', 'program', 'orbital_launch_attempt_count', 'location_launch_attempt_count', 'pad_launch_attempt_count', 'agency_launch_attempt_count', 'orbital_launch_attempt_count_year', 'location_launch_attempt_count_year', 'pad_launch_attempt_count_year', 'agency_launch_attempt_count_year', 'flightclub_url', 'updates', 'info_urls', 'vid_urls', 'timeline', 'pad_turnaround', 'mission_patches'])

In [10]:
#Look at the first launch: its window start, its id, and the full nested rocket entry
first_launch = data_sample[0]
print(first_launch['window_start'])
print(first_launch['id'])
print()
pprint(first_launch['rocket'])

1957-10-04T19:28:34Z
e3df2ecd-c239-472f-95e4-2b89b4f75800

{'configuration': {'active': False,
                   'alias': '',
                   'apogee': None,
                   'attempted_landings': 0,
                   'consecutive_successful_landings': 0,
                   'consecutive_successful_launches': 2,
                   'description': 'An early Russian rocket designed by Sergei '
                                  'Korolev in the Soviet Union',
                   'diameter': None,
                   'failed_landings': 0,
                   'failed_launches': 0,
                   'families': [{'active': False,
                                 'attempted_landings': 0,
                                 'consecutive_successful_landings': 0,
                                 'consecutive_successful_launches': 3,
                                 'description': '',
                                 'failed_landings': 0,
                                 'failed_launches': 1,
    

### Data Structure (Ryan, Jillian updates)

The data is a list of dictionaries that detail the information based on the rocket. From my search, it appears that each dictionary is related to a certain rocket. Here are my initial findings from looking at how the data is structured.

* Within each launch dictionary there are multiple keys.
    - Some keys behave as key value pairs
    - Some keys have dictionaries as their values, so they may have further nested levels

## Feature Navigation

In this section I aim to locate specific features we wish to access to create our own dataset


### The Rocket

1. Rocket Name, Specific Variant Used
2. Company that Conducted the launch
3. Model Type
4. Payload

For the example we will be using the first launch

In [11]:
#Pretty-print the first manufacturer of the first rocket family for the first launch
pprint(data_sample[0]['rocket']['configuration']['families'][0]['manufacturer'][0])

{'abbrev': 'OKB-1',
 'administrator': None,
 'attempted_landings': 0,
 'attempted_landings_payload': 0,
 'attempted_landings_spacecraft': 0,
 'consecutive_successful_landings': 0,
 'consecutive_successful_launches': 0,
 'country': [{'alpha_2_code': 'RU',
              'alpha_3_code': 'RUS',
              'id': 5,
              'name': 'Russia',
              'nationality_name': 'Russian',
              'nationality_name_composed': 'Russo'}],
 'description': None,
 'failed_landings': 0,
 'failed_landings_payload': 0,
 'failed_landings_spacecraft': 0,
 'failed_launches': 0,
 'featured': False,
 'founding_year': 1946,
 'id': 1000,
 'image': None,
 'info_url': None,
 'launchers': '',
 'logo': None,
 'name': 'Energia',
 'parent': None,
 'pending_launches': 0,
 'response_mode': 'normal',
 'social_logo': None,
 'social_media_links': [],
 'spacecraft': '',
 'successful_landings': 0,
 'successful_landings_payload': 0,
 'successful_landings_spacecraft': 0,
 'successful_launches': 0,
 'total_laun

In [12]:
# Rocket Name and Company Name
# Look up the nested entries of the first launch once, then print the fields of interest
first_config = data_sample[0]['rocket']['configuration']
first_manufacturer = first_config['families'][0]['manufacturer'][0]

print("Rocket Name : ", first_config['name'])
print("Company Name : ", first_manufacturer['name'])
print("Company Founding Year : ", first_manufacturer['founding_year'])
print("Country Affiliation : ", first_manufacturer['country'][0]['name'])
print("Company Type : ", first_manufacturer['type']['name'])

Rocket Name :  Sputnik 8K74PS
Company Name :  Energia
Company Founding Year :  1946
Country Affiliation :  Russia
Company Type :  Government


In [13]:
#Check what the 'failreason' field holds for the first launch
data_sample[0]['failreason']

''

In [14]:
# Launch related parameters
# Look up the first launch and its pad once, then print the fields of interest
first_launch = data_sample[0]
first_pad = first_launch['pad']

print("Launch related parameters")
print("---------------")
print("Date of Launch : ", first_launch['net'])
print("Lauch Status : ", first_launch['status']['name'])
print("Short Form Status:", first_launch['status']['abbrev'])
print("Launch pad location name:", first_pad['location']['name'])
print("Launch pad name:", first_pad['name'])
print("Launch pad country name:", first_pad['country']['name'])
# This line prints the country's id, so it is labelled as an ID rather than a second "name"
print("Launch pad country ID:", first_pad['country']['id'])
print("Launch pad lattitude:", first_pad['latitude'])
print("Launch pad longitude:", first_pad['longitude'])

print()

Launch related parameters
---------------
Date of Launch :  1957-10-04T19:28:34Z
Lauch Status :  Launch Successful
Short Form Status: Success
Launch pad location name: Baikonur Cosmodrome, Republic of Kazakhstan
Launch pad name: 1/5
Launch pad country name: Kazakhstan
Launch pad country name: 44
Launch pad lattitude: 45.92
Launch pad longitude: 63.342



In [15]:
### Rocket Data - Ryan

# Look up the nested entries of the last launch once, then print the fields of interest
last_config = data_sample[-1]['rocket']['configuration']
last_manufacturer = last_config['families'][0]['manufacturer'][0]

# Rocket Name and Company Name
print("Rocket Name : ", last_config['name'])
print("Company Name : ", last_manufacturer['name'])
print("Company Founding Year : ", last_manufacturer['founding_year'])
print("Country Affiliation : ", last_manufacturer['country'][0]['name'])
print("Company Type : ", last_manufacturer['type']['name'])

#Rocket Related Parameters
print("Is the Rocket Reusable : ", last_config['reusable'])
print("Min. No. Stages : ", last_config['min_stage'])
print("Max No. Stages : ", last_config['max_stage'])
print("Rocket Length (meters): ", last_config['length'])
print("Rocket Diameter (meters): ", last_config['diameter'])
print("Rocket Launch Cost : ", last_config['launch_cost'])
print("Rocket Liftoff Mass (Tons): ", last_config['launch_mass'])
print("Rocket Payload Mass to LEO (kg): ", last_config['leo_capacity'])
print("Rocket Payload Mass to GTO (kg): ", last_config['gto_capacity'])
print("Rocket Payload Mass to GEO (kg): ", last_config['geo_capacity'])  #Could be removed if all null
print("Rocket Payload Mass to SSO (kg): ", last_config['sso_capacity'])  #Could be removed if all null
print("Rocket Liftoff Thrust (kN): ", last_config['to_thrust'])
print("Rocket Apogee: ", last_config['apogee']) #Need to figure out what unit is used for apogee

Rocket Name :  Falcon 9
Company Name :  SpaceX
Company Founding Year :  2002
Country Affiliation :  United States of America
Company Type :  Commercial
Is the Rocket Reusable :  True
Min. No. Stages :  1
Max No. Stages :  2
Rocket Length (meters):  70.0
Rocket Diameter (meters):  3.65
Rocket Launch Cost :  52000000
Rocket Liftoff Mass (Tons):  549.0
Rocket Payload Mass to LEO (kg):  22800.0
Rocket Payload Mass to GTO (kg):  8300.0
Rocket Payload Mass to GEO (kg):  None
Rocket Payload Mass to SSO (kg):  None
Rocket Liftoff Thrust (kN):  7607.0
Rocket Apogee:  200.0


In [16]:
#Rocket data - Jillian additions
#Full rocket name of the last launch, followed by every key available in its configuration
last_config = data_sample[-1]['rocket']['configuration']
print(last_config["full_name"])
print()

print(last_config.keys())

Falcon 9 Block 5

dict_keys(['response_mode', 'id', 'url', 'name', 'families', 'full_name', 'variant', 'active', 'is_placeholder', 'manufacturer', 'program', 'reusable', 'image', 'info_url', 'wiki_url', 'description', 'alias', 'min_stage', 'max_stage', 'length', 'diameter', 'maiden_flight', 'launch_cost', 'launch_mass', 'leo_capacity', 'gto_capacity', 'geo_capacity', 'sso_capacity', 'to_thrust', 'apogee', 'total_launch_count', 'consecutive_successful_launches', 'successful_launches', 'failed_launches', 'pending_launches', 'attempted_landings', 'successful_landings', 'failed_landings', 'consecutive_successful_landings', 'fastest_turnaround'])


In [17]:
#Launch service provider of the last launch: its name, then every key available for it
last_provider = data_sample[-1]["launch_service_provider"]
print(last_provider["name"])
print()

print(last_provider.keys())

SpaceX

dict_keys(['response_mode', 'id', 'url', 'name', 'abbrev', 'type', 'featured', 'country', 'description', 'administrator', 'founding_year', 'launchers', 'spacecraft', 'parent', 'image', 'logo', 'social_logo', 'total_launch_count', 'consecutive_successful_launches', 'successful_launches', 'failed_launches', 'pending_launches', 'consecutive_successful_landings', 'successful_landings', 'failed_landings', 'attempted_landings', 'successful_landings_spacecraft', 'failed_landings_spacecraft', 'attempted_landings_spacecraft', 'successful_landings_payload', 'failed_landings_payload', 'attempted_landings_payload', 'info_url', 'wiki_url', 'social_media_links'])


In [18]:
#Top-level keys of the last launch in the sample
print(data_sample[-1].keys())
print()
#print(data_sample[-1]["program"]) #.keys())

dict_keys(['id', 'url', 'name', 'response_mode', 'slug', 'launch_designator', 'status', 'last_updated', 'net', 'net_precision', 'window_end', 'window_start', 'image', 'infographic', 'probability', 'weather_concerns', 'failreason', 'hashtag', 'launch_service_provider', 'rocket', 'mission', 'pad', 'webcast_live', 'program', 'orbital_launch_attempt_count', 'location_launch_attempt_count', 'pad_launch_attempt_count', 'agency_launch_attempt_count', 'orbital_launch_attempt_count_year', 'location_launch_attempt_count_year', 'pad_launch_attempt_count_year', 'agency_launch_attempt_count_year', 'flightclub_url', 'updates', 'info_urls', 'vid_urls', 'timeline', 'pad_turnaround', 'mission_patches'])



In [19]:
#Mission description of the last launch
print(data_sample[-1]["mission"]['description'])

A batch of 28 satellites for the Starlink mega-constellation - SpaceX's project for space-based Internet communication system.


In [20]:
#Name of the launch service provider for the last launch
print(data_sample[-1]["launch_service_provider"]["name"])

SpaceX


In [21]:
#Keys of the rocket entry for the last launch, then its payloads list
last_rocket = data_sample[-1]["rocket"]
print(last_rocket.keys())
print(last_rocket["payloads"]) #This seems to always be empty

dict_keys(['id', 'configuration', 'launcher_stage', 'spacecraft_stage', 'payloads'])
[]


In [22]:
#Keys available in the rocket configuration of the last launch
print(data_sample[-1]["rocket"]["configuration"].keys()) #["spacecraft_stage"]["docking_events"]["payload"])
print()
#Spacecraft stage entry of the launch at index 5
print(data_sample[5]["rocket"]["spacecraft_stage"]) #.keys())

dict_keys(['response_mode', 'id', 'url', 'name', 'families', 'full_name', 'variant', 'active', 'is_placeholder', 'manufacturer', 'program', 'reusable', 'image', 'info_url', 'wiki_url', 'description', 'alias', 'min_stage', 'max_stage', 'length', 'diameter', 'maiden_flight', 'launch_cost', 'launch_mass', 'leo_capacity', 'gto_capacity', 'geo_capacity', 'sso_capacity', 'to_thrust', 'apogee', 'total_launch_count', 'consecutive_successful_launches', 'successful_launches', 'failed_launches', 'pending_launches', 'attempted_landings', 'successful_landings', 'failed_landings', 'consecutive_successful_landings', 'fastest_turnaround'])

[]
