In [12]:
# DSCI 511 Term Project: Launch Parameters Extraction
##**Author:** Innocent Gumunyu
##**Purpose:** Extracting and cleaning launch related data 
##**Date:** Novembe 2025




In [16]:
# Importing modules
import json
import pandas as pd
import csv
from pprint import pprint

In [17]:
try:
    with open("C:\\Users\\innoc\\Desktop\\Modules\\Part 1.1\\Acquisition and processing DSCI 511\\project\\raw_launch_data.json", 'r', encoding='utf-8') as f:
        full_data = json.load(f)
except FileNotFoundError:
    print("ERROR: raw_baseline_launches.json not found.")
    exit()

# list of all launches is inside the 'launches' key
all_launches = full_data.get('launches', [])
print(f"Loaded {len(all_launches)} total launches.")

USE_SAMPLE = False

if USE_SAMPLE:
    data_source = all_launches[-50:] # Sample last 50
else:
    data_source = all_launches # Full dataset

print(f"Processing {len(data_source)} launches...")

Loaded 7336 total launches.
Processing 7336 launches...


In [18]:
# print the first launch in the sample to see all the keys
pprint(data_source[0])

{'agency_launch_attempt_count': 1,
 'agency_launch_attempt_count_year': 1,
 'failreason': '',
 'flightclub_url': None,
 'hashtag': None,
 'id': 'e3df2ecd-c239-472f-95e4-2b89b4f75800',
 'image': {'credit': None,
           'id': 1844,
           'image_url': 'https://thespacedevs-prod.nyc3.digitaloceanspaces.com/media/images/sputnik_8k74ps_image_20210830185541.jpg',
           'license': {'id': 1, 'link': None, 'name': 'Unknown', 'priority': 9},
           'name': '[AUTO] Sputnik 8K74PS - image',
           'single_use': True,
           'thumbnail_url': 'https://thespacedevs-prod.nyc3.digitaloceanspaces.com/media/images/255bauto255d__image_thumbnail_20240305193923.jpeg',
           'variants': []},
 'info_urls': [],
 'infographic': None,
 'last_updated': '2024-03-17T19:17:35Z',
 'launch_designator': '1957-001',
 'launch_service_provider': {'abbrev': 'CCCP',
                             'administrator': None,
                             'attempted_landings': 0,
                        

In [19]:
# Function to extract timeline stages
def extract_timeline_stages(timeline):
    """Extract timeline stages information"""
    if not timeline:
        return "No timeline data"
    
    stages = []
    for event in timeline:
        stage_name = event.get('name', 'Unknown Stage')
        stage_desc = event.get('description', 'No description')
        stage_info = f"{stage_name}: {stage_desc}"
        stages.append(stage_info)
    
    return "; ".join(stages) if stages else "No stage events"


In [20]:
# Extract the required parameters for each launch
launch_data_list = []

for launch in data_source:
    # 1) FAIL REASON
    fail_reason = launch.get('failreason', '') or "N/A"
    
    # 2) LOCATION LAUNCH ATTEMPT COUNT
    location_launch_attempt = launch.get('location_launch_attempt_count', 'N/A')
    
    # 3) LOCATION LAUNCH ATTEMPT COUNT YEAR
    location_launch_attempt_year = launch.get('location_launch_attempt_count_year', 'N/A')
    
    # 4) PAD
    pad_data = launch.get('pad', {})
    pad_name = pad_data.get('name', 'N/A') if pad_data else 'N/A'
    
    # 5) PAD LAUNCH ATTEMPT COUNT
    pad_launch_attempt_count = launch.get('pad_launch_attempt_count', 'N/A')
    
    # 6) STATUS - SUCCESS/FAILURE
    status_data = launch.get('status', {})
    status_name = status_data.get('name', 'Unknown') if status_data else 'Unknown'
    
    # 7) TIMELINE STAGES
    timeline_data = launch.get('timeline', [])
    timeline_stages = extract_timeline_stages(timeline_data)
    
    # 8) WINDOW-END
    window_end = launch.get('window_end', 'N/A')
    
    # 9) WINDOW-START
    window_start = launch.get('window_start', 'N/A')
    
    # Additional identifying information
    launch_name = launch.get('name', 'Unknown')
    launch_date = launch.get('net', 'N/A')
    
    launch_info = {
        'Launch Name': launch_name,
        'Launch Date': launch_date,
        'Fail Reason': fail_reason,
        'Location Launch Attempt Count': location_launch_attempt,
        'Location Launch Attempt Count Year': location_launch_attempt_year,
        'Pad': pad_name,
        'Pad Launch Attempt Count': pad_launch_attempt_count,
        'Status': status_name,
        'Timeline Stages': timeline_stages,
        'Window Start': window_start,
        'Window End': window_end
    }
    
    launch_data_list.append(launch_info)


In [21]:
# Create DataFrame
df = pd.DataFrame(launch_data_list)

# Display the table
print(f"\n{'='*80}")
print(f"LAUNCH DATA TABLE - {len(df)} LAUNCHES")
print(f"{'='*80}")

print(f"\nFirst 10 launches:")
print(df.head(10).to_string())

# Display summary statistics
print(f"\n{'='*50}")
print("SUMMARY STATISTICS")
print(f"{'='*50}")

print(f"Total launches processed: {len(df)}")
print(f"Successful launches: {len(df[df['Status'] == 'Launch Successful'])}")
print(f"Failed launches: {len(df[df['Status'] != 'Launch Successful'])}")
print(f"Launches with fail reasons: {len(df[df['Fail Reason'] != 'N/A'])}")
print(f"Unique launch pads: {df['Pad'].nunique()}")

# Show status distribution
print(f"\nStatus Distribution:")
print(df['Status'].value_counts())

# Show top pads
print(f"\nTop 10 Most Used Launch Pads:")
print(df['Pad'].value_counts().head(10))

# Save to CSV
output_file = "launch_analysis_table.csv"
df.to_csv(output_file, index=False, encoding='utf-8')
print(f"\nData saved to: {output_file}")


LAUNCH DATA TABLE - 7336 LAUNCHES

First 10 launches:
                  Launch Name           Launch Date Fail Reason  Location Launch Attempt Count  Location Launch Attempt Count Year                 Pad  Pad Launch Attempt Count             Status   Timeline Stages          Window Start            Window End
0  Sputnik 8K74PS | Sputnik 1  1957-10-04T19:28:34Z         N/A                              1                                   1                 1/5                         1  Launch Successful  No timeline data  1957-10-04T19:28:34Z  1957-10-04T19:28:34Z
1  Sputnik 8K74PS | Sputnik 2  1957-11-03T02:30:00Z         N/A                              2                                   2                 1/5                         2  Launch Successful  No timeline data  1957-11-03T02:30:00Z  1957-11-03T02:30:00Z
2         Vanguard | Vanguard  1957-12-06T16:44:35Z         N/A                              1                                   1  Launch Complex 18A                     

In [22]:
# Display detailed information about failed launches
failed_launches = df[df['Status'] != 'Launch Successful']
if not failed_launches.empty:
    print(f"\n{'='*50}")
    print(f"FAILED LAUNCHES DETAILS ({len(failed_launches)})")
    print(f"{'='*50}")
    for idx, row in failed_launches.head().iterrows():
        print(f"\nLaunch: {row['Launch Name']}")
        print(f"Date: {row['Launch Date']}")
        print(f"Status: {row['Status']}")
        print(f"Fail Reason: {row['Fail Reason']}")
        print(f"Pad: {row['Pad']}")


FAILED LAUNCHES DETAILS (547)

Launch: Vanguard | Vanguard
Date: 1957-12-06T16:44:35Z
Status: Launch Failure
Fail Reason: N/A
Pad: Launch Complex 18A

Launch: Vanguard | Vanguard
Date: 1958-02-05T07:33:00Z
Status: Launch Failure
Fail Reason: N/A
Pad: Launch Complex 18A

Launch: Juno-I | Explorer 2
Date: 1958-03-05T18:27:57Z
Status: Launch Failure
Fail Reason: N/A
Pad: Launch Complex 26A

Launch: Sputnik 8A91 | D-1 1
Date: 1958-04-27T07:00:35Z
Status: Launch Failure
Fail Reason: N/A
Pad: 1/5

Launch: Vanguard | Vanguard
Date: 1958-04-29T02:53:00Z
Status: Launch Failure
Fail Reason: N/A
Pad: Launch Complex 18A


In [23]:
# Display launches with timeline data
timeline_launches = df[df['Timeline Stages'] != "No timeline data"]
if not timeline_launches.empty:
    print(f"\n{'='*50}")
    print(f"LAUNCHES WITH TIMELINE DATA ({len(timeline_launches)})")
    print(f"{'='*50}")
    for idx, row in timeline_launches.head(3).iterrows():
        print(f"\nLaunch: {row['Launch Name']}")
        print(f"Timeline: {row['Timeline Stages'][:150]}...")

print(f"\n{'='*80}")
print("DATA EXTRACTION COMPLETE")
print(f"{'='*80}")


LAUNCHES WITH TIMELINE DATA (333)

Launch: Saturn V | Apollo 11
Timeline: Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No descript...

Launch: Soyuz STA/Fregat | Pléiades-HR 1A, FASat-Charlie (SSOT), 4 x ELISA
Timeline: Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No descript...

Launch: Falcon 9 Block 5 | Starlink Group 6-37
Timeline: Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No description; Unknown Stage: No descript...

DATA EXTRACTION COMPLETE


In [24]:
import os
from datetime import datetime

# Create filename with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"launch_analysis_table_{timestamp}.csv"

# Define the save path (using your project directory)
save_path = "C:\\Users\\innoc\\Desktop\\Modules\\Part 1.1\\Acquisition and processing DSCI 511\\project\\" + csv_filename

# Save the DataFrame to CSV

In [25]:
#saving data as csv

In [26]:
try:
    df.to_csv(save_path, index=False, encoding='utf-8')
    print(f"\n{'='*80}")
    print("CSV FILE SAVED SUCCESSFULLY!")
    print(f"{'='*80}")
    print(f"File saved as: {csv_filename}")
    print(f"Full path: {save_path}")
    print(f"File size: {len(df)} rows × {len(df.columns)} columns")
    
    # Display file information
    file_size = os.path.getsize(save_path)
    print(f"File size: {file_size / 1024:.2f} KB")
    
    # Show column names in the saved file
    print(f"\nColumns saved in CSV:")
    for i, col in enumerate(df.columns, 1):
        print(f"  {i:2d}. {col}")
        
except Exception as e:
    print(f"\n{'='*80}")
    print("ERROR SAVING CSV FILE!")
    print(f"{'='*80}")
    print(f"Error: {e}")
    
    # Try saving to current directory as fallback
    try:
        fallback_path = csv_filename
        df.to_csv(fallback_path, index=False, encoding='utf-8')
        print(f"File saved to fallback location: {fallback_path}")
    except Exception as fallback_error:
        print(f"Fallback save also failed: {fallback_error}")


CSV FILE SAVED SUCCESSFULLY!
File saved as: launch_analysis_table_20251115_195516.csv
Full path: C:\Users\innoc\Desktop\Modules\Part 1.1\Acquisition and processing DSCI 511\project\launch_analysis_table_20251115_195516.csv
File size: 7336 rows × 11 columns
File size: 1349.03 KB

Columns saved in CSV:
   1. Launch Name
   2. Launch Date
   3. Fail Reason
   4. Location Launch Attempt Count
   5. Location Launch Attempt Count Year
   6. Pad
   7. Pad Launch Attempt Count
   8. Status
   9. Timeline Stages
  10. Window Start
  11. Window End
