In [4]:
from datetime import datetime, timedelta
import dateutil.parser as dateparser
from pprint import pprint
import requests
import time
import math

In [5]:
# Wall-clock reference used by the final summary to report total elapsed time.
total_start_time = time.time()

# Five-year collection window, inclusive on both ends.
# The upper bound is the last second of 31 Dec 2024: a bare date resolves to
# midnight, and combined with the `net__lte` filter used by the queries that
# would silently drop every launch that happened during the final day.
# Building the bounds with datetime() also avoids any month/day ambiguity of
# slash-formatted date strings.
# NOTE(review): both datetimes are naive; presumably the API interprets them
# as UTC - confirm.
start_time = datetime(2020, 1, 1)
end_time = datetime(2024, 12, 31, 23, 59, 59)

print("Start date:", start_time.date())
print("End date:", end_time.date())
print("Collecting 5 years of launch data...\n")

Start date: 2020-01-01
End date: 2024-12-31
Collecting 5 years of launch data...



In [6]:
# --- Request budget for the launch API ---
MAX_RESULTS_PER_CALL = 100  # page size asked for on every request
MAX_CALLS_PER_HOUR = 15     # hourly request allowance
# Spacing between requests in seconds: 3600 / 15 = 240 s (4 minutes)
CALL_INTERVAL = 3600 / MAX_CALLS_PER_HOUR

# --- Collection state ---
current_start = start_time  # copy of the window start; not read by the other visible cells
total_expected_calls = 0    # placeholder until the total count is known
all_launches = []           # launch records accumulated across pages

print("Calculating required API calls...")

Calculating required API calls...


In [7]:
# Ask the API how many launches fall inside the window so the number of
# paginated requests can be estimated before the slow collection loop runs.
# Only the top-level `count` field is read, so a single record in the
# lightweight list mode is requested instead of the detailed payload.
initial_url = "https://ll.thespacedevs.com/2.3.0/launches/previous/"
initial_params = {
    'net__gte': start_time.isoformat(),
    'net__lte': end_time.isoformat(),
    'mode': 'list',
    'limit': 1,  # Just get count
    'ordering': 'net',
}

# requests builds and encodes the query string from `params`; the timeout
# keeps a stalled connection from hanging the notebook indefinitely.
initial_response = requests.get(initial_url, params=initial_params, timeout=60)
if initial_response.status_code == 200:
    initial_data = initial_response.json()
    total_count = initial_data['count']
    total_expected_calls = math.ceil(total_count / MAX_RESULTS_PER_CALL)
    print(f"Total launches in 5-year period: {total_count}")
    print(f"Expected API calls needed: {total_expected_calls}")

    if total_expected_calls > MAX_CALLS_PER_HOUR:
        print(f"WARNING: This will require {total_expected_calls} calls, exceeding the {MAX_CALLS_PER_HOUR} calls/hour limit")
        print("Consider breaking this into multiple sessions or using a wider date range per call")
else:
    print("Could not get total count, proceeding with pagination")
    # Sentinel string: the summary cell checks isinstance(total_count, int)
    # before computing coverage.
    total_count = "Unknown"

Total launches in 5-year period: 984
Expected API calls needed: 10


In [8]:
# Reset the pagination bookkeeping ahead of the collection loop.
api_call_times = []  # duration of each request, in seconds
call_count = 0       # number of requests issued so far
offset = 0           # index of the first record of the next page

print("\nStarting data collection...")



Starting data collection...


In [9]:
# Collect every launch in the window, one page per request.
#
# All loop state is (re)initialised here so this cell can be re-run on its
# own without appending duplicate pages to results from a previous run.
all_launches = []
api_call_times = []
call_count = 0
offset = 0

LAUNCHES_URL = "https://ll.thespacedevs.com/2.3.0/launches/previous/"

while True:
    call_count += 1

    print(f"\n--- API Call #{call_count} ---")

    # Query for the current page; requests builds and encodes the query string.
    query_params = {
        'net__gte': start_time.isoformat(),
        'net__lte': end_time.isoformat(),
        'mode': 'detailed',
        'limit': MAX_RESULTS_PER_CALL,
        'ordering': 'net',
        'offset': offset,
    }

    print(f"Offset: {offset}")
    print(f"URL: {LAUNCHES_URL}...")  # Shortened URL for display

    # Time only the request itself. The timeout keeps a stalled connection
    # from hanging the notebook indefinitely.
    call_start_time = time.time()
    response = requests.get(LAUNCHES_URL, params=query_params, timeout=60)
    api_call_end_time = time.time()
    api_call_duration = api_call_end_time - call_start_time
    api_call_times.append(api_call_duration)

    print(f"Status: {response.status_code}")
    print(f"API call time: {api_call_duration:.2f} seconds")

    # Any non-200 answer ends the collection; pages gathered so far are kept.
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        break

    # Process the data
    raw_data = response.json()
    launches_batch = raw_data['results']
    all_launches.extend(launches_batch)

    print(f"Launches in this batch: {len(launches_batch)}")
    print(f"Total launches collected: {len(all_launches)}")

    # The API signals the last page with an empty/missing `next` link.
    if not raw_data.get('next'):
        print("No more results available.")
        break

    # Hard cap on the number of requests made by this loop.
    if call_count >= MAX_CALLS_PER_HOUR:
        print(f"\n⚠️  Rate limit reached: {MAX_CALLS_PER_HOUR} calls per hour")
        print("Stopping collection to avoid rate limiting")
        break

    # Always leave a full interval between requests. Pacing deliberately does
    # not depend on total_start_time: that timestamp is set in an earlier
    # cell, so idle time before this cell runs would look like unused budget
    # and let requests fire about a second apart.
    print(f"Waiting {CALL_INTERVAL:.1f} seconds to respect rate limits...")
    time.sleep(CALL_INTERVAL)

    # Advance to the next page.
    offset += MAX_RESULTS_PER_CALL



--- API Call #1 ---
Offset: 0
URL: https://ll.thespacedevs.com/2.3.0/launches/previous/...
Status: 200
API call time: 8.85 seconds
Launches in this batch: 100
Total launches collected: 100
Waiting 240.0 seconds to respect rate limits...

--- API Call #2 ---
Offset: 100
URL: https://ll.thespacedevs.com/2.3.0/launches/previous/...
Status: 200
API call time: 9.21 seconds
Launches in this batch: 100
Total launches collected: 200
Waiting 240.0 seconds to respect rate limits...

--- API Call #3 ---
Offset: 200
URL: https://ll.thespacedevs.com/2.3.0/launches/previous/...
Status: 200
API call time: 7.96 seconds
Launches in this batch: 100
Total launches collected: 300
Waiting 240.0 seconds to respect rate limits...

--- API Call #4 ---
Offset: 300
URL: https://ll.thespacedevs.com/2.3.0/launches/previous/...
Status: 200
API call time: 11.32 seconds
Launches in this batch: 100
Total launches collected: 400
Waiting 240.0 seconds to respect rate limits...

--- API Call #5 ---
Offset: 400
URL: htt

In [10]:
# Calculate final statistics.
# total_duration is measured from total_start_time, which is set in the setup
# cell, so it also covers the count probe and any idle time between cells.
total_end_time = time.time()
total_duration = total_end_time - total_start_time

# Calculate average API call time
avg_api_call_time = sum(api_call_times) / len(api_call_times) if api_call_times else 0

# Pre-format the values that may be unavailable so every line keeps its label
# (a bare "N/A" line is unreadable) and no division by zero can occur.
if isinstance(total_count, int) and total_count > 0:
    coverage_text = f"{len(all_launches)/total_count*100:.1f}%"
else:
    coverage_text = "N/A"

if api_call_times:
    fastest_text = f"{min(api_call_times):.2f} seconds"
    slowest_text = f"{max(api_call_times):.2f} seconds"
else:
    fastest_text = slowest_text = "N/A"

if total_duration > 0:
    rate_text = f"{len(all_launches)/total_duration*3600:.1f} launches/hour"
else:
    rate_text = "N/A"

# Print comprehensive summary
print("\n" + "="*60)
print("DATA COLLECTION COMPLETE - SUMMARY")
print("="*60)
print(f"Time range: {start_time.date()} to {end_time.date()}")
print(f"Total API calls made: {call_count}")
print(f"Total launches collected: {len(all_launches)}")
print(f"Expected total launches: {total_count}")
print(f"Collection coverage: {coverage_text}")

print("\nTIMING METRICS:")
print(f"Total collection time: {total_duration:.2f} seconds ({total_duration/60:.1f} minutes)")
print(f"Average API call time: {avg_api_call_time:.2f} seconds")
print(f"Fastest API call: {fastest_text}")
print(f"Slowest API call: {slowest_text}")

print("\nRATE LIMITING:")
print(f"API calls per hour limit: {MAX_CALLS_PER_HOUR}")
print(f"Maximum launches per call: {MAX_RESULTS_PER_CALL}")
print(f"Theoretical maximum per hour: {MAX_CALLS_PER_HOUR * MAX_RESULTS_PER_CALL} launches")
print(f"Actual collection rate: {rate_text}")

print("\nEFFICIENCY:")
if total_duration > 0:
    print(f"Launches per second: {len(all_launches)/total_duration:.2f}")
    print(f"API calls per minute: {call_count/(total_duration/60):.2f}")

print("="*60)

# Display sample data
if all_launches:
    print("\nSAMPLE DATA (first 3 launches):")
    for i, launch in enumerate(all_launches[:3]):
        # `or {}` also covers a key that is present but null, which a
        # .get(key, {}) default would not.
        status = launch.get('status') or {}
        provider = launch.get('launch_service_provider') or {}
        print(f"\nLaunch {i+1}:")
        print(f"  Name: {launch.get('name', 'N/A')}")
        print(f"  Date: {launch.get('net', 'N/A')}")
        print(f"  Status: {status.get('name', 'N/A')}")
        print(f"  Provider: {provider.get('name', 'N/A')}")
else:
    print("No launches collected.")


DATA COLLECTION COMPLETE - SUMMARY
Time range: 2020-01-01 to 2024-12-31
Total API calls made: 10
Total launches collected: 984
Expected total launches: 984
Collection coverage: 100.0%

TIMING METRICS:
Total collection time: 2346.38 seconds (39.1 minutes)
Average API call time: 10.66 seconds
Fastest API call: 7.96 seconds
Slowest API call: 21.37 seconds

RATE LIMITING:
API calls per hour limit: 15
Maximum launches per call: 100
Theoretical maximum per hour: 1500 launches
Actual collection rate: 1509.7 launches/hour

EFFICIENCY:
Launches per second: 0.42
API calls per minute: 0.26

SAMPLE DATA (first 3 launches):

Launch 1:
  Name: Falcon 9 Block 5 | Starlink 2
  Date: 2020-01-07T02:19:21Z
  Status: Launch Successful
  Provider: SpaceX

Launch 2:
  Name: Long March 3B | TJS-5 (TJSW-5)
  Date: 2020-01-07T15:20:00Z
  Status: Launch Successful
  Provider: China Aerospace Science and Technology Corporation

Launch 3:
  Name: Long March 2D | Jilin-1 Wideband 01, Tianqi-4, NuSat-7 & 8
  Date:

In [12]:
 #Save to file
import json

# Offer to persist the collected launches as pretty-printed JSON in the
# current working directory. The answer is matched ignoring case and
# surrounding whitespace, and "yes" is accepted as well as "y".
save_option = input("\nWould you like to save the data to a file? (y/n): ")
if save_option.strip().lower() in ('y', 'yes'):
    filename = f"launch_data_{start_time.date()}_to_{end_time.date()}.json"
    # Explicit encoding so the file does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(all_launches, f, indent=2)
    print(f"Data saved to {filename}")


Would you like to save the data to a file? (y/n):  y


Data saved to launch_data_2020-01-01_to_2024-12-31.json
