In [44]:
import pandas as pd
import numpy as np

# Load the CSV, telling pandas to treat the first 3 rows as headers
# and to recognize 'NaN' and empty cells as actual missing values.
df_raw = pd.read_csv('launches_raw.csv', header=[0, 1, 2], na_values=['', ' '])

# Display the initial state
df_raw.head(10)

Unnamed: 0_level_0,Date and time (UTC),Rocket,Rocket,Flight number,Launch site,Launch site,LSP,LSP,Year
Unnamed: 0_level_1,Date and time (UTC),Unnamed: 1_level_1,Payload (⚀ = CubeSat),Operator,Orbit,Function,Decay (UTC),Outcome,Unnamed: 8_level_1
Unnamed: 0_level_2,Date and time (UTC),Unnamed: 1_level_2,Remarks,Remarks,Remarks,Remarks,Remarks,Remarks,Unnamed: 8_level_2
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],,,,,,,,1990
3,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
4,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
5,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
7,9 January 12:35 (UTC)[5],,,,,,,,1990
8,9 January 12:35 (UTC)[5],,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
9,9 January 12:35 (UTC)[5],,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990


In [45]:
# Create a working copy
df_working = df_raw.copy()

# Identify the columns to check. These are the ones that should contain data, not just context.
columns_to_check = df_working.columns[2:8] # Check Payload, Operator, Orbit, etc.

# Drop the row ONLY if all of the selected columns are NaN
df_working = df_working.dropna(subset=columns_to_check, how='all')

print("Shape after dropping all-NaN rows:", df_working.shape)
df_working.head(10)

Shape after dropping all-NaN rows: (7527, 9)


Unnamed: 0_level_0,Date and time (UTC),Rocket,Rocket,Flight number,Launch site,Launch site,LSP,LSP,Year
Unnamed: 0_level_1,Date and time (UTC),Unnamed: 1_level_1,Payload (⚀ = CubeSat),Operator,Orbit,Function,Decay (UTC),Outcome,Unnamed: 8_level_1
Unnamed: 0_level_2,Date and time (UTC),Unnamed: 1_level_2,Remarks,Remarks,Remarks,Remarks,Remarks,Remarks,Unnamed: 8_level_2
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
3,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
4,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
5,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
8,9 January 12:35 (UTC)[5],,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
9,9 January 12:35 (UTC)[5],,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
10,9 January 12:35 (UTC)[5],,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
11,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [46]:
df_working = df_working.reset_index(drop=True)
print("Index reset.")
df_working.head(10)

Index reset.


Unnamed: 0_level_0,Date and time (UTC),Rocket,Rocket,Flight number,Launch site,Launch site,LSP,LSP,Year
Unnamed: 0_level_1,Date and time (UTC),Unnamed: 1_level_1,Payload (⚀ = CubeSat),Operator,Orbit,Function,Decay (UTC),Outcome,Unnamed: 8_level_1
Unnamed: 0_level_2,Date and time (UTC),Unnamed: 1_level_2,Remarks,Remarks,Remarks,Remarks,Remarks,Remarks,Unnamed: 8_level_2
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [47]:
# Step 1: Extract just the first row of the MultiIndex
new_columns = [col[0] if isinstance(col, tuple) else col for col in df_working.columns]

# Step 2: Apply those as the dataframe columns
df_working.columns = new_columns

# Step 3: Define your renaming map
final_columns_map = {
    'Date and time (UTC)': 'DateTime',
    'Rocket': 'Rocket',
    'Payload (☐ = CubeSat)': 'Payload',
    'Operator': 'Operator',
    'Orbit': 'Orbit',
    'Function': 'Function',
    'Decay (UTC)': 'Decay',
    'Outcome': 'Outcome',
    'Year': 'Year'
}

# Step 4: Rename the columns using the map
df_working.rename(columns=final_columns_map, inplace=True)

print("\n--- First 5 rows with updated columns ---")
df_working.head()



--- First 5 rows with updated columns ---


Unnamed: 0,DateTime,Rocket,Rocket.1,Flight number,Launch site,Launch site.1,LSP,LSP.1,Year
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990


In [9]:
df_working.head(30)

Unnamed: 0,a,o,a.1,p,r,u,e,u.1,n
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [48]:
new_cols = ['DateTime','Rocket','Payload','Operator','Orbit','Function','Decay','Outcome','Year']

# if lengths match this will just set them
if df_working.shape[1] == len(new_cols):
    df_working.columns = new_cols
else:
    raise ValueError(f"Column count mismatch: df has {df.shape[1]} cols but you provided {len(new_cols)} names")


In [15]:
df_working.head(30)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],,Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],,JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [49]:
# Create a copy to work on for this step
df_filled = df_working.copy()

# Carry the DateTime and Rocket name down into the detail rows
df_filled[['DateTime', 'Rocket']] = df_filled[['DateTime', 'Rocket']].fillna(method='ffill')

print("Forward fill complete.")
df_filled.head(20)

Forward fill complete.


  df_filled[['DateTime', 'Rocket']] = df_filled[['DateTime', 'Rocket']].fillna(method='ffill')


Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [23]:
df_filled.to_csv('launches_semi-clean.csv', index = False)

In [50]:
df_filled.head(50)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
0,January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],January[edit],1990
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990


In [51]:
df_filled = df_filled.drop(index = 0)

In [52]:
df_filled.head(50)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,Low Earth,Reconnaissance,29 January[8],Successful,1990


In [31]:
df_filled.head(7526)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
...,...,...,...,...,...,...,...,...,...
7522,27 December 15:44[353],Long March 4C,Yaogan 33(R),CAS,Low Earth (SSO),Reconnaissance,In orbit,Operational,2020
7523,27 December 15:44[353],Long March 4C,⚀ Weina-2,SECM,Low Earth (SSO),Technology demonstration,In orbit,Operational,2020
7524,27 December 15:44[353],Long March 4C,"Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...",2020
7525,29 December 16:42:07[354][355],Soyuz ST-A / Fregat-M,Soyuz ST-A / Fregat-M,VS25,Kourou ELS,Kourou ELS,Arianespace,Arianespace,2020


In [53]:
mask = df_filled.apply(lambda row: row.astype(str).str.contains(r"\[edit\]").any(), axis=1)
df_clean = df_filled[~mask].copy()

print("Rows before:", len(df_filled))
print("Rows after:", len(df_clean))
df_clean.head(10)

Rows before: 7526
Rows after: 7191


Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
1,1 January 00:07 (UTC)[1],Commercial Titan III[1],Commercial Titan III[1],,Cape Canaveral[1] LC-40,Cape Canaveral[1] LC-40,Martin Marietta[1],Martin Marietta[1],1990
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
5,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Space Shuttle Columbia,,Kennedy LC-39A,Kennedy LC-39A,,,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
9,17 January 14:45 (UTC)[8],Soyuz-U,Soyuz-U,,Plesetsk,Plesetsk,,,1990
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,Low Earth,Reconnaissance,29 January[8],Successful,1990


In [54]:
mask = df_filled["Rocket"] == df_filled["Payload"]
df_filled = df_filled[~mask].copy()

print("Rows removed where Rocket = Payload:", mask.sum())
print("Rows remaining:", len(df_filled))
df_filled.head(10)

Rows removed where Rocket = Payload: 2594
Rows remaining: 4932


Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,Low Earth,Reconnaissance,29 January[8],Successful,1990
12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,Low Earth,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990
14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990
15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990


In [55]:
# Step 3 (generalized): Extract flight number info

# Regex pattern for common flight-number phrases
pattern = r"(Maiden flight|First flight|Second flight|Third flight|Final flight)"

# Create new column with extracted flight info
df_filled["FlightNumberInfo"] = df_filled["Payload"].str.extract(pattern, expand=False)

# Count how many rows got tagged
print(f"[Step 3] Rows with flight number info: {df_filled['FlightNumberInfo'].notna().sum()}")

# Clean Payload column by removing those phrases
df_filled["Payload"] = df_filled["Payload"].str.replace(pattern + ".*", "", regex=True).str.strip()

df_filled.head(10)


[Step 3] Rows with flight number info: 188


Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990,Maiden flight
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,Low Earth,Reconnaissance,29 January[8],Successful,1990,
12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,Low Earth,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990,
14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990,
15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990,


In [56]:
df_filled.head(300)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990,Maiden flight
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,
...,...,...,...,...,...,...,...,...,...,...
487,6 August 19:30:59,Molniya-M/ML,Molniya-1 #84,MOM,Molniya,Communications,4 April 2008,Successful,1992,
489,9 August 08:00,Long March 2D,FSW-2 #1,CASC,Low Earth,Reconnaissance,1 September,Successful,1992,
491,10 August 23:08:07,Ariane 4 42P,TOPEX/Poseidon,CNES/NASA,Low Earth,Oceanography,In orbit,Successful,1992,
492,10 August 23:08:07,Ariane 4 42P,S80/T,CNES,Low Earth,Technology,In orbit,Successful,1992,


In [57]:
# Step 3b: Normalize FlightNumberInfo values

# Define mapping
flight_map = {
    "Maiden flight": "First flight",
    "First flight": "First flight",  # unify
    "Second flight": "Second flight",
    "Third flight": "Third flight",
    "Final flight": "Final flight"
}

df_filled["FlightNumberInfo"] = df_filled["FlightNumberInfo"].replace(flight_map)

# Show unique values for verification
print("[Step 3b] Unique values in FlightNumberInfo:", df_filled["FlightNumberInfo"].dropna().unique())

df_filled.head(10)


[Step 3b] Unique values in FlightNumberInfo: ['First flight' 'Final flight' 'Second flight']


Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990,First flight
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,Low Earth,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,Geosynchronous,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,Low Earth,Reconnaissance,29 January[8],Successful,1990,
12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,Low Earth,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990,
14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990,
15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990,


In [58]:
# Step 4: Clean Orbit column

# Known orbit categories (expandable list)
orbit_map = {
    "Low Earth": "LEO",
    "Geosynchronous": "GEO",
    "Geostationary": "GEO",
    "Medium Earth": "MEO",
    "Molniya": "Molniya",
    "Polar": "Polar",
    "Sun-Synchronous": "SSO",
    "Heliocentric": "Heliocentric",
    "Geostationary Transfer": "GTO"
}

# Detect launch site keywords
launch_sites = ["Cape Canaveral", "Kennedy", "Baikonur", "Plesetsk", "Tanegashima", "Kourou", "Jiuquan", "Vostochny"]

# Create new LaunchSite column
df_filled["LaunchSite"] = df_filled["Orbit"].where(
    df_filled["Orbit"].str.contains("|".join(launch_sites), case=False, na=False)
)

# Standardize Orbit column
df_filled["Orbit"] = df_filled["Orbit"].replace(orbit_map, regex=True)

# If Orbit contains a launch site, set Orbit to NaN
df_filled.loc[df_filled["Orbit"].isin(df_filled["LaunchSite"]), "Orbit"] = np.nan

print("[Step 4] Unique values in Orbit after cleanup:", df_filled["Orbit"].dropna().unique()[:20])
print("[Step 4] Unique values in LaunchSite after cleanup:", df_filled["LaunchSite"].dropna().unique()[:10])

df_filled.head(10)


[Step 4] Unique values in Orbit after cleanup: ['GEO' 'Maiden flight of Commercial Titan III' 'LEO'
 'Crewed orbital flight with 5 astronauts Long Duration Exposure Facility retrieval mission Leasat F5 retired 24 September 2015.[4]'
 'Sun-synchronous'
 'Maiden flight of Ariane 4 (40) UOSAT 4 ceased transmitting after 5 hours'
 'Molniya' 'Selenocentric'
 'First Japanese lunar mission Hagoromo suffered a transmitter malfunction prior to selenocentric orbit injection'
 'MEO' 'LEO (Mir)' 'Crewed orbital flight with 2 cosmonauts'
 'Maiden flight of Delta II 6920' 'Intended: GEO transfer'
 'Blocked water line caused rocket explosion'
 'Crewed orbital flight with 5 astronauts'
 'Due to non-separation of second stage Intelsat 603 was released from its perigee motor into a LEO orbit. Space Shuttle Endeavour on mission STS-49 attached a new perigee motor which boosted the satellite to geosynchronous orbit.'
 'Signal anomaly on 21 May 1996 made the satellite unusable in the GPS constellation.'
 '

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo,LaunchSite
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,,
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,,
4,1 January 00:07 (UTC)[1],Commercial Titan III[1],,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,Maiden flight of Commercial Titan III,1990,First flight,
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,,
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,,
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,,
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,1990,,
12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990,,
14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990,,
15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990,,


In [60]:
df_filled.to_csv('analyse.csv', index = False)

In [62]:
df_filled = df_filled[~df_filled['Orbit'].str.contains('maiden', case=False, na=False)]

In [63]:
df_filled.head(50)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo,LaunchSite
2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,,
3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,,
6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,,
7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,,
8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,,
10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,1990,,
12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990,,
14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990,,
15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990,,
16,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 4,University of Surrey,Sun-synchronous,Communications,In orbit,Spacecraft failure 30 hours after launch[16],1990,,


In [65]:
df_filled = df_filled.reset_index()

In [66]:
df_filled.head(8000)

Unnamed: 0,index,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo,LaunchSite
0,2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,,
1,3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,,
2,6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,,
3,7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,,
4,8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,,
...,...,...,...,...,...,...,...,...,...,...,...,...
4804,7520,22 December 04:37:37[347][348][349],Long March 8,,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,2020,First flight,
4805,7522,27 December 15:44[353],Long March 4C,Yaogan 33(R),CAS,LEO (SSO),Reconnaissance,In orbit,Operational,2020,,
4806,7523,27 December 15:44[353],Long March 4C,⚀ Weina-2,SECM,LEO (SSO),Technology demonstration,In orbit,Operational,2020,,
4807,7524,27 December 15:44[353],Long March 4C,"Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...",2020,,


In [68]:
df_filled['DateTime'] = df_filled['DateTime'].astype(str)
df_filled['Year'] = df_filled['Year'].astype(str)
df_filled['DateTimeYear'] = df_filled['DateTime'] + '-' + df_filled['Year']

In [69]:
df_filled.head(50)


Unnamed: 0,index,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,FlightNumberInfo,LaunchSite,DateTimeYear
0,2,1 January 00:07 (UTC)[1],Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,1990,,,1 January 00:07 (UTC)[1]-1990
1,3,1 January 00:07 (UTC)[1],Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],1990,,,1 January 00:07 (UTC)[1]-1990
2,6,9 January 12:35 (UTC)[5],Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,1990,,,9 January 12:35 (UTC)[5]-1990
3,7,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,1990,,,9 January 12:35 (UTC)[5]-1990
4,8,9 January 12:35 (UTC)[5],Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,1990,,,9 January 12:35 (UTC)[5]-1990
5,10,17 January 14:45 (UTC)[8],Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,1990,,,17 January 14:45 (UTC)[8]-1990
6,12,18 January 12:52 (UTC)[9],Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,1990,,,18 January 12:52 (UTC)[9]-1990
7,14,22 January 01:35 (UTC)[11],Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",1990,,,22 January 01:35 (UTC)[11]-1990
8,15,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,1990,,,22 January 01:35 (UTC)[11]-1990
9,16,22 January 01:35 (UTC)[11],Ariane 4 (40),UOSAT 4,University of Surrey,Sun-synchronous,Communications,In orbit,Spacecraft failure 30 hours after launch[16],1990,,,22 January 01:35 (UTC)[11]-1990


In [71]:
df_filled = df_filled.drop(columns=['DateTime', 'Year'])


In [74]:
columns = list(df_filled.columns)
columns.remove('DateTimeYear')
columns.insert(1, 'DateTimeYear')
df_filled = df_filled[columns]
df_filled.head(50)

Unnamed: 0,index,DateTimeYear,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,FlightNumberInfo,LaunchSite
0,2,1 January 00:07 (UTC)[1]-1990,Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,,
1,3,1 January 00:07 (UTC)[1]-1990,Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],,
2,6,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,,
3,7,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,,
4,8,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,,
5,10,17 January 14:45 (UTC)[8]-1990,Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,,
6,12,18 January 12:52 (UTC)[9]-1990,Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,,
7,14,22 January 01:35 (UTC)[11]-1990,Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,"Successful; retired July, 2009.[12]",,
8,15,22 January 01:35 (UTC)[11]-1990,Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,,
9,16,22 January 01:35 (UTC)[11]-1990,Ariane 4 (40),UOSAT 4,University of Surrey,Sun-synchronous,Communications,In orbit,Spacecraft failure 30 hours after launch[16],,


In [85]:
def combine_unique(series):
    # Drop NaN values and convert to a list of unique strings
    unique_values = series.dropna().unique().astype(str)
    
    # If there are values, join them with a comma and space.
    # Otherwise, return NaN.
    if len(unique_values) > 0:
        return ', '.join(unique_values)
    return np.nan

agg_columns = df_filled.columns.drop('DateTimeYear')

# Create a dictionary where each column is mapped to your function
agg_dict = {col: combine_unique for col in agg_columns}

# Now, use the agg() method with the dictionary
merged_df = df_filled.groupby('DateTimeYear', sort=False).agg(agg_dict).reset_index()



In [86]:
merged_df.head(6000)

Unnamed: 0,DateTimeYear,index,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,FlightNumberInfo,LaunchSite
0,1 January 00:07 (UTC)[1]-1990,"2, 3",Commercial Titan III[1],"Skynet 4A[1], JCSAT 2[1]","MoD, JSAT",GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,,
1,9 January 12:35 (UTC)[5]-1990,"6, 7, 8",Space Shuttle Columbia,"STS-32, Leasat F5 also known as Syncom IV-5.[7...","NASA, Crewed orbital flight with 5 astronauts ...","LEO, GEO, Crewed orbital flight with 5 astrona...","Satellite deployment and retrieval, Communicat...","20 January 09:35 (UTC)[6], In orbit, Crewed or...","Successful, Successful; placed in graveyard or...",,
2,17 January 14:45 (UTC)[8]-1990,10,Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,,
3,18 January 12:52 (UTC)[9]-1990,12,Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,,
4,22 January 01:35 (UTC)[11]-1990,"14, 15, 16, 17, 18, 19, 20",Ariane 4 (40),"SPOT 2, UOSAT 3 also known as UoSAT-OSCAR 14[1...","CNES, University of Surrey, AMSAT",Sun-synchronous,"Earth observation, Communications",In orbit,"Successful; retired July, 2009.[12], Successfu...",,
...,...,...,...,...,...,...,...,...,...,...,...
2252,18 December 12:26:26[343]-2020,"7508, 7509",Soyuz-2.1b / Fregat,"OneWeb × 36, Vostochny flight 1. Third large b...","OneWeb, Vostochny flight 1. Third large batch ...",LEO,"Communications, Vostochny flight 1. Third larg...","In orbit, Vostochny flight 1. Third large batc...","Operational, Vostochny flight 1. Third large b...",,Vostochny flight 1. Third large batch of satel...
2253,19 December 14:00[346]-2020,"7511, 7512, 7513",Falcon 9 Block 5,"USA-312, USA-313, NROL-108 mission, satellites...","NRO, NROL-108 mission, satellites being possib...","LEO, NROL-108 mission, satellites being possib...","Reconnaissance, NROL-108 mission, satellites b...","In orbit, NROL-108 mission, satellites being p...","Operational, NROL-108 mission, satellites bein...",,
2254,22 December 04:37:37[347][348][349]-2020,"7515, 7516, 7517, 7518, 7519, 7520",Long March 8,"Xinjishu Yanzheng-7 (XJY-7), Hisea-1[351], Tia...","CAST, Spacety, Guodian Gaoke, Spacety / HBUT, ...","LEO (SSO), First flight of Long March 8.","Technology demonstration, Earth observation, I...","In orbit, First flight of Long March 8.","Operational, First flight of Long March 8.",First flight,
2255,27 December 15:44[353]-2020,"7522, 7523, 7524",Long March 4C,"Yaogan 33(R), ⚀ Weina-2, Replacement for Yaoga...","CAS, SECM, Replacement for Yaogan 33, which wa...","LEO (SSO), Replacement for Yaogan 33, which wa...","Reconnaissance, Technology demonstration, Repl...","In orbit, Replacement for Yaogan 33, which was...","Operational, Replacement for Yaogan 33, which ...",,


In [84]:
df_filled.head(6000)

Unnamed: 0,index,DateTimeYear,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,FlightNumberInfo,LaunchSite
0,2,1 January 00:07 (UTC)[1]-1990,Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,,
1,3,1 January 00:07 (UTC)[1]-1990,Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],,
2,6,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,,
3,7,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],NASA,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,,
4,8,9 January 12:35 (UTC)[5]-1990,Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,,
...,...,...,...,...,...,...,...,...,...,...,...
4804,7520,22 December 04:37:37[347][348][349]-2020,Long March 8,,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,First flight of Long March 8.,First flight,
4805,7522,27 December 15:44[353]-2020,Long March 4C,Yaogan 33(R),CAS,LEO (SSO),Reconnaissance,In orbit,Operational,,
4806,7523,27 December 15:44[353]-2020,Long March 4C,⚀ Weina-2,SECM,LEO (SSO),Technology demonstration,In orbit,Operational,,
4807,7524,27 December 15:44[353]-2020,Long March 4C,"Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...","Replacement for Yaogan 33, which was lost in a...",,


In [87]:
merged_df.to_csv('merged.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming your DataFrame is already loaded and named 'df'
# If not, you would load your data here, for example:
# df = pd.read_csv('your_data_file.csv')

# --- Step 1: Initialize the LabelEncoder ---
# This creates the tool that will do the encoding for us.
le = LabelEncoder()

# --- Step 2: Fit and transform the 'Rocket' column ---
# We use .fit_transform() which is a convenient method that both learns the mapping
# and applies it to the 'Rocket' column in one step.
# The result is a new numerical column.
df['Rocket_ID'] = le.fit_transform(df['Rocket'])

# --- Step 3: Print a sample to check the result ---
# Let's look at the original 'Rocket' column alongside the new 'Rocket_ID' column
# to make sure the conversion worked as expected.
print("Original and new columns side-by-side:")
print(df[['Rocket', 'Rocket_ID']].head())

# --- Step 4: Access the key/mapping ---
# As we discussed, the encoder object 'le' now holds the mapping.
# We can access it using the .classes_ attribute.
print("\nHere is the key (the mapping of IDs to rocket names):")
for i, name in enumerate(le.classes_):
    print(f"ID: {i} -> Rocket: {name}")

# And that's it! Your DataFrame 'df' now has a new column called 'Rocket_ID'
# with the numerical representations.
print("\nEncoding complete! Your DataFrame now has a 'Rocket_ID' column.")