In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('merged.csv')

success_condition = df['Outcome'].str.contains('Successful|Operational', na=False)
df['Success'] = success_condition.astype(int)

failure_condition = df['Outcome'].str.contains('Failure', na=False)
df.loc[failure_condition, 'Success'] = 0 

print(df[['Outcome', 'Success']].head(20))
print("\nValue Counts for the 'Success' column:")
print(df['Success'].value_counts())


                                              Outcome  Success
0   Successful; placed in graveyard orbit 20 June ...        1
1   Successful, Successful; placed in graveyard or...        1
2                                          Successful        1
3   Successful; replaced by Kosmos 2208 12 August ...        1
4   Successful; retired July, 2009.[12], Successfu...        1
5                                          Successful        1
6   Successful, Spacecraft failure, First Japanese...        1
7                                          Successful        1
8                                          Successful        1
9                                         Operational        1
10         Successful; retired in the early 1990s[20]        1
11                                         Successful        1
12                            Operational, Successful        1
13  Successful, Crewed orbital flight with 2 cosmo...        1
14                                         Successful  

In [2]:
# --- Clean Up Citation Brackets from Text Columns ---

# This regular expression `\[.*?]` means:
# \[  - Match a literal opening square bracket.
# .*? - Match any character (.), any number of times (*), in a non-greedy way (?).
# \]  - Match a literal closing square bracket.
# Effectively, it finds anything that looks like [a bunch of stuff].
pattern = r"\[.*?\]"

# Select all the columns that have text data (Dtype 'object')
text_columns = df.select_dtypes(include=['object']).columns

# Loop through each text column and apply the cleaning
for col in text_columns:
    df[col] = df[col].str.replace(pattern, '', regex=True)

    
# --- Let's check our work on a few columns ---
print("--- Cleaned 'Rocket' and 'Payload' Columns ---")
print(df[['Rocket', 'Payload']].head(10))

print("\n--- First 5 rows of fully cleaned (for now) data ---")
print(df.head())

--- Cleaned 'Rocket' and 'Payload' Columns ---
                   Rocket                                            Payload
0    Commercial Titan III                                 Skynet 4A, JCSAT 2
1  Space Shuttle Columbia  STS-32, Leasat F5 also known as Syncom IV-5., ...
2                 Soyuz-U                              Kosmos 2055 (Zenit-8)
3               Kosmos-3M                            Kosmos 2056 (Strela-2M)
4           Ariane 4 (40)  SPOT 2, UOSAT 3 also known as UoSAT-OSCAR 14, ...
5            Molniya-M/ML                                      Molniya 3-53L
6                Mu-3S-II  Hiten, Hagoromo, First Japanese lunar mission ...
7           Delta II 6925                                  GPS II-6 (USA-50)
8                 Soyuz-U                           Kosmos 2057 (Yantar-4K2)
9               Tsyklon-3                            Kosmos 2058 (Tselina-R)

--- First 5 rows of fully cleaned (for now) data ---
                  DateTimeYear                      

In [3]:
df.head(20)

Unnamed: 0,DateTimeYear,index,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,FlightNumberInfo,LaunchSite,Success
0,1 January 00:07 (UTC)-1990,"2, 3",Commercial Titan III,"Skynet 4A, JCSAT 2","MoD, JSAT",GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,,,1
1,9 January 12:35 (UTC)-1990,"6, 7, 8",Space Shuttle Columbia,"STS-32, Leasat F5 also known as Syncom IV-5., ...","NASA, Crewed orbital flight with 5 astronauts ...","LEO, GEO, Crewed orbital flight with 5 astrona...","Satellite deployment and retrieval, Communicat...","20 January 09:35 (UTC), In orbit, Crewed orbit...","Successful, Successful; placed in graveyard or...",,,1
2,17 January 14:45 (UTC)-1990,10,Soyuz-U,Kosmos 2055 (Zenit-8),,LEO,Reconnaissance,29 January,Successful,,,1
3,18 January 12:52 (UTC)-1990,12,Kosmos-3M,Kosmos 2056 (Strela-2M),,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,,,1
4,22 January 01:35 (UTC)-1990,"14, 15, 16, 17, 18, 19, 20",Ariane 4 (40),"SPOT 2, UOSAT 3 also known as UoSAT-OSCAR 14, ...","CNES, University of Surrey, AMSAT",Sun-synchronous,"Earth observation, Communications",In orbit,"Successful; retired July, 2009., Successful; r...",,,1
5,23 January 02:52-1990,23,Molniya-M/ML,Molniya 3-53L,,Molniya,Communications,23 June 2003,Successful,,,1
6,24 January 11:46-1990,"25, 26, 27",Mu-3S-II,"Hiten, Hagoromo, First Japanese lunar mission ...","ISAS, First Japanese lunar mission Hagoromo su...","Selenocentric, First Japanese lunar mission Ha...","Lunar probe, Lunar orbiter, First Japanese lun...","11 April 1993, In orbit, First Japanese lunar ...","Successful, Spacecraft failure, First Japanese...",,,1
7,24 January 22:55:01 (UTC)-1990,29,Delta II 6925,GPS II-6 (USA-50),US Air Force,MEO,Navigation,In orbit,Successful,,,1
8,25 January 17:15 (UTC)-1990,31,Soyuz-U,Kosmos 2057 (Yantar-4K2),,LEO,Reconnaissance,19 March 1990,Successful,,,1
9,30 January 11:20 (UTC)-1990,33,Tsyklon-3,Kosmos 2058 (Tselina-R),,LEO,ELINT,In orbit,Operational,,,1


In [6]:
import pandas as pd
import numpy as np

# Load your merged dataset
df = pd.read_csv('merged.csv')

# --- Convert string representations of lists into actual Python lists ---
list_columns = ['Payload', 'Operator', 'Orbit', 'Function', 'Decay', 'Outcome']

for col in list_columns:
    # Use .astype(str) to prevent errors on empty cells
    # The .str.split(', ') turns 'Payload A, Payload B' into ['Payload A', 'Payload B']
    df[col] = df[col].astype(str).str.split(', ')

    
# --- The Robust Explode Process ---

# 1. First, create a new DataFrame by exploding ONLY the 'Payload' column.
#    This column is our primary guide for what constitutes a single "observation".
df_exploded = df.explode('Payload')

# 2. Get the cumulative count of payloads for each original launch group.
#    This tells us if a row is the 1st, 2nd, 3rd, etc., payload of a launch.
df_exploded['payload_idx'] = df_exploded.groupby(level=0).cumcount()

# 3. Now, we apply a clever function to the other columns to pick the correct element.
for col in list_columns:
    if col != 'Payload': # We've already handled Payload
        # The lambda function gets the list from the column (x) and the index (y).
        # It tries to pick the element at the right index. If the list is too short,
        # it just picks the last available element (e.g., uses the first outcome for all payloads).
        df_exploded[col] = df_exploded.apply(lambda row: row[col][min(row['payload_idx'], len(row[col])-1)], axis=1)

# We are done with the temporary 'payload_idx' column, so we can drop it.
df_exploded = df_exploded.drop(columns=['payload_idx'])


# --- Final Cleanup and Target Variable Creation ---

# Reset the index for a clean final look
df_final = df_exploded.reset_index(drop=True)

# Clean up any leading/trailing whitespace from all text columns
for col in df_final.select_dtypes(include=['object']).columns:
    df_final[col] = df_final[col].str.strip()

# Now that rows are properly structured, we create our binary 'Success' column
df_final['Success'] = 0 # Default to 0 (failure)
df_final.loc[df_final['Outcome'].str.contains('Successful|Operational', na=False), 'Success'] = 1


# --- Final Review and Save ---
print("--- Final Cleaned DataFrame (1 Payload Per Row) ---")
df_final.info()

print("\n--- First 20 rows of final data ---")
print(df_final.head(20))

# Save our beautifully structured data to a new file
df_final.to_csv('launches_cleaned.csv', index=False)
print("\n!!! DAY 2 COMPLETE: Successfully saved the final clean data to launches_cleaned.csv !!!")

--- Final Cleaned DataFrame (1 Payload Per Row) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5084 entries, 0 to 5083
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   DateTimeYear      5084 non-null   object
 1   index             5084 non-null   object
 2   Rocket            5084 non-null   object
 3   Payload           5084 non-null   object
 4   Operator          5084 non-null   object
 5   Orbit             5084 non-null   object
 6   Function          5084 non-null   object
 7   Decay             5084 non-null   object
 8   Outcome           5084 non-null   object
 9   FlightNumberInfo  201 non-null    object
 10  LaunchSite        54 non-null     object
 11  Success           5084 non-null   int64 
dtypes: int64(1), object(11)
memory usage: 476.8+ KB

--- First 20 rows of final data ---
                              DateTimeYear                       index  \
0            1 January 00:07 (UTC

In [7]:
df_final.head(20)

Unnamed: 0,DateTimeYear,index,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,FlightNumberInfo,LaunchSite,Success
0,1 January 00:07 (UTC)[1]-1990,"2, 3",Commercial Titan III[1],Skynet 4A[1],MoD,GEO,Communications,In orbit,Successful; placed in graveyard orbit 20 June ...,,,1
1,1 January 00:07 (UTC)[1]-1990,"2, 3",Commercial Titan III[1],JCSAT 2[1],JSAT,GEO,Communications,In orbit,Successful; placed in graveyard orbit 2002[3],,,1
2,9 January 12:35 (UTC)[5]-1990,"6, 7, 8",Space Shuttle Columbia,STS-32,NASA,LEO,Satellite deployment and retrieval,20 January 09:35 (UTC)[6],Successful,,,1
3,9 January 12:35 (UTC)[5]-1990,"6, 7, 8",Space Shuttle Columbia,Leasat F5 also known as Syncom IV-5.[7],Crewed orbital flight with 5 astronauts Long D...,GEO,Communications,In orbit,Successful; placed in graveyard orbit 24 Septe...,,,1
4,9 January 12:35 (UTC)[5]-1990,"6, 7, 8",Space Shuttle Columbia,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,Crewed orbital flight with 5 astronauts Long D...,,,0
5,17 January 14:45 (UTC)[8]-1990,10,Soyuz-U,Kosmos 2055 (Zenit-8)[8],,LEO,Reconnaissance,29 January[8],Successful,,,1
6,18 January 12:52 (UTC)[9]-1990,12,Kosmos-3M,Kosmos 2056 (Strela-2M)[9],,LEO,Communications,In orbit,Successful; replaced by Kosmos 2208 12 August ...,,,1
7,22 January 01:35 (UTC)[11]-1990,"14, 15, 16, 17, 18, 19, 20",Ariane 4 (40),SPOT 2,CNES,Sun-synchronous,Earth observation,In orbit,Successful; retired July,,,1
8,22 January 01:35 (UTC)[11]-1990,"14, 15, 16, 17, 18, 19, 20",Ariane 4 (40),UOSAT 3 also known as UoSAT-OSCAR 14[13],University of Surrey,Sun-synchronous,Communications,In orbit,2009.[12],,,0
9,22 January 01:35 (UTC)[11]-1990,"14, 15, 16, 17, 18, 19, 20",Ariane 4 (40),UOSAT 4,AMSAT,Sun-synchronous,Communications,In orbit,Successful; retired from active service in 199...,,,1
