In [1]:
import pandas as pd
import numpy as np

# Load the great dataset you just created
df = pd.read_csv('launches_clean.csv')

# --- FINAL POLISHING and TYPE CONVERSION ---

# Your 'DateTime' column might be empty when loading from CSV. Let's fix that.
# First, re-apply the datetime conversion. errors='coerce' is key here.
df['DateTime'] = pd.to_datetime(df['DateTime'], errors='coerce')

# Let's drop any rows that have junk in so many columns that the date failed to parse
df.dropna(subset=['DateTime'], inplace=True)

# Now, let's be EXTRA aggressive about removing the last few remark rows.
# A real payload usually doesn't have the same text in the Payload, Operator, and Orbit columns.
# A remark row OFTEN DOES.
df = df[~(df['Payload'] == df['Operator'])]


# --- CREATE THE 'SUCCESS' TARGET VARIABLE ---
# Define success as an Outcome containing 'success' or 'operational'.
success_criteria = df['Outcome'].str.lower().str.contains('success|operational', na=False)
df['Success'] = success_criteria.astype(int)

# Clean up the outcome text itself to make it cleaner
df['Outcome'] = df['Outcome'].str.split(';').str[0].str.strip()


# --- CHECK OUR FINAL PRODUCT ---
df = df.reset_index(drop=True)

print("--- FINAL MASTER DATASET READY FOR MODELING ---")
print(df.info())
print("\n")
print(df[['Rocket', 'Payload', 'Outcome', 'Success']].head(15))

print("\n--- DISTRIBUTION OF OUTCOMES ---")
print(df['Success'].value_counts())

--- FINAL MASTER DATASET READY FOR MODELING ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   DateTime  1 non-null      datetime64[ns]
 1   Rocket    1 non-null      object        
 2   Payload   1 non-null      object        
 3   Operator  1 non-null      object        
 4   Orbit     1 non-null      object        
 5   Function  1 non-null      object        
 6   Decay     1 non-null      object        
 7   Outcome   1 non-null      object        
 8   Year      1 non-null      int64         
 9   Success   1 non-null      int64         
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 208.0+ bytes
None


           Rocket        Payload      Outcome  Success
0  Proton-K/DM-2M  Ekspress AM-2  Operational        1

--- DISTRIBUTION OF OUTCOMES ---
Success
1    1
Name: count, dtype: int64


In [3]:
df.head()

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,Success
0,2000-03-29 22:31:00,Proton-K/DM-2M,Ekspress AM-2,RSCC,Geosynchronous,Communications,In orbit,Operational,2005,1


In [4]:
df.head(10)

Unnamed: 0,DateTime,Rocket,Payload,Operator,Orbit,Function,Decay,Outcome,Year,Success
0,2000-03-29 22:31:00,Proton-K/DM-2M,Ekspress AM-2,RSCC,Geosynchronous,Communications,In orbit,Operational,2005,1


In [5]:
print("The shape of my DataFrame is:", df.shape)

NameError: name 'df_final' is not defined