In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the launch records CSV into a DataFrame
df = pd.read_csv("../data/dataset_part_1.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,2010-06-04,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0003,-80.577366,28.561857
1,2,2012-05-22,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0005,-80.577366,28.561857
2,3,2013-03-01,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B0007,-80.577366,28.561857
3,4,2013-09-29,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1.0,0,B1003,-120.610829,34.632093
4,5,2013-12-03,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1.0,0,B1004,-80.577366,28.561857


In [3]:
# ### 🔍 Missing values per column (percentage)

# Count the nulls in every column, then express each count as a share of all rows
null_counts = df.isna().sum()
missing_values_percent = null_counts.div(len(df)).mul(100)
missing_values_percent


FlightNumber       0.000000
Date               0.000000
BoosterVersion     0.000000
PayloadMass        0.000000
Orbit              0.000000
LaunchSite         0.000000
Outcome            0.000000
Flights            0.000000
GridFins           0.000000
Reused             0.000000
Legs               0.000000
LandingPad        28.888889
Block              0.000000
ReusedCount        0.000000
Serial             0.000000
Longitude          0.000000
Latitude           0.000000
dtype: float64

In [4]:
# Identify the data type of each column

# The dtype listing separates the numerical columns (int64 / float64 / bool)
# from the object columns, which hold text such as dates and category labels
df.dtypes


FlightNumber        int64
Date               object
BoosterVersion     object
PayloadMass       float64
Orbit              object
LaunchSite         object
Outcome            object
Flights             int64
GridFins             bool
Reused               bool
Legs                 bool
LandingPad         object
Block             float64
ReusedCount         int64
Serial             object
Longitude         float64
Latitude          float64
dtype: object

# Calculate the number of launches on each site

In [5]:
# ## 🚀 TASK 1: Number of Launches per Site

# Tally the launches for every distinct site, naming the index and the count
# column while the Series is converted into a two-column DataFrame
site_tally = df['LaunchSite'].value_counts()
launch_counts = site_tally.rename_axis('Launch Site').reset_index(name='Number of Launches')

# Show the per-site totals
launch_counts


Unnamed: 0,Launch Site,Number of Launches
0,CCSFS SLC 40,55
1,KSC LC 39A,22
2,VAFB SLC 4E,13


# Landing Outcome Counts and Classification

In [6]:
# Tally how often each distinct value occurs in the 'Outcome' column
landing_outcomes = df['Outcome'].value_counts()

# Show every outcome label with its position in the frequency ranking
for i, outcome in enumerate(landing_outcomes.keys()):
    print(i, outcome)

# Build the set of outcomes that represent **unsuccessful landings**.
# Only labels whose first word is "True" are successful landings, so every
# other label (False Ocean, False RTLS, False ASDS, None ASDS, None None)
# is treated as bad.
# The labels are selected by content rather than by their position in
# landing_outcomes: value_counts() orders labels by frequency, so positional
# indices would silently point at different labels if the data changed.
bad_outcomes = {
    outcome
    for outcome in landing_outcomes.keys()
    if not outcome.startswith("True")
}

# Display the set of bad outcomes
bad_outcomes


0 True ASDS
1 None None
2 True RTLS
3 False ASDS
4 True Ocean
5 False Ocean
6 None ASDS
7 False RTLS


{'False ASDS', 'False Ocean', 'False RTLS', 'None ASDS', 'None None'}

# Create a Binary Landing Outcome Label

In [7]:
# Binary label per launch: 1 when the outcome is not in bad_outcomes, otherwise 0
landing_class = [int(outcome not in bad_outcomes) for outcome in df['Outcome']]

# Store the label on the DataFrame as the 'Class' column
df['Class'] = landing_class

# Show the first 8 labels
class_preview = df[['Class']].iloc[:8]
print(class_preview)

# Show the first 5 rows of the DataFrame, now including 'Class'
frame_preview = df.iloc[:5]
print(frame_preview)

# The mean of a 0/1 column is the fraction of launches labelled 1,
# i.e. the first-stage landing success rate
success_rate = df["Class"].mean()
print(f"Landing Success Rate: {success_rate:.2f}")

   Class
0      0
1      0
2      0
3      0
4      0
5      0
6      1
7      1
   FlightNumber        Date BoosterVersion  PayloadMass Orbit    LaunchSite  \
0             1  2010-06-04       Falcon 9  6123.547647   LEO  CCSFS SLC 40   
1             2  2012-05-22       Falcon 9   525.000000   LEO  CCSFS SLC 40   
2             3  2013-03-01       Falcon 9   677.000000   ISS  CCSFS SLC 40   
3             4  2013-09-29       Falcon 9   500.000000    PO   VAFB SLC 4E   
4             5  2013-12-03       Falcon 9  3170.000000   GTO  CCSFS SLC 40   

       Outcome  Flights  GridFins  Reused   Legs LandingPad  Block  \
0    None None        1     False   False  False        NaN    1.0   
1    None None        1     False   False  False        NaN    1.0   
2    None None        1     False   False  False        NaN    1.0   
3  False Ocean        1     False   False  False        NaN    1.0   
4    None None        1     False   False  False        NaN    1.0   

   ReusedCount Serial  

In [9]:
# Persist the labelled DataFrame as CSV (without the index column) for later use
output_csv_path = "../data/dataset_part_2.csv"
df.to_csv(output_csv_path, index=False)
print("dataset saved")

dataset saved
