In [18]:
# Notebook 01: Create Science-Ready Subset Dataset
# -------------------------------------------------
# Purpose:
# This notebook loads the full exoplanet dataset and creates a disciplined,
# science-ready subset containing only the variables required for
# hypothesis testing, habitability analysis, and robustness checks.

import pandas as pd

# -----------------------------
# 1. Load Full Dataset
# -----------------------------
# The filename is a relative path, so it is resolved against the notebook's
# working directory. Edit it if the Kaggle download lives elsewhere or was
# saved under a different name.

raw_csv_name = "Exoplanet Data.csv"
df = pd.read_csv(filepath_or_buffer=raw_csv_name)

# Report (rows, columns) of the untouched frame as a baseline for later cells.
full_shape = df.shape
print("Full dataset shape:", full_shape)

Full dataset shape: (5788, 210)


In [20]:
# -----------------------------
# 2. Define Science-Ready Columns
# -----------------------------
# Columns are declared per theme and then flattened into SCIENCE_COLUMNS.
# The flattened list keeps the theme order below and, inside each theme,
# the order in which the column names are written.

_SCIENCE_COLUMN_GROUPS = {
    # Planet composition: radius value plus its upper/lower uncertainty
    "radius": (
        "planet_radius_earth_radius",
        "planet_radius_upper_unc_earth_radius",
        "planet_radius_lower_unc_earth_radius",
    ),
    # Planet mass: value only (optional but valuable if populated)
    "mass": (
        "planet_mass_earth_mass",
    ),
    # Habitability / insolation: value plus upper/lower uncertainty
    "insolation": (
        "insolation_flux_earth_flux",
        "insolation_flux_upper_unc_earth_flux",
        "insolation_flux_lower_unc_earth_flux",
    ),
    # Thermal environment: value plus upper/lower uncertainty
    "equilibrium_temperature": (
        "equilibrium_temperature_k",
        "equilibrium_temperature_upper_unc_k",
        "equilibrium_temperature_lower_unc_k",
    ),
    # Number of days the planet takes to orbit its host star
    "orbit": (
        "orbital_period_days",
    ),
    # Technique by which the exoplanet was discovered
    "discovery": (
        "discovery_method",
    ),
}

# Flat list of column names consumed by the subsetting step.
SCIENCE_COLUMNS = [
    column
    for group in _SCIENCE_COLUMN_GROUPS.values()
    for column in group
]

In [22]:
# -----------------------------
# 3. Subset Dataset
# -----------------------------
# Keep every row but only the columns named in SCIENCE_COLUMNS. The explicit
# .copy() makes subset_df an independent frame, so later edits to it do not
# touch (or warn about) the full frame df.

subset_df = df.loc[:, SCIENCE_COLUMNS].copy()

subset_shape = subset_df.shape
print("Subset dataset shape:", subset_shape)

Subset dataset shape: (5788, 12)


In [24]:
# -----------------------------
# 4. Basic Missingness Check
# -----------------------------
# Averaging the boolean NaN mask column-wise yields, for each column, the
# fraction of rows that are missing (0.0 = fully populated, 1.0 = all missing).

null_mask = subset_df.isna()
null_fraction = null_mask.mean()

# Worst-populated columns first.
missingness = null_fraction.sort_values(ascending=False)

# One call, newline-separated: heading (preceded by a blank line), then the table.
print("\nMissingness by column:", missingness, sep="\n")


Missingness by column:
equilibrium_temperature_upper_unc_k     0.732032
equilibrium_temperature_lower_unc_k     0.732032
insolation_flux_upper_unc_earth_flux    0.410332
insolation_flux_lower_unc_earth_flux    0.410332
planet_radius_upper_unc_earth_radius    0.308742
planet_radius_lower_unc_earth_radius    0.308742
insolation_flux_earth_flux              0.302004
equilibrium_temperature_k               0.256565
orbital_period_days                     0.048549
planet_mass_earth_mass                  0.005183
planet_radius_earth_radius              0.003628
discovery_method                        0.000000
dtype: float64


In [26]:
# -----------------------------
# 5. Save Science-Ready Dataset
# -----------------------------
# Write the subset to a CSV in the working directory. index=False leaves the
# row index out of the file, so only the selected columns are written.

subset_df.to_csv(path_or_buf="exoplanets_science_ready.csv", index=False)

# Confirmation message; the leading newline prints a blank line before it.
saved_message = "\nSaved science-ready dataset as 'exoplanets_science_ready.csv'"
print(saved_message)


Saved science-ready dataset as 'exoplanets_science_ready.csv'


In [28]:
# -----------------------------
# 6. Quick Preview
# -----------------------------
# Left as the cell's final bare expression so the notebook renders it as a
# table; n=5 is head()'s default, spelled out here for clarity.

subset_df.head(n=5)

Unnamed: 0,planet_radius_earth_radius,planet_radius_upper_unc_earth_radius,planet_radius_lower_unc_earth_radius,planet_mass_earth_mass,insolation_flux_earth_flux,insolation_flux_upper_unc_earth_flux,insolation_flux_lower_unc_earth_flux,equilibrium_temperature_k,equilibrium_temperature_upper_unc_k,equilibrium_temperature_lower_unc_k,orbital_period_days,discovery_method
0,1.66,,,3.41,1.3,,,,,,17.8719,Radial Velocity
1,2.69,,,7.7,0.06,,,,,,217.21,Radial Velocity
2,1.08,,,1.26,0.65,0.03,-0.03,250.0,7.0,-6.0,15.564,Radial Velocity
3,1.24,0.06,-0.06,2.53,233.9,18.3,-18.3,996.0,22.0,-22.0,0.573474,Transit
4,2.043,0.069,-0.069,6.27,64.7,5.5,-5.5,789.0,16.0,-16.0,6.00127,Transit
