In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
df = pd.read_csv("../data/raw/raw_exoplanet_data.csv")
df.head()

Unnamed: 0,pl_name,hostname,pl_rade,pl_masse,pl_orbsmax,pl_eqt,st_teff,st_lum
0,Kepler-1167 b,Kepler-1167,1.71,,0.0175,1419.0,4971.0,-0.53589
1,Kepler-1740 b,Kepler-1740,3.323214,,0.0779,858.0,5705.0,-0.07942
2,Kepler-1581 b,Kepler-1581,0.8,,0.06865,1108.0,6022.0,0.39085
3,Kepler-644 b,Kepler-644,3.15,,0.04641,1655.0,6747.0,0.71041
4,Kepler-1752 b,Kepler-1752,4.540605,,0.2698,419.0,5446.0,-0.39819


In [3]:
df.shape
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pl_name     2000 non-null   str    
 1   hostname    2000 non-null   str    
 2   pl_rade     1975 non-null   float64
 3   pl_masse    906 non-null    float64
 4   pl_orbsmax  1926 non-null   float64
 5   pl_eqt      1604 non-null   float64
 6   st_teff     1959 non-null   float64
 7   st_lum      1952 non-null   float64
dtypes: float64(6), str(2)
memory usage: 125.1 KB


In [4]:
missing = df.isnull().sum().sort_values(ascending=False)
missing.head(10)

pl_masse      1094
pl_eqt         396
pl_orbsmax      74
st_lum          48
st_teff         41
pl_rade         25
pl_name          0
hostname         0
dtype: int64

In [5]:
# ==============================
# Missing Value Strategy - Version A (Baseline)
# ==============================

df_baseline = df.copy()

# 1️⃣ Drop planet mass (too many missing values)
df_baseline = df_baseline.drop(columns=["pl_masse"])

# 2️⃣ Drop rows where critical low-missing features are null
df_baseline = df_baseline.dropna(subset=["pl_orbsmax", "st_teff", "st_lum"])

# 3️⃣ Impute equilibrium temperature using median
median_eqt = df_baseline["pl_eqt"].median()
df_baseline["pl_eqt"] = df_baseline["pl_eqt"].fillna(median_eqt)

# 4️⃣ Final shape check
print("Baseline dataset shape:", df_baseline.shape)
df_baseline.isnull().sum()


Baseline dataset shape: (1877, 7)


pl_name        0
hostname       0
pl_rade       19
pl_orbsmax     0
pl_eqt         0
st_teff        0
st_lum         0
dtype: int64

In [6]:
# ==============================
# Missing Value Strategy - Version B (Physics-Informed)
# ==============================

df_physics = df.copy()

# Drop planet mass
df_physics = df_physics.drop(columns=["pl_masse"])

# Keep only rows with valid physics inputs
df_physics = df_physics.dropna(subset=["pl_orbsmax", "st_lum"])

# Remove zero or negative values (physically invalid)
df_physics = df_physics[
    (df_physics["pl_orbsmax"] > 0) &
    (df_physics["st_lum"] > 0)
]

# Compute recalculated equilibrium temperature proxy
df_physics["teq_recalc"] = (
    (df_physics["st_lum"] / (df_physics["pl_orbsmax"] ** 2)) ** 0.25
)

# Drop original pl_eqt
df_physics = df_physics.drop(columns=["pl_eqt"])

# Drop rows where recalculation still failed
df_physics = df_physics.dropna(subset=["teq_recalc"])

print("Physics-informed dataset shape:", df_physics.shape)
df_physics.isnull().sum()


Physics-informed dataset shape: (879, 7)


pl_name       0
hostname      0
pl_rade       7
pl_orbsmax    0
st_teff       0
st_lum        0
teq_recalc    0
dtype: int64