In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Raw export to load, given relative to the notebook's working directory.
RAW_PATH = "../data/raw/PS_2026.01.19_01.24.31.csv"

# Parser settings: anything after a "#" is treated as a comment and ignored,
# and malformed rows are skipped instead of aborting the load.
read_options = {"comment": "#", "engine": "python", "on_bad_lines": "skip"}

df = pd.read_csv(RAW_PATH, **read_options)
print("✅ Loaded dataset:", df.shape)
df.head()


In [None]:
# Row counts are captured on both sides of the de-duplication so the number
# of removed rows can be reported.
before = len(df)
df = df.drop_duplicates()
after = len(df)

print("✅ Duplicates removed:", before - after)
print("✅ New shape:", df.shape)


In [None]:
# Fraction of missing entries in each column.
missing_ratio = df.isnull().mean()

# Columns where more than 80% of the values are missing are discarded.
drop_cols = [name for name, ratio in missing_ratio.items() if ratio > 0.80]

print("✅ Dropping columns:", len(drop_cols))
df = df.drop(columns=drop_cols)

print("✅ Shape after dropping:", df.shape)


In [None]:
# Column groups are resolved once, before any value is filled in.
num_cols = df.select_dtypes(include=["number"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# Numeric gaps take the median of their own column.
for col in num_cols:
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)

# Gaps in object columns take an explicit placeholder label.
for col in cat_cols:
    df[col] = df[col].fillna("Unknown")

print("✅ Missing values handled")
# Total count of cells that are still missing after both passes.
df.isnull().sum().sum()


In [None]:
def iqr_clip(series, factor=1.5):
    """Clip a series to the fences ``[Q1 - factor*IQR, Q3 + factor*IQR]``.

    Parameters
    ----------
    series : pandas.Series
        Values to clip.
    factor : float, default 1.5
        Multiple of the interquartile range added beyond each quartile.

    Returns
    -------
    pandas.Series
        The clipped series. The input is returned unchanged when the
        interquartile range is zero or cannot be computed (NaN).
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile

    # A zero or undefined spread gives no usable fences, so nothing is clipped.
    if pd.isna(spread) or spread == 0:
        return series

    margin = factor * spread
    return series.clip(first_quartile - margin, third_quartile + margin)


# Columns that receive IQR clipping; any that are absent or non-numeric are skipped.
outlier_cols = ["pl_rade", "pl_eqt", "pl_orbper", "pl_bmasse", "st_teff"]
for col in outlier_cols:
    if col not in df.columns:
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        continue
    df[col] = iqr_clip(df[col])

print("✅ Outliers clipped (IQR method)")


In [None]:
import matplotlib.pyplot as plt

# Visual check of one clipped column; nothing is drawn if the column is absent.
col = "pl_rade"
if col in df.columns:
    fig, ax = plt.subplots(figsize=(6,3))
    ax.boxplot(df[col].dropna(), vert=False)
    ax.set_title(f"Boxplot after outlier clipping: {col}")
    fig.tight_layout()
    plt.show()


In [None]:
def safe_score(series, ideal, scale):
    """Linear closeness score in ``[0, 1]``.

    The score is 1 where ``series`` equals ``ideal`` and falls off linearly
    with the absolute distance, reaching 0 at a distance of ``scale``;
    anything further away is clamped to 0.

    Parameters
    ----------
    series : array-like
        Values to score.
    ideal : float
        Value that receives the maximum score.
    scale : float
        Distance from ``ideal`` at which the score reaches 0.
    """
    distance = np.abs(series - ideal)
    return np.clip(1 - distance / scale, 0, 1)


def _component_score(column, ideal, scale):
    """Score ``df[column]`` with ``safe_score``, or return 0 if the column is absent.

    The 0 fallback keeps the cell runnable when an input column is not in
    ``df``, but the index that uses it is still divided by its full number of
    components, which caps its maximum value. The fallback is therefore
    reported rather than applied silently.

    Parameters
    ----------
    column : str
        Name of the column in the notebook-level ``df`` to score.
    ideal, scale : float
        Passed through to ``safe_score``.

    Returns
    -------
    pandas.Series or int
        Per-row scores in ``[0, 1]``, or the scalar ``0`` when ``column`` is
        not present in ``df``.
    """
    if column not in df.columns:
        print(f"⚠️ Column '{column}' not found; its score is set to 0")
        return 0
    return safe_score(df[column], ideal=ideal, scale=scale)


# Habitability Score Index (HSI): mean of the planet-radius and
# equilibrium-temperature scores.
# NOTE(review): ideal=1.0 / 288 look like Earth reference values — confirm the
# units of pl_rade and pl_eqt against the data source.
radius_score = _component_score("pl_rade", ideal=1.0, scale=1.5)
temp_score = _component_score("pl_eqt", ideal=288, scale=200)

df["HSI"] = (radius_score + temp_score) / 2


# Stellar Compatibility Index (SCI): mean of the stellar temperature, mass
# and radius scores.
# NOTE(review): ideal=5778 / 1.0 / 1.0 look like Sun reference values — confirm
# the units of st_teff, st_mass and st_rad against the data source.
teff_score = _component_score("st_teff", ideal=5778, scale=2500)
mass_score = _component_score("st_mass", ideal=1.0, scale=1.0)
rad_score = _component_score("st_rad", ideal=1.0, scale=1.0)

df["SCI"] = (teff_score + mass_score + rad_score) / 3

print("✅ Added features: HSI, SCI")
df[["HSI", "SCI"]].head()


In [None]:
# Baseline habitability label: 1 where HSI is at least 0.60, otherwise 0.
df["habitability"] = df["HSI"].ge(0.60).astype(int)

# Class balance of the derived label.
df["habitability"].value_counts()


In [None]:
# Pairwise correlations between all numeric columns (DataFrame.corr default method).
numeric_df = df.select_dtypes(include=["number"])

corr = numeric_df.corr()

plt.figure(figsize=(12,8))
# Pin the colour scale to the full correlation range [-1, 1] and use a
# diverging colormap, so that 0 is neutral and the colours do not shift with
# whatever min/max this particular matrix happens to have.
plt.imshow(corr, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
# Label both axes with the column names; without them a cell cannot be
# traced back to the pair of features it describes.
tick_positions = list(range(len(corr.columns)))
plt.xticks(tick_positions, corr.columns, rotation=90, fontsize=6)
plt.yticks(tick_positions, corr.columns, fontsize=6)
plt.title("Correlation Heatmap (After Cleaning + Feature Engineering)")
plt.colorbar(label="Correlation")
plt.tight_layout()
plt.show()


In [None]:
# Destination of the engineered dataset, relative to the notebook's working directory.
CLEANED_PATH = "../data/processed/feature_engineered_exoplanets.csv"

# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not fail at the very last step.
Path(CLEANED_PATH).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(CLEANED_PATH, index=False)
print("✅ Saved engineered dataset:", CLEANED_PATH)
