In [None]:
# ============================================================
# 🚀 EXOHABITAI — LEVEL-500 DATA PREPROCESSING PIPELINE
# SINGLE CELL — PRODUCTION READY
# ============================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

plt.style.use("dark_background")

# NOTE(review): the leading characters in the printed messages look like
# mis-encoded emoji; they are runtime output and are left untouched here.
print("üöÄ ExoHabitAI Data Preprocessing Started")

# ============================================================
# 🌌 AUTO PROJECT ROOT DETECTION
# ============================================================

# The project root is taken to be the parent of the current working
# directory, so every path below depends on where this cell is run from.
CURRENT_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(CURRENT_DIR)

# The input file and the output file both live in <root>/data/processed.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
INPUT_PATH = os.path.join(OUTPUT_DIR, "feature_engineered_exoplanets.csv")
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "model_ready_exoplanets.csv")

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("üìÇ Input:", INPUT_PATH)

# ============================================================
# 1️⃣ LOAD DATASET
# ============================================================

# Read the feature-engineered CSV into `df`, or stop immediately with the
# full path in the error message when the file is not there.
if os.path.exists(INPUT_PATH):
    df = pd.read_csv(INPUT_PATH)
else:
    raise FileNotFoundError(
        f"Missing feature engineered dataset:\n{INPUT_PATH}"
    )

print("\n‚úÖ Dataset Loaded")
print("Shape:", df.shape)

# ============================================================
# 2️⃣ SELECT MODEL FEATURES (BACKEND COMPATIBLE)
# ============================================================

# Columns the model is built on, in the order they are later clipped,
# scaled and written out.
model_features = [
    "pl_rade",
    "pl_eqt",
    "pl_orbper",
    "st_teff",
    "st_mass",
    "st_rad",
    "HSI",
    "SCI",
]

# Keep only the expected features that are actually present in the input,
# and remember which ones are absent so the drop is not silent.
existing_features = [c for c in model_features if c in df.columns]
missing_features = [c for c in model_features if c not in df.columns]

print("\nüß† Features Found:", existing_features)

if missing_features:
    # The scaler fitted further down only covers `existing_features`, so a
    # consumer expecting the full list would receive fewer columns.
    print("WARNING: expected features missing from dataset:", missing_features)

if not existing_features:
    # Without this guard the failure only surfaces later, when the scaler
    # is asked to fit a frame with zero columns.
    raise ValueError(
        "None of the expected model features are present in the dataset: "
        f"{model_features}"
    )

if "habitability" not in df.columns:
    # Same exception type pandas would raise for the lookup below, but with
    # a message that names the problem.
    raise KeyError(
        "Target column 'habitability' is missing from the dataset"
    )

# Work on an independent copy holding the selected features plus the target.
df_model = df[existing_features + ["habitability"]].copy()

# ============================================================
# 3️⃣ HANDLE MISSING VALUES (SCIENCE SAFE)
# ============================================================

print("\nüßπ Filling Missing Values...")

# Every numeric column of df_model has its gaps filled with that column's
# own median.
# NOTE(review): df_model also carries the "habitability" column, so if it
# is numeric its missing values are filled as well — confirm this is wanted.
numeric_cols = df_model.select_dtypes(include=np.number).columns

# Compute all medians in one pass, then fill column by column.
for name, column_median in df_model[numeric_cols].median().items():
    df_model[name] = df_model[name].fillna(column_median)

# ============================================================
# 4️⃣ OUTLIER CLIPPING (NASA SAFE)
# ============================================================

print("\nüõ∞Ô∏è Clipping extreme outliers...")

# Limit each feature to its own 1st..99th percentile range; values outside
# that range are replaced by the nearest bound.
for feature in existing_features:
    values = df_model[feature]
    lower_bound, upper_bound = values.quantile([0.01, 0.99])
    df_model[feature] = values.clip(lower=lower_bound, upper=upper_bound)

# ============================================================
# 5️⃣ FEATURE SCALING (NEURAL READY)
# ============================================================

print("\n‚öôÔ∏è Scaling Features...")

# Fit the scaler on the selected feature columns, then overwrite those
# columns with their standardized values. The fitted `scaler` object is
# kept at module level for the save step that follows.
feature_frame = df_model[existing_features]
scaler = StandardScaler().fit(feature_frame)

df_model[existing_features] = scaler.transform(feature_frame)

# ============================================================
# 6️⃣ SAVE SCALER (VERY IMPORTANT FOR BACKEND)
# ============================================================

import joblib

# The fitted scaler is written to <root>/backend/models/scaler.pkl; the
# directory is created first if it does not exist yet.
scaler_dir = os.path.join(PROJECT_ROOT, "backend", "models")
os.makedirs(scaler_dir, exist_ok=True)
SCALER_PATH = os.path.join(scaler_dir, "scaler.pkl")

joblib.dump(scaler, SCALER_PATH)

print("üíæ Scaler Saved:", SCALER_PATH)

# ============================================================
# 7️⃣ SAVE FINAL DATASET
# ============================================================

# Write the processed frame without the pandas index column.
df_model.to_csv(OUTPUT_PATH, index=False)

print("\nüíæ Model Ready Dataset Saved:", OUTPUT_PATH)

# ============================================================
# 8️⃣ QUICK VISUAL CHECK
# ============================================================

# Draw one histogram per index column. Feature selection earlier in this
# cell keeps only the columns present in the input, so "HSI" or "SCI" may be
# absent from df_model; an unconditional lookup would raise KeyError here,
# after the scaler and dataset have already been saved.
for index_name in ("HSI", "SCI"):
    if index_name not in df_model.columns:
        print(f"Skipping {index_name} plot: column not in dataset")
        continue

    plt.figure(figsize=(6, 4))
    df_model[index_name].hist(bins=40)
    plt.title(f"Scaled {index_name} Distribution")
    plt.show()

# ============================================================
# 🎉 LEVEL-500 PREPROCESSING COMPLETE
# ============================================================

print("\nüöÄ LEVEL-500 PREPROCESSING COMPLETE ‚Äî READY FOR TRAINING")