In [None]:
# ============================================================
# 🚀 EXOHABITAI — LEVEL-900 ML DATASET PREPARATION PIPELINE
# SINGLE CELL — AUTO ROOT DETECTION + PRO FEATURE ENGINEERING
# ============================================================

import os
import pandas as pd
import numpy as np

print("üöÄ ExoHabitAI ‚Äî ML Dataset Preparation Started")

# ============================================================
# 🌌 AUTO PROJECT ROOT DETECTION (VERY IMPORTANT)
# ============================================================

# Locations of the input and output CSVs relative to the project root.
INPUT_RELATIVE_PATH = os.path.join("data", "processed", "cleaned_exoplanets.csv")
OUTPUT_RELATIVE_PATH = os.path.join(
    "data", "processed", "model_ready_exoplanets.csv"
)


def find_project_root(start_dir, marker=INPUT_RELATIVE_PATH):
    """Return the directory near ``start_dir`` that contains ``marker``.

    Candidates are tried in this order:

    1. the parent of ``start_dir`` (the layout this cell originally assumed,
       i.e. the notebook is run from a direct sub-folder of the project),
    2. ``start_dir`` itself (the cell is run from the project root),
    3. every remaining ancestor up to the filesystem root.

    Args:
        start_dir: Directory to start searching from (normally the cwd).
        marker: Path, relative to a candidate root, whose existence
            identifies the project root.

    Returns:
        The first candidate that contains ``marker``. If none does, the
        parent of ``start_dir`` is returned so that the "dataset not found"
        error raised later reports the same path as before.
    """
    start_dir = os.path.abspath(start_dir)
    default_root = os.path.dirname(start_dir)

    candidates = [default_root, start_dir]
    ancestor = os.path.dirname(default_root)
    # dirname() of the filesystem root is the root itself, which ends the walk.
    while ancestor not in candidates:
        candidates.append(ancestor)
        ancestor = os.path.dirname(ancestor)

    for candidate in candidates:
        if os.path.exists(os.path.join(candidate, marker)):
            return candidate
    return default_root


CURRENT_DIR = os.getcwd()
PROJECT_ROOT = find_project_root(CURRENT_DIR)

INPUT_PATH = os.path.join(PROJECT_ROOT, INPUT_RELATIVE_PATH)
OUTPUT_PATH = os.path.join(PROJECT_ROOT, OUTPUT_RELATIVE_PATH)

# Make sure the destination folder exists before anything is written to it.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

print("üìÇ Input Dataset:", INPUT_PATH)
print("üìÇ Output Dataset:", OUTPUT_PATH)
print("üì° File Exists:", os.path.exists(INPUT_PATH))

# ============================================================
# 1️⃣ LOAD CLEAN DATASET
# ============================================================

# Stop early, with the full path in the message, when the cleaned CSV is absent.
dataset_available = os.path.exists(INPUT_PATH)
if not dataset_available:
    missing_message = f"Dataset not found:\n{INPUT_PATH}"
    raise FileNotFoundError(missing_message)

# low_memory=False: parse the file in one pass instead of in chunks, so each
# column gets a single inferred dtype.
df = pd.read_csv(INPUT_PATH, low_memory=False)

print("‚úÖ Dataset Loaded:", df.shape)

# ============================================================
# 2️⃣ MEMORY OPTIMIZATION (NASA BIG DATA SAFE)
# ============================================================

print("\nüíæ Optimizing Memory...")

# Narrow 64-bit columns to 32-bit, but only when every value survives the
# cast. A blind cast would silently wrap integers outside the int32 range and
# turn floats beyond the float32 range into +/-inf.
float32_limit = float(np.finfo(np.float32).max)
int32_limits = np.iinfo(np.int32)

for col in df.select_dtypes(include=["float64"]).columns:
    # Ignore values that are already infinite: they stay infinite in float32.
    finite_peak = df[col].replace([np.inf, -np.inf], np.nan).abs().max()
    # An empty or all-NaN column yields NaN here; the comparison is then
    # False and the column is still narrowed, as before.
    if finite_peak > float32_limit:
        continue
    df[col] = df[col].astype("float32")

for col in df.select_dtypes(include=["int64"]).columns:
    col_min, col_max = df[col].min(), df[col].max()
    # Leave the column as int64 when any value would not fit in int32.
    if col_min < int32_limits.min or col_max > int32_limits.max:
        continue
    df[col] = df[col].astype("int32")

# deep=True also counts the contents of object (string) columns.
mem_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory Usage After Optimization: {mem_mb:.2f} MB")

# ============================================================
# 3️⃣ SCIENTIFIC FEATURE SELECTION
# ============================================================

# Columns the model-ready dataset is built from. Any that are absent from the
# loaded CSV are skipped rather than treated as an error.
CORE_FEATURES = [
    "pl_rade",
    "pl_eqt",
    "pl_orbper",
    "st_teff",
    "st_mass",
    "st_rad",
]

existing_features = [c for c in CORE_FEATURES if c in df.columns]

print("\nüß† Using Scientific Features:", existing_features)

if not existing_features:
    # ValueError is still an Exception subclass, so existing
    # `except Exception` handlers keep working.
    raise ValueError("‚ùå No scientific features found")

# Work on a copy so the engineered columns added later never touch `df`.
df_model = df[existing_features].copy()

# ============================================================
# 4️⃣ ADVANCED FEATURE ENGINEERING (HSI + SCI)
# ============================================================

print("\nüõ∞Ô∏è Creating Engineered Features...")

# Each derived column is exp(-|value - reference| / scale): exactly 1.0 at the
# reference value and decaying toward 0 as the value moves away from it.
# Entries are (source column, derived column, reference, scale); a scale of
# None means the distance is used undivided.
# NOTE(review): the references look like Earth/Sun values (1 Earth radius,
# 288 K, 5778 K) — confirm the units of the dataset columns.
closeness_specs = (
    ("pl_rade", "rade_norm", 1, None),
    ("pl_eqt", "eqt_norm", 288, 150),
    ("st_teff", "teff_norm", 5778, 2000),
)

for source_col, derived_col, reference, scale in closeness_specs:
    if source_col not in df_model.columns:
        continue
    neg_distance = -(df_model[source_col] - reference).abs()
    if scale is not None:
        neg_distance = neg_distance / scale
    df_model[derived_col] = np.exp(neg_distance)

# ------------------------------------------------------------
# 🌍 HSI — Habitability Score Index
# ------------------------------------------------------------
# Plain average of the radius and equilibrium-temperature closeness scores;
# only created when both inputs are available.
hsi_inputs = ("rade_norm", "eqt_norm")
if all(name in df_model.columns for name in hsi_inputs):
    hsi_total = df_model["rade_norm"] + df_model["eqt_norm"]
    df_model["HSI"] = hsi_total / 2

# ------------------------------------------------------------
# ⭐ SCI — Stellar Compatibility Index
# ------------------------------------------------------------
# Currently just the stellar-temperature closeness score under another name.
if "teff_norm" in df_model.columns:
    df_model["SCI"] = df_model["teff_norm"]

# ============================================================
# 5️⃣ AUTO TARGET GENERATION (HABITABILITY LABEL)
# ============================================================

print("\nüß™ Generating Habitability Label...")

# Inclusive [low, high] window each available column must fall in for a row to
# be labelled 1. Missing values compare as False, so such rows get label 0.
label_ranges = {
    "pl_rade": (0.5, 2),
    "pl_eqt": (200, 350),
}

range_masks = [
    df_model[column].between(lower, upper)
    for column, (lower, upper) in label_ranges.items()
    if column in df_model.columns
]

if range_masks:
    # A row is labelled 1 only when every available window is satisfied.
    all_in_range = range_masks[0]
    for mask in range_masks[1:]:
        all_in_range = all_in_range & mask
    df_model["habitability"] = all_in_range.astype(int)
else:
    # Neither column is present: nothing can be assessed, so label all rows 0.
    df_model["habitability"] = 0

print("üåç Habitable planets:", df_model["habitability"].sum())

# ============================================================
# 6️⃣ HANDLE MISSING VALUES (MODEL SAFE)
# ============================================================

print("\nüßπ Filling Missing Values...")

# Treat +/-inf as missing, then fill every gap with that column's median.
# NOTE(review): a column with no valid values has no median and would remain
# NaN after this step — confirm that cannot occur in the input.
non_finite = [np.inf, -np.inf]
df_model = df_model.replace(non_finite, np.nan)

column_medians = df_model.median(numeric_only=True)
df_model = df_model.fillna(column_medians)

# ============================================================
# 7️⃣ FINAL DATASET CHECK
# ============================================================

final_columns = df_model.columns.tolist()

print("\nüìä Final Dataset Shape:", df_model.shape)
print("Columns:", final_columns)

# ============================================================
# 8️⃣ SAVE MODEL-READY DATASET
# ============================================================

# index=False keeps the row index out of the CSV.
df_model.to_csv(OUTPUT_PATH, index=False)

print("\nüíæ Model Ready Dataset Saved Successfully!")
print("‚úÖ Path:", OUTPUT_PATH)

# ============================================================
# 🎉 LEVEL-900 PREPARATION COMPLETE
# ============================================================

# Closing summary of the downstream stages named by this cell.
closing_lines = (
    "\nüöÄ Dataset Ready For:",
    "‚úÖ Week3 Training Pipeline",
    "‚úÖ Backend Model Deployment",
    "‚úÖ Ranking Engine",
)
for closing_line in closing_lines:
    print(closing_line)