# In [None]:
# ============================================================
# 🚀 EXOHABITAI — LEVEL-700 AUTO MODEL TRAINING PIPELINE
# SINGLE CELL VERSION — PRODUCTION READY
# ============================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Apply matplotlib's built-in "dark_background" style sheet to the figures
# drawn later in this cell.
plt.style.use("dark_background")

# Start-of-run marker in the cell output.
# NOTE(review): the non-ASCII characters in this message look like a
# mis-decoded emoji / em dash; left untouched because it is runtime output.
print("üöÄ ExoHabitAI ‚Äî Training Pipeline Started")

# ============================================================
# 🌌 AUTO PROJECT ROOT DETECTION
# ============================================================

# Location of the training CSV relative to the project root.
_DATASET_RELATIVE_PARTS = ("data", "processed", "model_ready_exoplanets.csv")


def _find_project_root(start_dir):
    """Return the project root that holds the processed dataset.

    Candidates are probed in this order: the parent of ``start_dir`` (the
    layout this cell has always assumed, i.e. running from a sub-folder of
    the project), ``start_dir`` itself (running from the project root), and
    then every remaining ancestor up to the filesystem root.

    Args:
        start_dir: Directory the search starts from.

    Returns:
        The first candidate that contains the dataset. When none does, the
        parent of ``start_dir`` is returned so the derived paths (and the
        missing-dataset error raised further down) keep pointing at the
        conventional location.
    """
    default_root = os.path.dirname(start_dir)
    candidates = [default_root, start_dir]

    # Walk upwards until dirname() stops changing (filesystem root reached).
    ancestor = default_root
    while os.path.dirname(ancestor) != ancestor:
        ancestor = os.path.dirname(ancestor)
        candidates.append(ancestor)

    for candidate in candidates:
        if os.path.exists(os.path.join(candidate, *_DATASET_RELATIVE_PARTS)):
            return candidate
    return default_root


CURRENT_DIR = os.getcwd()
PROJECT_ROOT = _find_project_root(CURRENT_DIR)

DATA_PATH = os.path.join(PROJECT_ROOT, *_DATASET_RELATIVE_PARTS)

# Directory the trained model is written to; created up front so the
# later joblib.dump cannot fail on a missing folder.
MODEL_DIR = os.path.join(PROJECT_ROOT, "backend", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_PATH = os.path.join(MODEL_DIR, "exohabitai_model.pkl")

print("üìÇ Dataset:", DATA_PATH)

# ============================================================
# 1️⃣ LOAD DATA
# ============================================================

# Read the CSV when it is present; otherwise stop immediately with an
# error that spells out the path that was looked up.
if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f"Dataset not found:\n{DATA_PATH}")

# Shape is reported as (rows, columns).
print("‚úÖ Dataset Loaded:", df.shape)

# ============================================================
# 2️⃣ FEATURE SPLIT
# ============================================================

# Name of the label column; every other column becomes a model input.
target = "habitability"

X = df.drop(labels=[target], axis="columns")
y = df[target]

print("\nüß† Feature Count:", len(X.columns))

# ============================================================
# 3️⃣ TRAIN TEST SPLIT
# ============================================================

# 80/20 hold-out split. The fixed seed keeps the split reproducible and
# stratifying on the label keeps the class ratio equal in both parts.
_SPLIT_OPTIONS = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}

X_train, X_test, y_train, y_test = train_test_split(X, y, **_SPLIT_OPTIONS)

print("Train Size:", X_train.shape)
print("Test Size:", X_test.shape)

# ============================================================
# 4️⃣ BUILD MODELS (AUTO COMPARE)
# ============================================================

# Each candidate bundles its own preprocessing with the estimator, so a
# fitted pipeline can be applied to raw feature rows directly.

# Linear baseline: median imputation, standardisation, then a
# class-weighted logistic regression.
_logistic_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=3000, class_weight="balanced")),
])

# Tree ensemble: median imputation only (no scaling step), then a
# class-weighted random forest with a fixed seed.
_forest_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestClassifier(
        n_estimators=400,
        max_depth=12,
        random_state=42,
        class_weight="balanced",
    )),
])

# Candidates keyed by display name, in the order they are trained.
models = {
    "LogisticRegression": _logistic_pipeline,
    "RandomForest": _forest_pipeline,
}

# Receives one (name, auc, fitted pipeline) tuple per candidate.
results = []

# ============================================================
# 5️⃣ TRAIN + EVALUATE
# ============================================================

# Fit every candidate on the training split and score it on the hold-out
# split; the (name, auc, pipeline) triple is recorded for model selection.
for name in models:
    pipe = models[name]

    print(f"\nüöÄ Training {name}")

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)

    # Second probability column = the class listed second in the fitted
    # classifier's classes_ (presumably the "habitable" label — confirm).
    class_probabilities = pipe.predict_proba(X_test)
    y_prob = class_probabilities[:, 1]

    auc = roc_auc_score(y_test, y_prob)

    report = classification_report(y_test, y_pred)
    print(report)
    print("AUC:", auc)

    results.append((name, auc, pipe))

# ============================================================
# 6️⃣ AUTO SELECT BEST MODEL
# ============================================================

# Order the candidates by hold-out AUC, best first. The sort is stable,
# so candidates with equal AUC keep their training order.
results.sort(key=lambda entry: entry[1], reverse=True)

_winner = results[0]
best_name, best_auc, best_model = _winner

print("\nüèÜ BEST MODEL:", best_name)
print("üî• BEST AUC:", best_auc)

# ============================================================
# 7️⃣ SAVE MODEL (BACKEND READY)
# ============================================================

# Persist the whole fitted pipeline (preprocessing + estimator) in one file.
joblib.dump(best_model, MODEL_PATH)

print("üíæ Model Saved:", MODEL_PATH)

# ============================================================
# 8️⃣ CREATE RANKED DATASET (VERY IMPORTANT)
# ============================================================

print("\nüåç Creating Ranked Exoplanets File...")

# Output file sits next to the input dataset.
RANK_PATH = os.path.join(PROJECT_ROOT, "data", "processed", "ranked_exoplanets.csv")

# Score every row of the dataset with the selected pipeline. This covers
# the rows the model was trained on as well as the hold-out rows.
_all_class_probabilities = best_model.predict_proba(X)
df["habitability_score"] = _all_class_probabilities[:, 1]
df["prediction"] = best_model.predict(X)

# Highest score first, written without the index column.
df = df.sort_values(by="habitability_score", ascending=False)
df.to_csv(RANK_PATH, index=False)

print("üìä Ranked Dataset Saved:", RANK_PATH)

# ============================================================
# 9️⃣ QUICK FEATURE IMPORTANCE (RF ONLY)
# ============================================================

# Only the random forest exposes feature_importances_, so the chart is
# skipped when logistic regression was selected.
if best_name == "RandomForest":

    fitted_imputer = best_model.named_steps["imputer"]
    importances = best_model.named_steps["model"].feature_importances_

    # SimpleImputer discards columns that had no observed value during
    # fit (their learned statistic is NaN), so the forest may have been
    # trained on fewer features than X has columns. Label the bars with
    # the columns that actually reached the model; otherwise barh fails
    # with a shape mismatch.
    kept_columns = ~np.isnan(
        np.asarray(fitted_imputer.statistics_, dtype=float)
    )
    feature_names = X.columns[kept_columns]

    plt.figure(figsize=(8,5))
    plt.barh(feature_names, importances)
    plt.title("RandomForest Feature Importance")
    plt.tight_layout()
    plt.show()

# ============================================================
# 🎉 LEVEL-700 TRAINING COMPLETE
# ============================================================

# Closing status lines, emitted with a single newline-separated print.
print(
    "\nüöÄ LEVEL-700 MODEL TRAINING COMPLETE",
    "‚úÖ Backend /predict API READY",
    "‚úÖ Ranking API READY",
    "‚úÖ Dashboard Sync READY",
    sep="\n",
)