In [None]:
# ============================================================
# 🚀 EXOHABITAI — LEVEL-1200 WEEK3 ML PIPELINE (FINAL)
# AUTO ROOT DETECTION + AUTO DATASET FINDER + BACKEND READY
# ============================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Apply matplotlib's "dark_background" style sheet to every figure drawn below.
plt.style.use("dark_background")

# Announce on stdout that the pipeline run has begun.
print("üöÄ ExoHabitAI ‚Äî Week3 ML Pipeline Started")

# ============================================================
# 🌌 AUTO PROJECT ROOT DETECTOR (ULTRA SAFE)
# ============================================================

def find_project_root(start=None, marker="backend", max_depth=6):
    """Locate the project root by walking upward until *marker* is found.

    Beginning at *start*, each directory is checked for a sub-directory
    named *marker*; the first directory that contains one is returned.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        marker: Name of the sub-directory that identifies the project root.
        max_depth: Maximum number of directories to inspect, counting
            *start* itself.

    Returns:
        The absolute path of the first inspected directory that contains a
        *marker* sub-directory, or the absolute *start* directory when none
        of the inspected directories does.
    """
    origin = os.path.abspath(os.getcwd() if start is None else start)
    candidate = origin
    for _ in range(max_depth):
        # Require a real directory: the pipeline later creates folders
        # inside <root>/backend, so a plain file with that name is not a
        # usable marker.
        if os.path.isdir(os.path.join(candidate, marker)):
            return candidate
        parent = os.path.dirname(candidate)
        if parent == candidate:
            # Reached the filesystem root; climbing further is a no-op.
            break
        candidate = parent
    return origin

# Resolve the project root once; every path below is derived from it.
PROJECT_ROOT = find_project_root()

print("üì° Project Root:", PROJECT_ROOT)

# ============================================================
# 🔥 AUTO DATASET SCANNER (NO MORE FILE ERRORS)
# ============================================================

# Folders to probe, in priority order.
SEARCH_DIRS = [
    os.path.join(PROJECT_ROOT, "data", "processed"),
    os.path.join(PROJECT_ROOT, "notebooks", "data", "processed"),
    PROJECT_ROOT,
]

# Accepted dataset file names, in priority order within each folder.
TARGET_FILES = [
    "model_ready_exoplanets.csv",
    "processed_model_ready_exoplanets.csv",
    "final_model_ready_exoplanets.csv",
]

# Folder-major scan: every file name is tried in the first folder before the
# next folder is considered. The generator is lazy, so probing stops at the
# first path that exists.
candidate_paths = (
    os.path.join(folder, file_name)
    for folder in SEARCH_DIRS
    for file_name in TARGET_FILES
)
DATA_PATH = next(
    (path for path in candidate_paths if os.path.exists(path)),
    None,
)

# Fail fast with an actionable message when nothing was found.
if DATA_PATH is None:
    raise FileNotFoundError(
        "‚ùå No model-ready dataset found.\n"
        "üëâ Run preprocessing notebook FIRST."
    )

print("‚úÖ Using Dataset:", DATA_PATH)

# ============================================================
# LOAD DATASET (SAFE MODE)
# ============================================================

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype column inference.
df = pd.read_csv(DATA_PATH, low_memory=False)

print("üìä Dataset Shape:", df.shape)

# ============================================================
# TARGET VALIDATION
# ============================================================

target = "habitability"

if target not in df.columns:
    raise ValueError("‚ùå 'habitability' column missing")

# Keep only numeric features for ML stability
numeric_df = df.select_dtypes(include=np.number)

# Read the label from the full frame, not from numeric_df: a boolean or text
# label column is excluded by select_dtypes, so looking it up in numeric_df
# would raise KeyError even though the validation above passed.
y = df[target]

# errors="ignore" covers the same case, where the label is absent from
# numeric_df and there is nothing to drop.
X = numeric_df.drop(columns=[target], errors="ignore")

print("üß† Feature Count:", X.shape[1])

# ============================================================
# TRAIN TEST SPLIT
# ============================================================

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so that repeated runs produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# ============================================================
# MODEL PIPELINES
# ============================================================

# Linear baseline: median-impute missing values, standardise the features,
# then fit a class-weighted logistic regression.
logistic_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=3000, class_weight="balanced")),
])

# Tree ensemble: median-impute only (no scaling step), then fit a
# class-weighted random forest on all available cores (n_jobs=-1).
forest_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestClassifier(
        n_estimators=400,
        max_depth=12,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )),
])

# Candidate registry, keyed by display name; iteration order is the training
# order.
models = {
    "LogisticRegression": logistic_pipeline,
    "RandomForest": forest_pipeline,
}

# Filled by the training loop with (name, auc, fitted_pipeline) tuples.
results = []

# ============================================================
# TRAIN + EVALUATE
# ============================================================

# Fit every candidate on the training split, report its metrics on the
# held-out split, and record its ROC AUC for the comparison below.
for model_name, candidate in models.items():
    print(f"\nüöÄ Training {model_name}")

    candidate.fit(X_train, y_train)

    predicted_labels = candidate.predict(X_test)
    # Column 1 of predict_proba is used as the ranking score for ROC AUC.
    positive_scores = candidate.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, positive_scores)

    print(classification_report(y_test, predicted_labels))
    print("AUC:", test_auc)

    results.append((model_name, test_auc, candidate))

# ============================================================
# AUTO SELECT BEST MODEL
# ============================================================

# Highest held-out AUC wins; on a tie the candidate trained first is kept.
best_name, best_auc, best_model = max(results, key=lambda entry: entry[1])

print("\nüèÜ BEST MODEL:", best_name)
print("üî• BEST AUC:", best_auc)

# ============================================================
# SAVE MODEL FOR BACKEND
# ============================================================

# The fitted pipeline is written under <project root>/backend/models.
MODEL_DIR = os.path.join(PROJECT_ROOT, "backend", "models")
MODEL_PATH = os.path.join(MODEL_DIR, "exohabitai_model.pkl")

# Create the folder on first run; exist_ok makes re-runs harmless.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the whole pipeline (preprocessing steps plus estimator) in one file.
joblib.dump(best_model, MODEL_PATH)

print("üíæ Model Saved:", MODEL_PATH)

# ============================================================
# CREATE RANKED DATASET (DASHBOARD FIX)
# ============================================================

print("\nüåç Generating ranked_exoplanets.csv")

# Score every row of the dataset with the winning pipeline, attach the score
# and the predicted label as new columns, and order rows from highest to
# lowest score.
df = df.assign(
    habitability_score=best_model.predict_proba(X)[:, 1],
    prediction=best_model.predict(X),
).sort_values(by="habitability_score", ascending=False)

# The ranked table is written to <project root>/data/processed.
RANK_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
RANK_PATH = os.path.join(RANK_DIR, "ranked_exoplanets.csv")

os.makedirs(RANK_DIR, exist_ok=True)

# index=False keeps the pandas row index out of the CSV.
df.to_csv(RANK_PATH, index=False)

print("üìä Ranked Dataset Saved:", RANK_PATH)

# ============================================================
# FEATURE IMPORTANCE VISUAL (RF ONLY)
# ============================================================

# Only the random forest exposes feature_importances_, so the chart is drawn
# only when it won the comparison.
if best_name == "RandomForest":

    importances = best_model.named_steps["model"].feature_importances_

    # SimpleImputer discards columns that had no observed value during fit,
    # so the forest may have been trained on fewer features than X has
    # columns. Such columns are marked by NaN in statistics_; filter them out
    # so the labels line up with the importances.
    imputer = best_model.named_steps["imputer"]
    feature_names = X.columns[~np.isnan(imputer.statistics_)]

    if len(feature_names) != len(importances):
        # Defensive fallback: never let a label mismatch abort the run after
        # the model and ranking have already been saved.
        feature_names = pd.Index(
            [f"feature_{i}" for i in range(len(importances))]
        )

    # Ascending order puts the most important feature at the top of the
    # horizontal bar chart.
    order = np.argsort(importances)

    plt.figure(figsize=(8,5))
    plt.barh(feature_names[order], importances[order])
    plt.title("RandomForest Feature Importance")
    plt.tight_layout()
    plt.show()

# ============================================================
# COMPLETE
# ============================================================

# Final status report: one line per message, in this order.
print(
    "\nüöÄ LEVEL-1200 WEEK3 ML PIPELINE COMPLETE",
    "‚úÖ Backend Ready",
    "‚úÖ Ranking API Ready",
    "‚úÖ Dashboard Sync Ready",
    sep="\n",
)