In [None]:
# ============================================================
# 🚀 EXOHABITAI — LEVEL-900 AUTO TRAINING PIPELINE (FINAL FIX)
# AUTO DATASET DETECTOR + BACKEND SYNC + RANK GENERATOR
# ============================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Apply matplotlib's "dark_background" style sheet to the figures drawn later.
plt.style.use("dark_background")

# Startup banner marking the beginning of a training run in the output.
print("üöÄ ExoHabitAI ‚Äî Training Pipeline Started")

# ============================================================
# 🌌 AUTO PROJECT ROOT DETECTION (ULTRA SAFE)
# ============================================================

# Locate the project root, i.e. the directory that contains data/processed.
# The parent of the working directory is tried first (the layout this script
# was written for: launched from a direct sub-folder of the project). If that
# guess has no data/processed folder, the working directory itself and then
# each of its ancestors are tried, so the script also works when launched from
# the project root or from a deeper sub-folder. When nothing matches, the
# original guess is kept so the later "dataset not found" error still names
# the conventional location.
CURRENT_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(CURRENT_DIR)

_search_order = [PROJECT_ROOT]
_candidate = CURRENT_DIR
while True:
    _search_order.append(_candidate)
    _parent = os.path.dirname(_candidate)
    if _parent == _candidate:  # dirname() is a fixed point at the filesystem root
        break
    _candidate = _parent

for _candidate in _search_order:
    if os.path.isdir(os.path.join(_candidate, "data", "processed")):
        PROJECT_ROOT = _candidate
        break

DATA_FOLDER = os.path.join(PROJECT_ROOT, "data", "processed")

print("üìÇ Searching dataset inside:", DATA_FOLDER)

# ============================================================
# 🔥 AUTO DATASET FINDER (NO MORE FILE ERRORS)
# ============================================================

# Accepted file names for the training set, in priority order: the first one
# that exists as a regular file inside DATA_FOLDER is used.
possible_files = [
    "model_ready_exoplanets.csv",
    "processed_model_ready_exoplanets.csv",
    "final_model_ready_exoplanets.csv",
]

# isfile (rather than exists) so that a directory which happens to carry one
# of these names is never handed to pd.read_csv.
DATA_PATH = next(
    (
        candidate
        for candidate in (os.path.join(DATA_FOLDER, name) for name in possible_files)
        if os.path.isfile(candidate)
    ),
    None,
)

if DATA_PATH is None:
    # Same exception type and leading message as before, extended with what
    # was expected and what is actually present so the failure is actionable.
    if os.path.isdir(DATA_FOLDER):
        csv_present = sorted(
            entry for entry in os.listdir(DATA_FOLDER)
            if entry.lower().endswith(".csv")
        )
        listing = ", ".join(csv_present) if csv_present else "none"
        hint = f"CSV files present: {listing}"
    else:
        hint = "The folder does not exist."
    expected = ", ".join(possible_files)
    raise FileNotFoundError(
        f"‚ùå No training dataset found inside:\n{DATA_FOLDER}\n"
        f"Expected one of: {expected}\n"
        f"{hint}"
    )

print("‚úÖ Using Dataset:", DATA_PATH)

# ============================================================
# LOAD DATA
# ============================================================

# Read the selected CSV into a DataFrame and report its dimensions.
df = pd.read_csv(DATA_PATH)
print("Dataset Shape:", df.shape)

# ============================================================
# TARGET DETECTION
# ============================================================

# The label column is identified by a fixed name; stop early when it is absent.
target = "habitability"
if target not in df.columns:
    raise ValueError("‚ùå Column 'habitability' not found in dataset")

# Labels are the target column; features are every remaining column.
y = df[target]
X = df.drop(target, axis=1)

print("üß† Feature Count:", X.shape[1])

# ============================================================
# TRAIN TEST SPLIT
# ============================================================

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so repeated runs produce the same partition.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ============================================================
# MODELS
# ============================================================

# Two candidate pipelines. Both start by filling missing values with the
# column median and both use class_weight="balanced". Only the logistic
# regression pipeline standardises the features before fitting.
logreg_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(class_weight="balanced", max_iter=3000)),
]

forest_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    (
        "model",
        RandomForestClassifier(
            class_weight="balanced",
            random_state=42,
            max_depth=12,
            n_estimators=400,
        ),
    ),
]

# Insertion order matters: it is the order in which the candidates are trained.
models = dict(
    LogisticRegression=Pipeline(logreg_steps),
    RandomForest=Pipeline(forest_steps),
)

# Collects one (name, auc, fitted_pipeline) tuple per trained candidate.
results = []

# ============================================================
# TRAIN + EVALUATE
# ============================================================

for name, pipe in models.items():
    print(f"\nüöÄ Training {name}")

    # Fit on the training split only; the held-out split is used for scoring.
    pipe.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in the
    # fitted classifier's classes_ (presumably the "habitable" label — confirm
    # against the dataset's encoding).
    class_probabilities = pipe.predict_proba(X_test)
    y_prob = class_probabilities[:, 1]
    y_pred = pipe.predict(X_test)

    auc = roc_auc_score(y_test, y_prob)

    report = classification_report(y_test, y_pred)
    print(report)
    print("AUC:", auc)

    # Keep the fitted pipeline next to its score for the selection step.
    results.append((name, auc, pipe))

# ============================================================
# BEST MODEL
# ============================================================

# Order the candidates by AUC, highest first. The sort is stable, so
# candidates with equal AUC keep the order in which they were trained.
results.sort(key=lambda entry: entry[1], reverse=True)

# The head of the sorted list is the winner.
best_name, best_auc, best_model = results[0]

print("\nüèÜ BEST MODEL:", best_name)
print("üî• BEST AUC:", best_auc)

# ============================================================
# SAVE MODEL
# ============================================================

# Destination of the serialised pipeline: <project root>/backend/models.
MODEL_DIR = os.path.join(PROJECT_ROOT, "backend", "models")
MODEL_PATH = os.path.join(MODEL_DIR, "exohabitai_model.pkl")

# Create the folder on first run; an existing folder is left untouched.
os.makedirs(MODEL_DIR, exist_ok=True)

# The whole fitted pipeline (preprocessing + classifier) is written as one file.
joblib.dump(best_model, filename=MODEL_PATH)

print("üíæ Model Saved:", MODEL_PATH)

# ============================================================
# CREATE RANKED DATASET (DASHBOARD FIX)
# ============================================================

print("\nüåç Generating ranked_exoplanets.csv")

# Score every row of the dataset (training and held-out rows alike) with the
# selected pipeline: the class-1 probability and the predicted label.
scores = best_model.predict_proba(X)[:, 1]
labels = best_model.predict(X)

# Attach both as new columns and order the rows from highest to lowest score.
df = df.assign(habitability_score=scores, prediction=labels).sort_values(
    by="habitability_score", ascending=False
)

# Written next to the training data, without the DataFrame index.
RANK_PATH = os.path.join(DATA_FOLDER, "ranked_exoplanets.csv")
df.to_csv(RANK_PATH, index=False)

print("üìä Ranked Dataset Saved:", RANK_PATH)

# ============================================================
# FEATURE IMPORTANCE (RF)
# ============================================================

# Only the random forest exposes feature_importances_, so the chart is drawn
# only when it was selected as the best model.
if best_name == "RandomForest":

    importances = best_model.named_steps["model"].feature_importances_

    # SimpleImputer discards columns that were entirely missing when it was
    # fitted, so the forest may have seen fewer features than X has columns.
    # In that case plt.barh would fail on the length mismatch. The imputer's
    # fitted statistics_ are NaN for exactly those discarded columns, so they
    # are used to keep only the names that actually reached the model.
    feature_names = X.columns
    if len(feature_names) != len(importances):
        imputer_stats = np.asarray(
            best_model.named_steps["imputer"].statistics_, dtype=float
        )
        feature_names = X.columns[~np.isnan(imputer_stats)]

    plt.figure(figsize=(8, 5))
    plt.barh(feature_names, importances)
    plt.title("RandomForest Feature Importance")
    plt.tight_layout()
    plt.show()

# ============================================================
# COMPLETE
# ============================================================

# Closing status lines, printed one per line in this order.
closing_lines = (
    "\nüöÄ LEVEL-900 TRAINING COMPLETE",
    "‚úÖ Backend Ready",
    "‚úÖ Ranking API Ready",
    "‚úÖ Dashboard Sync Ready",
)
for closing_line in closing_lines:
    print(closing_line)

üöÄ ExoHabitAI ‚Äî Training Pipeline Started
üìÇ Searching dataset inside: d:\Infosys Springboard Internship\ExoHabitAI\data\processed


FileNotFoundError: ‚ùå No training dataset found inside:
d:\Infosys Springboard Internship\ExoHabitAI\data\processed