In [None]:
import pandas as pd
import numpy as np

# Location of the feature-engineered dataset, relative to the notebook's working directory.
ENGINEERED_PATH = "../data/processed/feature_engineered_exoplanets.csv"

# Load the full CSV into memory; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer=ENGINEERED_PATH)

print("✅ Dataset shape:", df.shape)
# Preview the first five rows as the cell's rich output.
df.head(5)


In [None]:
# Name of the label column that the classifier is trained to predict.
target_col = "habitability"

# Fail fast with an actionable message instead of printing a "✅ ... False"
# line and then crashing with a bare KeyError on the lookup below.
if target_col not in df.columns:
    raise KeyError(
        f"Target column {target_col!r} not found; "
        f"available columns: {list(df.columns)}"
    )

print("✅ Target column exists:", target_col in df.columns)
# dropna=False so rows with a missing label show up in the class balance
# rather than being silently omitted from the counts.
df[target_col].value_counts(dropna=False)


In [None]:
# Percentage of missing values per column, sparsest columns first.
missing_fraction = df.isna().mean()
missing_pct = missing_fraction.mul(100).sort_values(ascending=False)

# Show the 20 columns with the most missing data.
missing_pct.head(20)


In [None]:
# Split the column names by dtype: numeric columns vs. object (string-like) columns.
numeric_cols = list(df.select_dtypes(include=["number"]).columns)
categorical_cols = list(df.select_dtypes(include=["object"]).columns)

print("✅ Numeric cols:", len(numeric_cols))
print("✅ Categorical cols:", len(categorical_cols))

# Display a sample (at most 15 names) from each group.
(numeric_cols[:15], categorical_cols[:15])


In [None]:
# How many of the strongest numeric correlates to keep as candidate features.
N_TOP_FEATURES = 25

numeric_df = df.select_dtypes(include=["number"]).copy()

# Correlation is only defined for numeric columns; give a clear error if the
# target was not picked up as numeric instead of an opaque KeyError below.
if target_col not in numeric_df.columns:
    raise KeyError(
        f"Target column {target_col!r} is not numeric, so it cannot be "
        "correlated against the numeric features; encode it as numbers first."
    )

# NOTE(review): this ranking is computed on the full dataset, before the
# train/test split performed in a later cell, so held-out rows influence
# which features are selected. Consider ranking on the training rows only.
corrs = numeric_df.corr()[target_col].abs()
corrs = (
    corrs
    .drop(target_col, errors="ignore")  # the target trivially correlates with itself
    .dropna()                           # constant / all-NaN columns have undefined correlation
    .sort_values(ascending=False)
)

top_features = corrs.head(N_TOP_FEATURES).index.tolist()

print("✅ Top correlated features:")
top_features


In [None]:
# Feature set = correlation-selected numeric columns + every categorical column.
# dict.fromkeys de-duplicates while keeping first-seen order. The previous
# list(set(...)) produced a column order that changes between kernel sessions
# (string hashing is randomized per process), which in turn changes the fitted
# random forest even with a fixed random_state.
# The target is filtered out explicitly so it can never leak into X.
selected_cols = [
    col
    for col in dict.fromkeys(top_features + categorical_cols)
    if col != target_col
]

X = df[selected_cols]
y = df[target_col]

print("✅ X shape:", X.shape)
print("✅ y shape:", y.shape)
X.head()


In [None]:
from sklearn.model_selection import train_test_split

# Stratified splitting needs at least two classes AND at least two samples in
# every class; otherwise train_test_split raises. Fall back to a plain random
# split in either case (the original guard only covered the single-class case).
class_counts = y.value_counts()
can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
if not can_stratify:
    print("⚠️ Stratified split not possible (too few classes/samples); using a random split")

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y if can_stratify else None
)

print("✅ Train:", X_train.shape, y_train.shape)
print("✅ Test :", X_test.shape, y_test.shape)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Re-derive the column groups from the selected feature frame X (not the full df),
# so the transformers below only reference columns that X actually contains.
numeric_cols = list(X.select_dtypes(include=["number"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" lets categories unseen during fit pass through as all-zero rows.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

# Random forest with class weights balanced against label frequencies; seeded for repeatability.
model = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=200,
    random_state=42,
)

# Full estimator: preprocessing followed by the classifier.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model),
])

pipeline


In [None]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)
print("✅ Training completed")


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on the held-out split.
y_pred = pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
# Pin the label set to the classes seen during training so the matrix always
# has one row/column per class, even if a class is absent from both y_test
# and y_pred (without `labels`, the matrix would shrink in that case).
cm = confusion_matrix(y_test, y_pred, labels=pipeline.classes_)

print("✅ Accuracy:", acc)
print("\n✅ Classification Report:\n")
# zero_division=0 reports 0 instead of warning when a class has no predictions.
print(classification_report(y_test, y_pred, zero_division=0))

cm


In [None]:
import matplotlib.pyplot as plt

# Heatmap of the confusion matrix: rows are actual classes, columns are predictions.
# Tick positions come from the matrix size rather than a hard-coded [0, 1],
# so the axes stay correct if the matrix is not 2x2.
n_classes = cm.shape[0]
tick_positions = list(range(n_classes))
if n_classes == 2:
    # NOTE(review): assumes the sorted class order is (not habitable, habitable) — confirm.
    tick_labels = ["Not Habitable", "Habitable"]
else:
    # No readable names are known for other class counts; label by matrix index.
    tick_labels = [str(i) for i in tick_positions]

plt.figure(figsize=(5,4))
plt.imshow(cm, aspect="auto")
plt.title("Confusion Matrix")
plt.colorbar()
plt.xticks(tick_positions, tick_labels)
plt.yticks(tick_positions, tick_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


In [None]:
import joblib
import os

# Destination for the serialized pipeline (preprocessing + classifier).
MODEL_PATH = "../models/week3_pipeline_model.pkl"

# Derive the directory from MODEL_PATH so the two cannot drift apart if the
# path is edited; skip makedirs when the path has no directory component.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# NOTE: joblib files are pickle-based — load them only from trusted sources.
joblib.dump(pipeline, MODEL_PATH)
print("✅ Model saved to:", MODEL_PATH)
