# In [None]:
# ============================================================
# 🚀 NASA Exoplanet Detection using Random Forest Classifier
# ============================================================

# 📦 Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# ------------------------------------------------------------
# 🌍 Load Dataset
# ------------------------------------------------------------
# One seed shared by the split, the resampler and the classifier further down.
SEED = 42

# The CSV is fetched over HTTPS every time this cell runs.
url = 'https://raw.githubusercontent.com/Siddharths99/NASA-Exoplanet-Detection/refs/heads/main/kepler.csv'
df = pd.read_csv(url)

# Keep the two name columns aside (positionally indexed) so they can be
# re-attached to the predictions after they are dropped from the model frame.
df_names = df.loc[:, ["kepler_name","kepoi_name"]].reset_index(drop=True)

print("Initial dataset size:", df.shape)
display(df.iloc[:3])

# ------------------------------------------------------------
# 🧹 Data Cleaning and Preprocessing
# ------------------------------------------------------------
# Columns removed before modelling; any that are absent are simply skipped.
drop_cols = ["rowid","kepid","kepoi_name","kepler_name",
             "koi_pdisposition","koi_tce_delivname","ra","dec"]
present_drop_cols = [col for col in drop_cols if col in df.columns]

# Drop the unwanted columns, then discard rows that have no target label.
df_model = (
    df.drop(columns=present_drop_cols, errors="ignore")
      .dropna(subset=["koi_disposition"])
)

# Normalise the label text: upper-case first, then strip outer whitespace.
target_text = df_model["koi_disposition"]
df_model["koi_disposition"] = target_text.str.upper().str.strip()

# Replace missing numeric values with the per-column median.
# NOTE(review): the medians are taken over every remaining row, i.e. before
# the train/test split performed later in this script.
num_cols = list(df_model.select_dtypes(include=np.number).columns)
numeric_part = df_model[num_cols]
df_model[num_cols] = numeric_part.fillna(numeric_part.median())

print(df_model["koi_disposition"].value_counts())
print("Dataset size after cleaning:", len(df_model))

# ------------------------------------------------------------
# ⚙️ Feature Selection
# ------------------------------------------------------------
# Columns requested as model inputs, in the order they should appear in X.
candidate_features = (
    "koi_period", "koi_duration", "koi_prad", "koi_teq", "koi_insol",
    "koi_impact", "koi_depth", "koi_steff", "koi_slogg", "koi_srad",
    "koi_model_snr",
)

# Keep only the requested columns that actually exist in the cleaned frame.
available_columns = set(df_model.columns)
feature_cols = [name for name in candidate_features if name in available_columns]

# Feature matrix and (string) target vector.
X = df_model.loc[:, feature_cols]
y = df_model.loc[:, "koi_disposition"]

# ------------------------------------------------------------
# 🎯 Label Encoding and Data Splitting
# ------------------------------------------------------------
# Map the string dispositions to integer class ids; `le` is kept so the ids
# can be turned back into labels when reporting.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# 80/20 hold-out split, stratified on the encoded target so both parts keep
# the same class proportions.
split_parts = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=SEED,
)
X_train, X_test, y_train, y_test = split_parts

# ------------------------------------------------------------
# ⚖️ Handle Class Imbalance using SMOTE
# ------------------------------------------------------------
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=SEED)
balanced_X, balanced_y = smote.fit_resample(X_train, y_train)
X_train, y_train = balanced_X, balanced_y

# Report the shapes of both splits after resampling.
for shape_label, split_frame in (
    ("Train size after SMOTE:", X_train),
    ("Test size:", X_test),
):
    print(shape_label, split_frame.shape)

# ------------------------------------------------------------
# 🌟 Model Training (Random Forest)
# ------------------------------------------------------------
# 200 trees, seeded for reproducibility; every other setting is the
# library default. `fit` returns the estimator itself, so it can be chained.
forest_params = {"n_estimators": 200, "random_state": SEED}
clf = RandomForestClassifier(**forest_params).fit(X_train, y_train)

# ------------------------------------------------------------
# 🔍 Model Evaluation
# ------------------------------------------------------------
# Predictions on the held-out split (integer class ids).
y_pred = clf.predict(X_test)

# Per-feature importances, labelled with the training column names.
feat_importances = pd.Series(
    data=clf.feature_importances_,
    index=X_train.columns,
)

# Confusion matrix: first argument is the truth, second the prediction.
cm = confusion_matrix(y_test, y_pred)

# Text report using the original string labels instead of the integer ids.
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n")
print(report_text)

test_accuracy = clf.score(X_test, y_test)
print("Accuracy:", test_accuracy)

# ------------------------------------------------------------
# 💾 Save Predictions and Top Candidates
# ------------------------------------------------------------
# Test features plus true label, predicted label and the probability of the
# predicted class (the row-wise maximum of predict_proba).
df_test = X_test.assign(
    Actual=le.inverse_transform(y_test),
    Predicted=le.inverse_transform(y_pred),
    Probability=clf.predict_proba(X_test).max(axis=1),
)

# Re-attach the object names. The index values of the test rows are used as
# positions into df_names, which was re-indexed 0..n-1 when it was created.
test_row_ids = df_test.index
name_part = df_names.iloc[test_row_ids].reset_index(drop=True)
value_part = df_test.reset_index(drop=True)
df_test = pd.concat([name_part, value_part], axis=1)

df_test.to_csv("exo_predictions_with_names.csv", index=False)

# Rows predicted as CONFIRMED or CANDIDATE, most confident first.
is_planet_like = df_test["Predicted"].isin(["CONFIRMED","CANDIDATE"])
top_candidates = df_test.loc[is_planet_like].sort_values(
    by="Probability", ascending=False
)
top_candidates.to_csv("top_exo_candidates_with_names.csv", index=False)

# ------------------------------------------------------------
# 📊 Visualization Section
# ------------------------------------------------------------
# Feature Importance: horizontal bars, smallest importance at the bottom.
_, ax_importance = plt.subplots(figsize=(8,6))
feat_importances.sort_values().plot(kind='barh', color="#2196F3", ax=ax_importance)
ax_importance.set_title("Feature Importances")
plt.show()

# Confusion Matrix: integer counts, both axes labelled with the class names.
_, ax_confusion = plt.subplots(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Oranges",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax_confusion,
)
ax_confusion.set_title("Confusion Matrix")
plt.show()

# Top 10 Candidates
# top_candidates is already sorted by descending probability, so the first
# ten rows are the ten most confident CONFIRMED/CANDIDATE predictions.
top10 = top_candidates.head(10)
display(HTML("<h3 style='color:#4CAF50'>Top 10 Candidate/Confirmed Exoplanets</h3>"))
# kepoi_name is shown next to kepler_name so that rows whose kepler_name is
# missing can still be identified.
display(top10[['kepler_name','kepoi_name','Predicted','Probability','koi_prad','koi_period','koi_teq']])

# One text label per bar: the Kepler name when present, else the KOI name.
# NOTE(review): assumes kepler_name is empty for some rows (e.g. objects that
# have not been given a Kepler name) — confirm against the dataset.
bar_labels = top10["kepler_name"].fillna(top10["kepoi_name"]).astype(str)
top10_plot = top10.assign(Name=bar_labels)

plt.figure(figsize=(10,6))
# The previous call passed the integer row index as `y`. With two numeric
# variables seaborn draws vertical bars, so the probabilities became the
# categories and the row numbers the bar heights. A string label on `y` plus
# an explicit horizontal orientation gives one bar per object whose length
# is its probability.
# A dict palette pins each class to a fixed colour no matter which class
# appears first or whether only one of them is present.
sns.barplot(
    data=top10_plot,
    x="Probability",
    y="Name",
    hue="Predicted",
    orient="h",
    dodge=False,
    palette={"CONFIRMED": "#4CAF50", "CANDIDATE": "#FFC107"},
)
plt.title("Top 10 Candidate/Confirmed Exoplanets")
plt.show()

# Probability Heatmap
# One column per class (named after the decoded labels), one row per test
# sample in the order of X_test.
proba_df = pd.DataFrame(clf.predict_proba(X_test), columns=le.classes_)

# Show the first 15 samples, transposed so classes run down the y axis.
first_samples = proba_df.iloc[:15].transpose()
_, ax_proba = plt.subplots(figsize=(12,6))
sns.heatmap(first_samples, annot=True, cmap="YlGnBu", ax=ax_proba)
ax_proba.set_title("Prediction Probabilities for First 15 Test Samples")
plt.show()

print("✅ Notebook execution completed successfully.")