In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("PS_2025.12.01_05.00.30.csv", comment="#")

df.columns = df.columns.str.strip()

df = df.drop_duplicates()

print("Shape after deduplication:", df.shape)


Shape after deduplication: (39063, 92)


In [None]:
BASE_FEATURES = [
    "pl_name",
    "pl_rade",
    "pl_bmasse",
    "pl_orbsmax",
    "pl_eqt",
    "pl_orbper",
    "pl_insol",
    "pl_orbeccen",
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_spectype"
]

df = df[BASE_FEATURES].copy()


In [None]:
df['pl_rade'] = df['pl_rade'].replace(0, np.nan)


In [None]:
df['pl_density'] = np.where(
    (df['pl_bmasse'].notna()) & (df['pl_rade'].notna()),
    (df['pl_bmasse'] / (df['pl_rade'] ** 3)) * 5.51,
    np.nan
)


In [None]:
df['pl_density'] = df['pl_density'].replace([np.inf, -np.inf], np.nan)
df['pl_density'] = df['pl_density'].fillna(df['pl_density'].median())


In [None]:
df['pl_density'] = df['pl_density'].clip(lower=0.5, upper=15)


In [None]:
df["st_luminosity"] = (df["st_rad"] ** 2) * ((df["st_teff"] / 5772) ** 4)

In [None]:
import re

df["spectral_class"] = df["st_spectype"].str.extract(r"([OBAFGKM])")


In [None]:
import numpy as np

numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include="object").columns

df[numeric_cols] = df[numeric_cols].apply(
    lambda x: x.fillna(x.median())
)

df[categorical_cols] = df[categorical_cols].apply(
    lambda x: x.fillna(x.mode()[0])
)


In [None]:
df = pd.get_dummies(df, columns=["spectral_class"], drop_first=True)


# **Module 3**

3.1 Create Habitability Score

In [None]:
def safe_score(x, ideal, tol):
    return np.clip(1 - np.abs(x - ideal) / tol, 0, 1)

df['habitability_score'] = (
    0.30 * safe_score(df['pl_eqt'], 288, 200) +
    0.20 * safe_score(df['pl_rade'], 1, 1) +
    0.20 * safe_score(df['pl_bmasse'], 1, 5) +
    0.20 * safe_score(df['pl_orbsmax'], 1, 1) +
    0.10 * safe_score(df['pl_density'], 1, 3)
)


3.2 Create Classification Target

In [None]:
threshold = df["habitability_score"].quantile(0.7)
df["habitability_class"] = (df["habitability_score"] >= threshold).astype(int)

print("Threshold:", threshold)
print(df["habitability_class"].value_counts())


Threshold: 0.10349333333333334
habitability_class
0    27239
1    11880
Name: count, dtype: int64


In [None]:
FEATURE_PATH = "//content//exoplanet_feature_engineered_dataset.csv"
df.to_csv(FEATURE_PATH, index=False)
print("Saved:", FEATURE_PATH)


Saved: //content//exoplanet_feature_engineered_dataset.csv


3.3 Feature Selection via Pearson

In [None]:
from scipy.stats import pearsonr
import pandas as pd
import numpy as np

target = "habitability_score"

numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

numeric_features.remove(target)

pearson_results = []

for feature in numeric_features:
    valid = df[[feature, target]].dropna()
    if valid[feature].nunique() > 1:
        r, p = pearsonr(valid[feature], valid[target])
        pearson_results.append((feature, r, abs(r), p))

pearson_df = pd.DataFrame(
    pearson_results,
    columns=["Feature", "Pearson_r", "AbsCorr", "p_value"]
).sort_values("AbsCorr", ascending=False)

selected_features = pearson_df[pearson_df["AbsCorr"] >= 0.2]["Feature"].tolist()

print("Selected Features (|r| >= 0.2):")
print(selected_features)

pearson_df.head(25)


Selected Features (|r| >= 0.2):
['habitability_class', 'pl_eqt', 'st_met']


Unnamed: 0,Feature,Pearson_r,AbsCorr,p_value
13,habitability_class,0.84035,0.84035,0.0
3,pl_eqt,-0.249073,0.249073,0.0
10,st_met,-0.200972,0.200972,0.0
9,st_mass,-0.176765,0.176765,1.26253e-271
7,st_teff,-0.138308,0.138308,4.373732e-166
1,pl_bmasse,-0.084224,0.084224,1.976012e-62
6,pl_orbeccen,-0.045988,0.045988,9.575647e-20
11,pl_density,-0.044772,0.044772,8.518174e-19
8,st_rad,-0.039097,0.039097,1.075555e-14
0,pl_rade,-0.02398,0.02398,2.136044e-06


3.4 Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

X = df[selected_features]
y_reg = df["habitability_score"]
y_cls = df["habitability_class"]

X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

_, _, y_train_cls, y_test_cls = train_test_split(
    X, y_cls, test_size=0.2, random_state=42
)


# **Module 4**

model training and evaluation

linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr = LinearRegression()
lr.fit(X_train, y_train_reg)

pred = lr.predict(X_test)

print("LR RMSE:", mean_squared_error(y_test_reg, pred))
print("LR R2:", r2_score(y_test_reg, pred))


LR RMSE: 0.0017101710981326
LR R2: 0.7371235445970432


Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
rf.fit(X_train, y_train_reg)

pred = rf.predict(X_test)

print("RF RMSE:", mean_squared_error(y_test_reg, pred))
print("RF R2:", r2_score(y_test_reg, pred))


RF RMSE: 0.0007761756249660012
RF R2: 0.8806912961609314


XGBoost Regressor

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train_reg)
pred = xgb.predict(X_test)
print("RF RMSE:", mean_squared_error(y_test_reg, pred))
print("RF R2:", r2_score(y_test_reg, pred))


RF RMSE: 0.0007878044266941217
RF R2: 0.8789037918684018


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, log.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5521
           1       1.00      1.00      1.00      2292

    accuracy                           1.00      7813
   macro avg       1.00      1.00      1.00      7813
weighted avg       1.00      1.00      1.00      7813



Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42)
rfc.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, rfc.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5521
           1       1.00      1.00      1.00      2292

    accuracy                           1.00      7813
   macro avg       1.00      1.00      1.00      7813
weighted avg       1.00      1.00      1.00      7813



xgboost

In [None]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgbc.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, xgbc.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5521
           1       1.00      1.00      1.00      2292

    accuracy                           1.00      7813
   macro avg       1.00      1.00      1.00      7813
weighted avg       1.00      1.00      1.00      7813



cross validation

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

X = df[selected_features]
y = df["habitability_class"]

models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, max_depth=5, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    scores = cross_val_score(pipe, X, y, cv=cv, scoring="roc_auc")

    print(f"\n{name}")
    print("ROC-AUC scores:", scores)
    print("Mean:", scores.mean())
    print("Std:", scores.std())



Logistic
ROC-AUC scores: [1. 1. 1. 1. 1.]
Mean: 1.0
Std: 0.0

RandomForest
ROC-AUC scores: [1. 1. 1. 1. 1.]
Mean: 1.0
Std: 0.0

XGBoost
ROC-AUC scores: [1. 1. 1. 1. 1.]
Mean: 1.0
Std: 1.4043333874306804e-16


selecting the features





In [None]:
BASE_FEATURES = [
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_luminosity",
    "pl_orbper",
    "pl_orbeccen",
    "pl_insol",
    "pl_orbsmax"
]


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

X = df[BASE_FEATURES]
y = df["habitability_class"]


In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


re doing the cross validation

In [None]:
models = {
    "Logistic": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            penalty="l2",
            C=0.5,
            class_weight="balanced",
            max_iter=1000
        ))
    ]),

    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_leaf=20,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}


In [None]:
for name, model in models.items():
    scores = cross_val_score(
        model, X, y,
        cv=cv,
        scoring="roc_auc"
    )
    print(f"\n{name}")
    print("ROC-AUC scores:", scores)
    print("Mean:", scores.mean())
    print("Std:", scores.std())



Logistic
ROC-AUC scores: [0.64247349 0.65172794 0.65332881 0.65252928 0.64784389]
Mean: 0.6495806814753492
Std: 0.004021990492109172

RandomForest
ROC-AUC scores: [0.84696208 0.84659283 0.83984221 0.85165388 0.84284413]
Mean: 0.8455790284823916
Std: 0.004005876020863883

XGBoost
ROC-AUC scores: [0.88116857 0.88035578 0.87706945 0.88515492 0.87812152]
Mean: 0.880374047766136
Std: 0.002810073122341794


final classification models without overfitting

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


In [None]:
FEATURES = [
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_luminosity",
    "pl_orbper",
    "pl_orbeccen",
    "pl_insol"
]

X = df[FEATURES]
y = df["habitability_class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


random forest

In [None]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Random Forest
Accuracy: 0.8049404838090363
Precision: 0.8269537480063796
Recall: 0.4424061433447099
F1: 0.5764313507504168
ROC-AUC: 0.8388403658348607


xg boost

In [None]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]

print("XGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


XGBoost
Accuracy: 0.8142838858313068
Precision: 0.8039482641252553
Recall: 0.503839590443686
F1: 0.6194597429845267
ROC-AUC: 0.8521994430912802


predicted habitable and non habitable planets

In [None]:
df["predicted_habitability"] = xgb.predict(X)

ranked_planets = (
    df[["pl_name", "predicted_habitability"]]
    .sort_values("predicted_habitability", ascending=False)
)

ranked_planets.head(10)


Unnamed: 0,pl_name,predicted_habitability
39118,xi Aql b,1
0,11 Com b,1
1,11 Com b,1
39117,xi Aql b,1
3,11 UMi b,1
4,11 UMi b,1
39095,tau Gem b,1
39091,tau Cet f,1
39090,tau Cet e,1
36,2MASS J01033563-5515561 AB b,1


Ranked planets based on predicted score

In [None]:
# Predict continuous habitability score
df["predicted_habitability_score"] = xgb.predict(X)

# Sort and assign rank
ranked_planets = (
    df[["pl_name", "predicted_habitability_score"]]
    .sort_values("predicted_habitability_score", ascending=False)
    .reset_index(drop=True)
)

ranked_planets["rank"] = ranked_planets.index + 1

ranked_planets.head(10)


Unnamed: 0,pl_name,predicted_habitability_score,rank
0,xi Aql b,1,1
1,11 Com b,1,2
2,11 Com b,1,3
3,xi Aql b,1,4
4,11 UMi b,1,5
5,11 UMi b,1,6
6,tau Gem b,1,7
7,tau Cet f,1,8
8,tau Cet e,1,9
9,2MASS J01033563-5515561 AB b,1,10


In [None]:
ranked_planets.to_csv("ranked_exoplanets.csv", index=False)


In [None]:
df.to_csv("//content//PS_2025.12.01_05.00.30.csv", index=False)
print("Saved: exoplanet_clean_dataset.csv")


Saved: exoplanet_clean_dataset.csv


In [None]:
df.to_csv("//content//PS_2025.12.01_05.00.30.csv", index=False)
print("Saved: exoplanet_feature_engineered_dataset.csv")


Saved: exoplanet_feature_engineered_dataset.csv


Saving all trained models

In [None]:
import joblib


In [None]:
import os

os.makedirs("models/regression", exist_ok=True)

joblib.dump(lr, "models/regression/linear_regression.pkl")
joblib.dump(rf, "models/regression/random_forest_reg.pkl")
joblib.dump(xgb, "models/regression/xgboost_reg.pkl")

print("✅ Saved regression models successfully")

✅ Saved regression models successfully


In [None]:
joblib.dump(log, "models/logistic_classifier.pkl")
joblib.dump(rf, "models/random_forest_classifier.pkl")
joblib.dump(xgb, "models/xgboost_classifier.pkl")

print("Saved classification models")


Saved classification models


Saving ranked planets data

In [None]:
import os

# Predict continuous habitability score
df["predicted_habitability_score"] = xgb.predict(X)

# Sort and assign rank
ranked_planets = (
    df[["pl_name", "predicted_habitability_score"]]
    .sort_values("predicted_habitability_score", ascending=False)
    .reset_index(drop=True)
)

ranked_planets["rank"] = ranked_planets.index + 1

ranked_planets.head(10)


os.makedirs("outputs", exist_ok=True)
ranked_planets.to_csv("outputs/ranked_exoplanets.csv", index=False)
print("Saved: ranked_exoplanets.csv")

Saved: ranked_exoplanets.csv
