In [62]:
import pandas as pd
import numpy as np

df = pd.read_csv("PS_2025.12.01_05.00.30.csv", comment="#")

df.columns = df.columns.str.strip()

df = df.drop_duplicates()

print("Shape after deduplication:", df.shape)


Shape after deduplication: (39063, 92)


In [91]:
BASE_FEATURES = [
    "pl_name",
    "pl_rade",
    "pl_bmasse",
    "pl_orbsmax",
    "pl_eqt",
    "pl_orbper",
    "pl_insol",
    "pl_orbeccen",
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_spectype"
]

df = df[BASE_FEATURES].copy()


In [122]:
df['pl_rade'] = df['pl_rade'].replace(0, np.nan)


In [123]:
df['pl_density'] = np.where(
    (df['pl_bmasse'].notna()) & (df['pl_rade'].notna()),
    (df['pl_bmasse'] / (df['pl_rade'] ** 3)) * 5.51,
    np.nan
)


In [124]:
df['pl_density'] = df['pl_density'].replace([np.inf, -np.inf], np.nan)
df['pl_density'] = df['pl_density'].fillna(df['pl_density'].median())


In [125]:
df['pl_density'] = df['pl_density'].clip(lower=0.5, upper=15)


In [126]:
df["st_luminosity"] = (df["st_rad"] ** 2) * ((df["st_teff"] / 5772) ** 4)

In [127]:
import re

df["spectral_class"] = df["st_spectype"].str.extract(r"([OBAFGKM])")


In [133]:
import numpy as np

numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include="object").columns

df[numeric_cols] = df[numeric_cols].apply(
    lambda x: x.fillna(x.median())
)

df[categorical_cols] = df[categorical_cols].apply(
    lambda x: x.fillna(x.mode()[0])
)


In [129]:
df = pd.get_dummies(df, columns=["spectral_class"], drop_first=True)


# **Module 3**

3.1 Create Habitability Score

In [136]:
def safe_score(x, ideal, tol):
    return np.clip(1 - np.abs(x - ideal) / tol, 0, 1)

df['habitability_score'] = (
    0.30 * safe_score(df['pl_eqt'], 288, 200) +
    0.20 * safe_score(df['pl_rade'], 1, 1) +
    0.20 * safe_score(df['pl_bmasse'], 1, 5) +
    0.20 * safe_score(df['pl_orbsmax'], 1, 1) +
    0.10 * safe_score(df['pl_density'], 1, 3)
)


3.2 Create Classification Target

In [204]:
threshold = df["habitability_score"].quantile(0.75)
df["habitability_class"] = (df["habitability_score"] >= threshold).astype(int)

print("Threshold:", threshold)
print(df["habitability_class"].value_counts())


Threshold: 0.16043281218712255
habitability_class
0    29297
1     9766
Name: count, dtype: int64


3.3 Feature Selection via Pearson

In [173]:
from scipy.stats import pearsonr
import pandas as pd
import numpy as np

target = "habitability_score"

numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

numeric_features.remove(target)

pearson_results = []

for feature in numeric_features:
    valid = df[[feature, target]].dropna()
    if valid[feature].nunique() > 1:
        r, p = pearsonr(valid[feature], valid[target])
        pearson_results.append((feature, r, abs(r), p))

pearson_df = pd.DataFrame(
    pearson_results,
    columns=["Feature", "Pearson_r", "AbsCorr", "p_value"]
).sort_values("AbsCorr", ascending=False)

selected_features = pearson_df[pearson_df["AbsCorr"] >= 0.2]["Feature"].tolist()

print("Selected Features (|r| >= 0.2):")
print(selected_features)

pearson_df.head(25)


Selected Features (|r| >= 0.2):
['pl_rade', 'pl_density', 'pl_orbsmax', 'pl_bmasse']


Unnamed: 0,Feature,Pearson_r,AbsCorr,p_value
0,pl_rade,0.909615,0.909615,0.0
12,pl_density,-0.884055,0.884055,0.0
2,pl_orbsmax,0.326138,0.326138,0.0
1,pl_bmasse,0.25912,0.25912,0.0
4,pl_orbper,0.087853,0.087853,8.739164e-68
6,pl_orbeccen,0.069698,0.069698,2.857839e-43
13,habitability_class,0.067257,0.067257,2.091162e-40
8,st_rad,0.052678,0.052678,2.0444150000000002e-25
9,st_mass,0.047936,0.047936,2.559336e-21
3,pl_eqt,0.025168,0.025168,6.524799e-07


3.4 Train-Test Split

In [205]:
from sklearn.model_selection import train_test_split

X = df[selected_features]
y_reg = df["habitability_score"]
y_cls = df["habitability_class"]

X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, test_size=0.2, random_state=42
)

_, _, y_train_cls, y_test_cls = train_test_split(
    X, y_cls, test_size=0.2, random_state=42
)


# **Module 4**

model training and evaluation

linear regression

In [143]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr = LinearRegression()
lr.fit(X_train, y_train_reg)

pred = lr.predict(X_test)

print("LR RMSE:", mean_squared_error(y_test_reg, pred))
print("LR R2:", r2_score(y_test_reg, pred))


LR RMSE: 1.695087817794872e-07
LR R2: 0.9927053387647528


Random Forest Regressor

In [147]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42)
rf.fit(X_train, y_train_reg)

pred = rf.predict(X_test)

print("RF RMSE:", mean_squared_error(y_test_reg, pred))
print("RF R2:", r2_score(y_test_reg, pred))


RF RMSE: 1.4349379358499288e-06
RF R2: 0.9382487088530488


XGBoost Regressor

In [206]:
from xgboost import XGBRegressor

xgb = XGBRegressor(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb.fit(X_train, y_train_reg)
pred = xgb.predict(X_test)
print("RF RMSE:", mean_squared_error(y_test_reg, pred))
print("RF R2:", r2_score(y_test_reg, pred))


RF RMSE: 2.2202468011899397e-06
RF R2: 0.9044536330017929


Logistic Regression

In [162]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, log.predict(X_test)))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3119
           1       0.60      1.00      0.75      4694

    accuracy                           0.60      7813
   macro avg       0.30      0.50      0.38      7813
weighted avg       0.36      0.60      0.45      7813



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Classifier

In [163]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42)
rfc.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, rfc.predict(X_test)))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3119
           1       1.00      1.00      1.00      4694

    accuracy                           1.00      7813
   macro avg       1.00      1.00      1.00      7813
weighted avg       1.00      1.00      1.00      7813



xgboost

In [157]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgbc.fit(X_train, y_train_cls)

print(classification_report(y_test_cls, xgbc.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5472
           1       1.00      1.00      1.00      2341

    accuracy                           1.00      7813
   macro avg       1.00      1.00      1.00      7813
weighted avg       1.00      1.00      1.00      7813



cross validation

In [171]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

X = df[selected_features]
y = df["habitability_class"]

models = {
    "Logistic": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(
        n_estimators=300, max_depth=5, random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])

    scores = cross_val_score(pipe, X, y, cv=cv, scoring="roc_auc")

    print(f"\n{name}")
    print("ROC-AUC scores:", scores)
    print("Mean:", scores.mean())
    print("Std:", scores.std())



Logistic
ROC-AUC scores: [0.99839171 0.99948852 0.99950408 0.99866418 0.99887519]
Mean: 0.9989847346143105
Std: 0.00044496135762727763

RandomForest
ROC-AUC scores: [0.99996829 0.9999362  0.99997237 0.99979362 0.9999725 ]
Mean: 0.9999285963986047
Std: 6.88417101174537e-05

XGBoost
ROC-AUC scores: [0.99997969 0.99994927 0.99999092 0.99999687 0.99998809]
Mean: 0.99998096518949
Std: 1.6787572123193307e-05


selecting the features





In [169]:
BASE_FEATURES = [
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_luminosity",
    "pl_orbper",
    "pl_orbeccen",
    "pl_insol",
    "pl_orbsmax"
]


In [174]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

X = df[BASE_FEATURES]
y = df["habitability_class"]


In [175]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


re doing the cross validation

In [176]:
models = {
    "Logistic": Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            penalty="l2",
            C=0.5,
            class_weight="balanced",
            max_iter=1000
        ))
    ]),

    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        max_depth=6,
        min_samples_leaf=20,
        class_weight="balanced",
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42
    )
}


In [197]:
for name, model in models.items():
    scores = cross_val_score(
        model, X, y,
        cv=cv,
        scoring="roc_auc"
    )
    print(f"\n{name}")
    print("ROC-AUC scores:", scores)
    print("Mean:", scores.mean())
    print("Std:", scores.std())



Logistic
ROC-AUC scores: [0.70398846 0.69622131 0.69954079 0.70623791 0.70672617]
Mean: 0.7025429284420193
Std: 0.004055063298234222

RandomForest
ROC-AUC scores: [0.78412836 0.7785048  0.77982538 0.78200867 0.78908222]
Mean: 0.7827098859922385
Std: 0.0037178879864419364

XGBoost
ROC-AUC scores: [0.83445484 0.82685057 0.82504019 0.83042945 0.82838338]
Mean: 0.8290316862807401
Std: 0.003239564372911455


final classification models without overfitting

In [184]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


In [190]:
FEATURES = [
    "st_teff",
    "st_rad",
    "st_mass",
    "st_met",
    "st_luminosity",
    "pl_orbper",
    "pl_orbeccen",
    "pl_insol"
]

X = df[FEATURES]
y = df["habitability_class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


random forest

In [193]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=20,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Random Forest
Accuracy: 0.8167157301932676
Precision: 0.8326947637292464
Recall: 0.33384536610343063
F1: 0.4766081871345029
ROC-AUC: 0.8116234933916318


xg boost

In [196]:
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]

print("XGBoost")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


XGBoost
Accuracy: 0.8234992960450531
Precision: 0.816079295154185
Recall: 0.3794162826420891
F1: 0.518000699056274
ROC-AUC: 0.8225978148608336
