In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
import joblib

In [None]:
# 1) Load your K2 CSV
# -----------------------------
CSV_PATH = r"D:\pro1\10th project NASA Hackathon 2025 - Exoplanet Detection\datasets\k2-data.csv"
df = pd.read_csv(CSV_PATH)

In [3]:
df.head()

Unnamed: 0,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2018-03,2/15/2018
1,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2016-10,7/28/2016
2,BD+20 594 b,BD+20 594,1,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2017-03,4/26/2018
3,EPIC 201111557.01,EPIC 201111557,1,CANDIDATE,Livingston et al. 2018,1,0,Transit,2018,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,8/2/2018,2018-08,8/2/2018
4,EPIC 201111557.01,EPIC 201111557,0,CANDIDATE,Livingston et al. 2018,1,0,Transit,2018,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,2/15/2018,2018-03,2/15/2018


In [4]:
# === 2. Choose Target Column (auto) ===
possible_targets = ["disposition", "Archive Disposition", "archive_disposition"]
target_col = next((c for c in possible_targets if c in df.columns), None)
if target_col is None:
    raise ValueError(f"Target column not found. Tried: {possible_targets}")


In [5]:
# === 3. Normalize Labels to 3 classes ===
def norm_label(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip().upper()
    if s in {"CONFIRMED", "CONFIRMED PLANET", "KP"}:
        return "CONFIRMED"
    if s in {"CANDIDATE", "PLANETARY CANDIDATE", "PC", "APC", "AMBIGUOUS PLANETARY CANDIDATE"}:
        return "CANDIDATE"
    if s in {"FALSE POSITIVE", "FP"}:
        return "FALSE POSITIVE"
    return np.nan

df[target_col] = df[target_col].apply(norm_label)
df = df.dropna(subset=[target_col]).reset_index(drop=True)

In [6]:
df.head()

Unnamed: 0,pl_name,hostname,default_flag,disposition,disp_refname,sy_snum,sy_pnum,discoverymethod,disc_year,disc_facility,...,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2,rowupdate,pl_pubdate,releasedate
0,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2018-03,2/15/2018
1,BD+20 594 b,BD+20 594,0,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2016-10,7/28/2016
2,BD+20 594 b,BD+20 594,1,CONFIRMED,Espinoza et al. 2016,1,1,Transit,2016,K2,...,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249,4/25/2018,2017-03,4/26/2018
3,EPIC 201111557.01,EPIC 201111557,1,CANDIDATE,Livingston et al. 2018,1,0,Transit,2018,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,8/2/2018,2018-08,8/2/2018
4,EPIC 201111557.01,EPIC 201111557,0,CANDIDATE,Livingston et al. 2018,1,0,Transit,2018,K2,...,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307,2/15/2018,2018-03,2/15/2018


In [7]:
# === 4. Drop obvious text-heavy / ID columns (keep it minimal & safe) ===
drop_cols = [
    "pl_name","hostname","disp_refname","discoverymethod","disc_facility","soltype",
    "pl_refname","st_refname","sy_refname","rastr","decstr","rowupdate",
    "pl_pubdate","releasedate","default_flag","pl_bmassprov"
]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

In [8]:
df.head()

Unnamed: 0,disposition,sy_snum,sy_pnum,disc_year,pl_controv_flag,pl_orbper,pl_orbpererr1,pl_orbpererr2,pl_orbperlim,pl_orbsmax,...,sy_disterr2,sy_vmag,sy_vmagerr1,sy_vmagerr2,sy_kmag,sy_kmagerr1,sy_kmagerr2,sy_gaiamag,sy_gaiamagerr1,sy_gaiamagerr2
0,CONFIRMED,1,1,2016,0,41.688644,0.003353,-0.003419,0.0,,...,-1.24,10.849,0.012,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249
1,CONFIRMED,1,1,2016,0,41.6855,0.003,-0.0031,0.0,0.241,...,-1.24,10.849,0.012,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249
2,CONFIRMED,1,1,2016,0,41.6855,0.003,-0.003,0.0,,...,-1.24,10.849,0.012,-0.012,9.368,0.018,-0.018,10.8644,0.000249,-0.000249
3,CANDIDATE,1,0,2018,0,2.30183,0.00028,-0.0003,0.0,,...,-0.4598,11.727,0.046,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307
4,CANDIDATE,1,0,2018,0,2.302368,0.000105,-0.000103,0.0,,...,-0.4598,11.727,0.046,-0.046,9.22,0.019,-0.019,11.3995,0.001307,-0.001307


In [None]:
# === 5. Build X, y (numeric only) ===
y_raw = df[target_col]
X = df.drop(columns=[target_col], errors="ignore").select_dtypes(include=[np.number])

# Impute missing numerics with median 
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y_raw)

In [10]:
# === 6. Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [14]:
# === 7. Train LightGBM (multiclass) ===
model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=len(le.classes_),
    learning_rate=0.03,
    n_estimators=800,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10896
[LightGBM] [Info] Number of data points in the train set: 3185, number of used features: 63
[LightGBM] [Info] Start training from score -1.064052
[LightGBM] [Info] Start training from score -0.542186
[LightGBM] [Info] Start training from score -2.610886


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.03
,n_estimators,800
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
# === 8. Evaluate ===
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

try:
    y_prob = model.predict_proba(X_test)
    print("\nROC-AUC (macro):", roc_auc_score(y_test, y_prob, multi_class="ovr"))
except Exception:
    pass

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
                precision    recall  f1-score   support

     CANDIDATE       0.99      0.98      0.98       275
     CONFIRMED       0.99      1.00      1.00       463
FALSE POSITIVE       0.95      0.93      0.94        59

      accuracy                           0.99       797
     macro avg       0.98      0.97      0.97       797
  weighted avg       0.99      0.99      0.99       797


ROC-AUC (macro): 0.9978538705785605

Confusion Matrix:
[[269   3   3]
 [  0 463   0]
 [  4   0  55]]


In [None]:
# === 9. Save the Model ===
artifact = "k2_lightgbm.pkl"
joblib.dump(
    (model, le, X.columns.tolist(), imputer),
    artifact
)
print(f"\n✅ Model trained and saved as '{artifact}'")


✅ Model trained and saved as 'k2_lightgbm.pkl'
