In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
import joblib

In [None]:
# === 1. Load Dataset ===
df = pd.read_csv(r"D:\pro1\10th project NASA Hackathon 2025 - Exoplanet Detection\datasets\Tess-data.csv")

In [20]:
df.head()

Unnamed: 0,toi,tid,tfopwg_disp,rastr,ra,decstr,dec,st_pmra,st_pmraerr1,st_pmraerr2,...,st_logg,st_loggerr1,st_loggerr2,st_logglim,st_rad,st_raderr1,st_raderr2,st_radlim,toi_created,rowupdate
0,1000.01,50365310,FP,07h29m25.85s,112.357708,-12d41m45.46s,-12.69596,-5.964,0.085,-0.085,...,4.19,0.07,-0.07,0,2.16986,0.072573,-0.072573,0,7/24/2019 15:58,9/9/2024 10:08
1,1001.01,88863718,PC,08h10m19.31s,122.580465,-05d30m49.87s,-5.513852,-4.956,0.102,-0.102,...,4.03,0.09,-0.09,0,2.01,0.09,-0.09,0,7/24/2019 15:58,4/3/2023 14:31
2,1002.01,124709665,FP,06h58m54.47s,104.726966,-10d34m49.64s,-10.580455,-1.462,0.206,-0.206,...,,,,0,5.73,,,0,7/24/2019 15:58,7/11/2022 16:02
3,1003.01,106997505,FP,07h22m14.39s,110.559945,-25d12m25.26s,-25.207017,-0.939,0.041,-0.041,...,4.15,1.64,-1.64,0,,,,0,7/24/2019 15:58,2/23/2022 10:10
4,1004.01,238597883,FP,08h08m42.77s,122.178195,-48d48m10.12s,-48.802811,-4.496,0.069,-0.069,...,4.14,0.07,-0.07,0,2.15,0.06,-0.06,0,7/24/2019 15:58,9/9/2024 10:08


In [21]:
# === 2) Target column autodetect ===
possible_targets = ["tfopwg_disp", "TFOPWG Disposition", "TFOPWG_DISP", "disposition"]
target_col = next((c for c in possible_targets if c in df.columns), None)
if target_col is None:
    raise ValueError(f"Target column not found. Tried: {possible_targets}")

In [22]:
# === 3) Normalize labels to 3 classes ===
def norm_label(x):
    if pd.isna(x): return np.nan
    s = str(x).strip().upper()
    if s in {"CONFIRMED", "KP", "KNOWN PLANET"}: return "CONFIRMED"
    if s in {"PC","PLANET CANDIDATE","CANDIDATE","APC","AMBIGUOUS PLANETARY CANDIDATE"}: return "CANDIDATE"
    if s in {"FALSE POSITIVE","FP"}: return "FALSE POSITIVE"
    return np.nan

df[target_col] = df[target_col].apply(norm_label)
df = df.dropna(subset=[target_col]).reset_index(drop=True)

In [23]:
df.head()

Unnamed: 0,toi,tid,tfopwg_disp,rastr,ra,decstr,dec,st_pmra,st_pmraerr1,st_pmraerr2,...,st_logg,st_loggerr1,st_loggerr2,st_logglim,st_rad,st_raderr1,st_raderr2,st_radlim,toi_created,rowupdate
0,1000.01,50365310,FALSE POSITIVE,07h29m25.85s,112.357708,-12d41m45.46s,-12.69596,-5.964,0.085,-0.085,...,4.19,0.07,-0.07,0,2.16986,0.072573,-0.072573,0,7/24/2019 15:58,9/9/2024 10:08
1,1001.01,88863718,CANDIDATE,08h10m19.31s,122.580465,-05d30m49.87s,-5.513852,-4.956,0.102,-0.102,...,4.03,0.09,-0.09,0,2.01,0.09,-0.09,0,7/24/2019 15:58,4/3/2023 14:31
2,1002.01,124709665,FALSE POSITIVE,06h58m54.47s,104.726966,-10d34m49.64s,-10.580455,-1.462,0.206,-0.206,...,,,,0,5.73,,,0,7/24/2019 15:58,7/11/2022 16:02
3,1003.01,106997505,FALSE POSITIVE,07h22m14.39s,110.559945,-25d12m25.26s,-25.207017,-0.939,0.041,-0.041,...,4.15,1.64,-1.64,0,,,,0,7/24/2019 15:58,2/23/2022 10:10
4,1004.01,238597883,FALSE POSITIVE,08h08m42.77s,122.178195,-48d48m10.12s,-48.802811,-4.496,0.069,-0.069,...,4.14,0.07,-0.07,0,2.15,0.06,-0.06,0,7/24/2019 15:58,9/9/2024 10:08


In [24]:
# === 4) Drop obvious text-heavy / IDs ===
drop_cols = ["toi","tid","rastr","decstr","rowupdate","toi_created"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

In [None]:
# === 5) Build X & y ===
y_raw = df[target_col]
X = df.drop(columns=[target_col], errors="ignore").select_dtypes(include=[np.number])

# De-dup columns & drop all-NaN columns to avoid shape errors later
X = X.loc[:, ~X.columns.duplicated()]
X = X.dropna(axis=1, how="all")


In [26]:
# === 6) Feature engineering ===
def safe_div(a, b):
    with np.errstate(divide="ignore", invalid="ignore"):
        out = a / b
        out[~np.isfinite(out)] = np.nan
        return out

def add_relerr(X, base, e1, e2):
    if base in X.columns and e1 in X.columns and e2 in X.columns:
        denom = X[base].abs().replace(0, np.nan)
        rel = 0.5 * (X[e1].abs() + X[e2].abs()) / denom
        X[f"relerr_{base}"] = rel

# 6a) Log transforms (tame skew)
for col in ["pl_orbper","pl_trandep","pl_rade","pl_insol","st_dist","st_teff","pl_eqt"]:
    if col in X.columns:
        X[f"log_{col}"] = np.log1p(X[col].clip(lower=0))

# 6b) Ratios / proxies
if {"pl_trandurh","pl_orbper"}.issubset(X.columns):
    X["dur_per_ratio"] = safe_div(X["pl_trandurh"], X["pl_orbper"])

if {"pl_rade","st_rad"}.issubset(X.columns):
    X["rade_over_st_rad"] = safe_div(X["pl_rade"], X["st_rad"])

if {"st_pmra","st_pmdec"}.issubset(X.columns):
    X["pm_total"] = np.sqrt(X["st_pmra"]**2 + X["st_pmdec"]**2)

if {"pl_insol","pl_eqt"}.issubset(X.columns):
    X["insol_eqt_ratio"] = safe_div(np.log1p(X["pl_insol"].clip(lower=0)), np.log1p(X["pl_eqt"].clip(lower=0)))

# crude star-density proxy ~ M / R^3 (unit-agnostic)
if {"st_mass","st_rad"}.issubset(X.columns):
    X["star_density_proxy"] = safe_div(X["st_mass"], (X["st_rad"]**3))

# flux ~ 10^(-0.4 * mag); scale transit depth by flux as a rough SNR proxy
if {"pl_trandep","st_tmag"}.issubset(X.columns):
    flux = np.power(10.0, -0.4 * X["st_tmag"].fillna(X["st_tmag"].median()))
    X["depth_flux_scaled"] = X["pl_trandep"].fillna(0) * flux

# 6c) Uncertainty features (relative errors)
add_relerr(X, "pl_orbper", "pl_orbpererr1", "pl_orbpererr2")
add_relerr(X, "pl_trandurh", "pl_trandurherr1", "pl_trandurherr2")
add_relerr(X, "pl_trandep", "pl_trandeperr1", "pl_trandeperr2")
add_relerr(X, "pl_rade", "pl_radeerr1", "pl_radeerr2")
add_relerr(X, "st_teff", "st_tefferr1", "st_tefferr2")
add_relerr(X, "st_dist", "st_disterr1", "st_disterr2")
add_relerr(X, "st_tmag", "st_tmagerr1", "st_tmagerr2")

# Freeze feature list BEFORE imputation
feat_names = X.columns.tolist()

In [27]:
# === 7) Impute ===
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=feat_names)

In [28]:
# === 8) Encode labels ===
le = LabelEncoder()
y = le.fit_transform(y_raw)

In [29]:
# === 9) Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [30]:
# === 10) Train LightGBM (balanced + mild regularization) ===
model = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=len(le.classes_),
    learning_rate=0.03,
    n_estimators=1200,
    num_leaves=63,
    max_depth=-1,
    min_child_samples=30,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    class_weight="balanced",
    random_state=42
)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14525
[LightGBM] [Info] Number of data points in the train set: 5536, number of used features: 59
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.03
,n_estimators,1200
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [31]:
# === 11) Evaluate ===
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

try:
    y_prob = model.predict_proba(X_test)
    print("\nROC-AUC (macro):", roc_auc_score(y_test, y_prob, multi_class="ovr"))
except Exception:
    pass

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Top features (quick view)
fi = pd.DataFrame({"feature": feat_names, "importance": model.feature_importances_}) \
       .sort_values("importance", ascending=False)
print("\nTop 25 features by importance:")
print(fi.head(25).to_string(index=False))


Classification Report:
                precision    recall  f1-score   support

     CANDIDATE       0.86      0.93      0.89      1029
     CONFIRMED       0.80      0.63      0.70       117
FALSE POSITIVE       0.68      0.51      0.58       239

      accuracy                           0.83      1385
     macro avg       0.78      0.69      0.73      1385
  weighted avg       0.82      0.83      0.82      1385


ROC-AUC (macro): 0.8839633706532285

Confusion Matrix:
[[956  16  57]
 [ 41  74   2]
 [113   3 123]]

Top 25 features by importance:
           feature  importance
               dec        9503
                ra        8943
        pl_tranmid        7613
       pl_trandurh        7044
           st_tmag        6227
  relerr_pl_orbper        6098
           st_pmra        6065
 depth_flux_scaled        6040
    relerr_st_tmag        5822
          st_pmdec        5787
 relerr_pl_trandep        5638
          pm_total        5524
     dur_per_ratio        5349
    relerr_st

In [32]:
# === 12) Save bundle ===
artifact = "tess_lightgbm.pkl"
joblib.dump((model, le, feat_names, imputer), artifact)
print(f"\n✅ Model trained and saved as '{artifact}'")


✅ Model trained and saved as 'tess_lightgbm.pkl'
