In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score, roc_auc_score, accuracy_score, f1_score, precision_score
from sklearn.utils import compute_sample_weight
import numpy as np

In [34]:
pd.set_option('display.max_columns',None)

df = pd.read_csv("datasets/KOI Table (Cumulative list).csv", skiprows=53)

print(df.head(2))

      kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   
1  10797460  K00752.02  Kepler-227 c       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  \
0      1.000              0              0              0              0   
1      0.969              0              0              0              0   

   koi_period  koi_period_err1  koi_period_err2  koi_time0bk  \
0    9.488036         0.000028        -0.000028    170.53875   
1   54.418383         0.000248        -0.000248    162.51384   

   koi_time0bk_err1  koi_time0bk_err2  koi_impact  koi_impact_err1  \
0           0.00216          -0.00216       0.146            0.318   
1           0.00352          -0.00352       0.586            0.059   

   koi_impact_err2  koi_duration  koi_duration_err1  koi_duration_err2  \
0           -0.146        2.9575             0.0819            -0.0819   

In [36]:
feature_columns = [
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_time0bk', 'koi_duration','koi_depth', 'koi_model_snr',
    'koi_prad', 'koi_teq', 'koi_insol', 'koi_steff', 'koi_slogg', 'koi_srad'
]

target_column = 'koi_disposition'



In [38]:
df_model = df[[target_column]+feature_columns].copy()

In [40]:
df_model['is_planet'] = df_model[target_column].apply(
    lambda x:1 if x in ['CONFIRMED', 'CANDIDATE'] else 0
)

print(df_model['is_planet'].value_counts())

df_model.drop(columns=[target_column], inplace=True)

is_planet
0    4839
1    4725
Name: count, dtype: int64


In [42]:
print("\n--- Handling Missing Values ---")

for col in feature_columns:
    if df_model[col].dtype != 'object':
        median_val = df_model[col].median()
        df_model[col].fillna(median_val, inplace=True)
        print(f"Filled NaNs in {col}")



--- Handling Missing Values ---
Filled NaNs in koi_fpflag_nt
Filled NaNs in koi_fpflag_ss
Filled NaNs in koi_fpflag_co
Filled NaNs in koi_fpflag_ec
Filled NaNs in koi_period
Filled NaNs in koi_time0bk
Filled NaNs in koi_duration
Filled NaNs in koi_depth
Filled NaNs in koi_model_snr
Filled NaNs in koi_prad
Filled NaNs in koi_teq
Filled NaNs in koi_insol
Filled NaNs in koi_steff
Filled NaNs in koi_slogg
Filled NaNs in koi_srad


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model[col].fillna(median_val, inplace=True)


In [44]:
X = df_model.drop(columns=['is_planet'])
Y = df_model['is_planet']

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


X_train shape: (7651, 15)
X_test shape: (1913, 15)


In [48]:

sample_weights = compute_sample_weight(class_weight='balanced', y=Y_train)

model = GradientBoostingClassifier(
    n_estimators=100,      
    learning_rate=0.1,     
    max_depth=3,           
    random_state=42
)

print("\n--- Training Gradient Boosting Model... ---")
model.fit(X_train, Y_train, sample_weight=sample_weights)
print("Training Complete.")


Y_pred_proba = model.predict_proba(X_test)[:, 1]
Y_pred = model.predict(X_test)

auc = roc_auc_score(Y_test, Y_pred_proba)
recall = recall_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)

print("\n--- Model Evaluation (Kepler KOI Data) ---")
print(f"Target: is_planet (1=Candidate/Confirmed, 0=False Positive)")
print(f"Total Test Samples: {len(Y_test)}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall (Planet Recovery Rate): {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {auc:.4f}")


feature_importance = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
top_5_features = feature_importance.head(5)

print("\n--- Top 5 Most Important Features for Prediction ---")
print(top_5_features.to_markdown(numalign='left', stralign='left'))


--- Training Gradient Boosting Model... ---
Training Complete.

--- Model Evaluation (Kepler KOI Data) ---
Target: is_planet (1=Candidate/Confirmed, 0=False Positive)
Total Test Samples: 1913
Accuracy: 0.9901
Recall (Planet Recovery Rate): 0.9989
Precision: 0.9813
F1 Score: 0.9900
ROC AUC: 0.9983

--- Top 5 Most Important Features for Prediction ---
|               | 0          |
|:--------------|:-----------|
| koi_fpflag_nt | 0.341227   |
| koi_fpflag_ss | 0.30497    |
| koi_fpflag_co | 0.293347   |
| koi_fpflag_ec | 0.0442553  |
| koi_prad      | 0.00399032 |


In [49]:
sample_data_values = {
    # Vetting Flags (All clean)
    'koi_fpflag_nt': 0.0,
    'koi_fpflag_ss': 0.0,
    'koi_fpflag_co': 0.0,
    'koi_fpflag_ec': 0.0,
    
    # Confirmed Transit Data
    'koi_period': 2.470613377,
    'koi_duration': 1.74319,
    'koi_depth': 14231.0,
    'koi_model_snr': 4304.3,
    'koi_prad': 13.04,
    
    # Imputed/Contextual Values (for a full feature set)
    'koi_time0bk': 132.898099, # Median value used in training imputation
    'koi_teq': 643.0,          # Median value used in training imputation
    'koi_insol': 43.16,        # Median value used in training imputation
    'koi_steff': 5732.0,       # Median value used in training imputation
    'koi_slogg': 4.437,        # Median value used in training imputation
    'koi_srad': 0.964,
}


In [50]:
df_sample = pd.DataFrame([sample_data_values], columns=feature_columns)

In [51]:
# Predict the probability (P=1 is Planet)
prediction_proba = model.predict_proba(df_sample)[:, 1][0]

# Predict the final class (1 or 0)
prediction_class = model.predict(df_sample)[0]

# --- 5. Output Results ---

if prediction_class == 1:
    print("✅ PREDICTION: PLANET CANDIDATE")
else:
    print("❌ PREDICTION: FALSE POSITIVE")

print(f"Confidence Score (Probability): {prediction_proba:.4f}")

✅ PREDICTION: PLANET CANDIDATE
Confidence Score (Probability): 0.9967


In [60]:
df

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_teq_err1,koi_teq_err2,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_tce_delivname,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,9.488036,2.775000e-05,-2.775000e-05,170.538750,0.002160,-0.002160,0.146,0.318,-0.146,2.95750,0.08190,-0.08190,615.8,19.5,-19.5,2.26,0.26,-0.15,793.0,,,93.59,29.45,-16.65,35.8,1.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,54.418383,2.479000e-04,-2.479000e-04,162.513840,0.003520,-0.003520,0.586,0.059,-0.443,4.50700,0.11600,-0.11600,874.8,35.5,-35.5,2.83,0.32,-0.19,443.0,,,9.11,2.87,-1.62,25.8,2.0,q1_q17_dr25_tce,5455.0,81.0,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.000,0,0,0,0,19.899140,1.494000e-05,-1.494000e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,1.78220,0.03410,-0.03410,10829.0,171.0,-171.0,14.60,3.92,-1.31,638.0,,,39.30,31.04,-10.49,76.3,1.0,q1_q17_dr25_tce,5853.0,158.0,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,0,1.736952,2.630000e-07,-2.630000e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,2.40641,0.00537,-0.00537,8079.2,12.8,-12.8,33.46,8.50,-2.83,1395.0,,,891.96,668.95,-230.35,505.6,1.0,q1_q17_dr25_tce,5805.0,157.0,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,2.525592,3.761000e-06,-3.761000e-06,171.595550,0.001130,-0.001130,0.701,0.235,-0.478,1.65450,0.04200,-0.04200,603.3,16.9,-16.9,2.75,0.88,-0.35,1406.0,,,926.16,874.33,-314.24,40.9,1.0,q1_q17_dr25_tce,6031.0,169.0,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,0,0.527699,1.160000e-07,-1.160000e-07,131.705093,0.000170,-0.000170,1.252,0.051,-0.049,3.22210,0.01740,-0.01740,1579.2,4.6,-4.6,29.35,7.70,-2.57,2088.0,,,4500.53,3406.38,-1175.26,453.3,1.0,q1_q17_dr25_tce,5638.0,139.0,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9560,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,0,1.739849,1.780000e-05,-1.780000e-05,133.001270,0.007690,-0.007690,0.043,0.423,-0.043,3.11400,0.22900,-0.22900,48.5,5.4,-5.4,0.72,0.24,-0.08,1608.0,,,1585.81,1537.86,-502.22,10.6,1.0,q1_q17_dr25_tce,6119.0,165.0,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9561,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,0,0.681402,2.434000e-06,-2.434000e-06,132.181750,0.002850,-0.002850,0.147,0.309,-0.147,0.86500,0.16200,-0.16200,103.6,14.7,-14.7,1.07,0.36,-0.11,2218.0,,,5713.41,5675.74,-1836.94,12.3,1.0,q1_q17_dr25_tce,6173.0,193.0,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385
9562,10155286,K07988.01,,CANDIDATE,CANDIDATE,0.092,0,0,0,0,333.486169,4.235000e-03,-4.235000e-03,153.615010,0.005070,-0.005070,0.214,0.255,-0.214,3.19900,0.22900,-0.22900,639.1,52.7,-52.7,19.30,0.55,-4.68,557.0,,,22.68,2.07,-10.95,14.0,1.0,q1_q17_dr25_tce,4989.0,39.0,-128.0,2.992,0.030,-0.027,7.824,0.223,-1.896,296.76288,47.145142,10.998


In [65]:
import pickle
import pandas as pd

model_filename = 'exoplanet_gbt_model.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

print(f"Model saved successfully as: {model_filename}")

Model saved successfully as: exoplanet_gbt_model.pkl


In [67]:

feature_order = X_train.columns.tolist()


median_values = df_model[feature_order].median().to_dict()

metadata = {
    'feature_order': feature_order,
    'median_values': median_values
}

metadata_filename = 'model_metadata.pkl'
with open(metadata_filename, 'wb') as file:
    pickle.dump(metadata, file)

print(f"Metadata (Feature Order & Imputation) saved successfully as: {metadata_filename}")

Metadata (Feature Order & Imputation) saved successfully as: model_metadata.pkl


In [69]:
import pickle
import pandas as pd


with open('exoplanet_gbt_model.pkl', 'rb') as file:
    LOADED_MODEL = pickle.load(file)

with open('model_metadata.pkl', 'rb') as file:
    METADATA = pickle.load(file)

FEATURE_ORDER = METADATA['feature_order']
MEDIAN_VALUES = METADATA['median_values']



def predict_exoplanet_status(input_data: dict):
    """
    Takes raw user input features, processes them, and returns prediction results.
    :param input_data: Dictionary of user inputs (e.g., {'koi_period': 2.47, ...})
    """
    df_predict = pd.DataFrame([input_data])
    
    df_predict = df_predict.reindex(columns=FEATURE_ORDER)
    df_predict.fillna(MEDIAN_VALUES, inplace=True)
    
    proba = LOADED_MODEL.predict_proba(df_predict)[0, 1]
    prediction = LOADED_MODEL.predict(df_predict)[0]
    
    feature_importances = pd.Series(
        LOADED_MODEL.feature_importances_, 
        index=FEATURE_ORDER
    ).sort_values(ascending=False).head(5)
    
    return {
        'prediction_class': int(prediction),
        'probability': proba,
        'feature_importance_rationale': feature_importances.to_dict()
    }
