# NASA Space Apps Challenges 2025 - Hunting for Exoplanets using AI

## Dependencies

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Load Data

In [2]:
# Load the Kepler exoplanet dataset, skipping the first 53 rows of metadata (Commented lines)
kepler_data = pd.read_csv('./dataset/kepler_exoplanet_data.csv', skiprows=53)

## Data Cleaning and Pre-processing

In [3]:
# Drop the unnecessary columns like KEPID, KOI Name, Kepler Name, koi_tce_delivname.
kepler_data = kepler_data.drop(columns=['kepid', 'kepoi_name', 'kepler_name', 'koi_tce_delivname'])

In [4]:
# Convert the 'koi_disposition' column to categorical type
kepler_data['koi_disposition'] = kepler_data['koi_disposition'].astype('category')
categories = kepler_data['koi_disposition'].cat.categories

In [5]:
# Map the categories to numerical codes and replace the original column 0 -> 'CANDIDATE', 1 -> 'CONFIRMED', 2 -> 'FALSE POSITIVE' 
kepler_data['koi_disposition'] = kepler_data['koi_disposition'].cat.codes

In [6]:
# Convert 'koi_pdisposition' to Categorical type
kepler_data['koi_pdisposition'] = kepler_data['koi_pdisposition'].astype('category')
kepler_data['koi_pdisposition'] = kepler_data['koi_pdisposition'].cat.codes

In [7]:
# Check any character columns and convert them to categorical type
for column in kepler_data.select_dtypes(include=['object']).columns:
    print(f"Column '{column}' is of type 'object'")
    kepler_data[column] = kepler_data[column].astype('category')
    kepler_data[column] = kepler_data[column].cat.codes

In [8]:
# Identify columns with missing values and fill them with the median of the respective columns
# Fill all numeric columns' missing values with their median
kepler_data.fillna(
    kepler_data.median(numeric_only=True),
    inplace=True
)

In [9]:
# # Fill missing values in 'koi_prad' with the median value of the column
# kepler_data['koi_prad'].fillna(kepler_data['koi_prad'].median(), inplace=True)

In [10]:
# Identify skewed columns in kepler_data
# skew_values = kepler_data.skew(numeric_only=True)

In [11]:
# # List columns with high skewness
# skewed_columns = skew_values[abs(skew_values) > 1].index.tolist()
# # print("Highly skewed columns:", skewed_columns)

In [12]:
# # log-transform skewed features
# for column in skewed_columns:
#     # Apply RobustScaler to reduce the impact of outliers
#     kepler_data[column] = np.log1p(kepler_data[column])

# print(kepler_data.head())

In [13]:
data_to_scale = kepler_data.drop(columns=['koi_disposition', 'koi_pdisposition'])

# Identify columns containing -inf values
inf_columns = []
for column in data_to_scale.columns:
    if np.isneginf(data_to_scale[column]).any():
        inf_columns.append(column)
print("Columns containing -inf values:", inf_columns)

# Display rows for each column that contain -inf values
for column in inf_columns:
    inf_rows = data_to_scale[np.isneginf(data_to_scale[column])]
    print(f"Rows with -inf in column '{column}':")
    print(inf_rows[[column]])

Columns containing -inf values: []


## Scale/ Normalization of data

In [14]:
# Apply RobustScaler
scaler = RobustScaler()

scaled_data = scaler.fit_transform(data_to_scale)

  return fnb._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(


In [15]:
# Convert scaled_data (numpy array) back to DataFrame with original column names
scaled_df = pd.DataFrame(scaled_data, columns=data_to_scale.columns)

In [16]:
# Add back the label columns for ML Model training:
final_df = pd.concat([scaled_df, kepler_data[['koi_disposition', 'koi_pdisposition']].reset_index(drop=True)], axis=1)

## Split the data into features and labels

In [17]:
# Split the data into features and labels
X = final_df.drop("koi_disposition", axis=1)
y = final_df["koi_disposition"]

X = pd.get_dummies(X)

## Train - Test Split

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Initialize and Train Models

In [19]:
# Initialize and train the Random Forest Classifier
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42
)
random_forest_clf.fit(X_train, y_train)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Initialize and train the XGBoost Classifier
xgboost_clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,  # XGBoost default is 6, None means unlimited
    random_state=42,
    objective='multi:softmax',  # For multi-class classification
    eval_metric='mlogloss'  # Multi-class log loss
)
xgboost_clf.fit(X_train, y_train)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


## Predictions against test dataset

In [21]:
# Random Forest predictions aginst the test dataset
y_pred = random_forest_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9283847360167277
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       405
           1       0.88      0.89      0.88       569
           2       1.00      1.00      1.00       939

    accuracy                           0.93      1913
   macro avg       0.90      0.90      0.90      1913
weighted avg       0.93      0.93      0.93      1913



In [22]:
# XGboost predictions aginst the test dataset
y_pred = xgboost_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9362258233141663
              precision    recall  f1-score   support

           0       0.87      0.83      0.85       405
           1       0.88      0.91      0.89       569
           2       1.00      1.00      1.00       939

    accuracy                           0.94      1913
   macro avg       0.92      0.91      0.91      1913
weighted avg       0.94      0.94      0.94      1913



## Save the models

In [23]:
# Save the Random Forest Model using joblib
joblib.dump(random_forest_clf, './models/random_forest_exoplanet_model.joblib')

# Save the XGBoost Model using joblib
joblib.dump(xgboost_clf, './models/xgboost_exoplanet_model.joblib')

['./models/xgboost_exoplanet_model.joblib']

## Load the saved Models

In [24]:
# Load the saved models
random_forest_clf_loaded = joblib.load('./models/random_forest_exoplanet_model.joblib')
xgboost_clf_loaded = joblib.load('./models/xgboost_exoplanet_model.joblib')

## Prediction against new input

In [25]:
# Test the loaded model with a sample input
user_input = {
    'koi_score': 0.9,
    'koi_fpflag_nt': 1,
    'koi_fpflag_ss': 0,
    'koi_fpflag_co': 0,
    'koi_fpflag_ec': 0,
    'koi_period': 10.5,
    'koi_period_err1': 0.01,
    'koi_period_err2': -0.01,
    'koi_time0bk': 130.5,
    'koi_time0bk_err1': 0.1,
    'koi_time0bk_err2': -0.1,
    'koi_impact': 0.2,
    'koi_impact_err1': 0.01,
    'koi_impact_err2': -0.01,
    'koi_duration': 5.0,
    'koi_duration_err1': 0.2,
    'koi_duration_err2': -0.2,
    'koi_depth': 1500,
    'koi_depth_err1': 100,
    'koi_depth_err2': -100,
    'koi_prad': 1.2,
    'koi_prad_err1': 0.05,
    'koi_prad_err2': -0.05,
    'koi_teq': 500,
    'koi_teq_err1': 10,
    'koi_teq_err2': -10,
    'koi_insol': 1.1,
    'koi_insol_err1': 0.1,
    'koi_insol_err2': -0.1,
    'koi_model_snr': 15.0,
    'koi_tce_plnt_num': 1,
    'koi_steff': 5700,
    'koi_steff_err1': 50,
    'koi_steff_err2': -50,
    'koi_slogg': 4.4,
    'koi_slogg_err1': 0.1,
    'koi_slogg_err2': -0.1,
    'koi_srad': 1.0,
    'koi_srad_err1': 0.05,
    'koi_srad_err2': -0.05,
    'ra': 290.0,
    'dec': 44.5,
    'koi_kepmag': 14.0,
    'koi_pdisposition': 1  # If this was used as a feature
}

# Convert user input to DataFrame and ensure columns match training data
user_df = pd.DataFrame([user_input])

user_df = pd.get_dummies(user_df)
user_df = user_df.reindex(columns=X.columns, fill_value=0)

In [26]:
# Prediction using the loaded Random Forest model
prediction = random_forest_clf_loaded.predict(user_df)
print("Predicted koi_disposition code:", prediction[0])

Predicted koi_disposition code: 2


In [27]:
# Predict using the loaded XGBoost model
prediction = xgboost_clf_loaded.predict(user_df)
print("Predicted koi_disposition code:", prediction[0])

Predicted koi_disposition code: 2
