<a href="https://colab.research.google.com/github/SAYEM088/exoai_model_creation/blob/main/ExoAi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI?table=cumulative&format=csv"
df = pd.read_csv(url)

In [4]:
df

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra_str,dec_str,koi_kepmag,koi_kepmag_err
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,4.467,0.064,-0.096,0.927,0.105,-0.061,19h27m44.22s,+48d08m29.9s,15.347,
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.000,0,0,0,0,...,4.544,0.044,-0.176,0.868,0.233,-0.078,19h48m01.16s,+48d08m02.9s,15.436,
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,0,...,4.564,0.053,-0.168,0.791,0.201,-0.067,19h02m08.31s,+48d17m06.8s,15.597,
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,4.438,0.070,-0.210,1.046,0.334,-0.133,19h15m01.17s,+48d13m34.3s,15.509,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,0,...,4.529,0.035,-0.196,0.903,0.237,-0.079,19h48m45.30s,+47d05m37.7s,14.082,
9560,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,0,...,4.444,0.056,-0.224,1.031,0.341,-0.114,19h06m02.25s,+47d09m47.6s,14.757,
9561,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,0,...,4.447,0.056,-0.224,1.041,0.341,-0.114,19h36m39.57s,+47d10m34.6s,15.385,
9562,10155286,K07988.01,,CANDIDATE,CANDIDATE,0.092,0,0,0,0,...,2.992,0.030,-0.027,7.824,0.223,-1.896,19h47m03.09s,+47d08m42.5s,10.998,


In [5]:
nan_counts = df.isna().sum()
nan_counts = nan_counts[nan_counts > 0]
print(nan_counts)

kepler_name          6817
koi_score            1510
koi_period_err1       454
koi_period_err2       454
koi_time0bk_err1      454
koi_time0bk_err2      454
koi_impact            363
koi_impact_err1       454
koi_impact_err2       454
koi_duration_err1     454
koi_duration_err2     454
koi_depth             363
koi_depth_err1        454
koi_depth_err2        454
koi_prad              363
koi_prad_err1         363
koi_prad_err2         363
koi_teq               363
koi_teq_err1         9564
koi_teq_err2         9564
koi_insol             321
koi_insol_err1        321
koi_insol_err2        321
koi_model_snr         363
koi_tce_plnt_num      346
koi_tce_delivname     346
koi_steff             363
koi_steff_err1        468
koi_steff_err2        483
koi_slogg             363
koi_slogg_err1        468
koi_slogg_err2        468
koi_srad              363
koi_srad_err1         468
koi_srad_err2         468
koi_kepmag              1
koi_kepmag_err       9564
dtype: int64


In [7]:
all_columns = df.columns.tolist()
for col in all_columns:
  print(col)

kepid
kepoi_name
kepler_name
koi_disposition
koi_pdisposition
koi_score
koi_fpflag_nt
koi_fpflag_ss
koi_fpflag_co
koi_fpflag_ec
koi_period
koi_period_err1
koi_period_err2
koi_time0bk
koi_time0bk_err1
koi_time0bk_err2
koi_impact
koi_impact_err1
koi_impact_err2
koi_duration
koi_duration_err1
koi_duration_err2
koi_depth
koi_depth_err1
koi_depth_err2
koi_prad
koi_prad_err1
koi_prad_err2
koi_teq
koi_teq_err1
koi_teq_err2
koi_insol
koi_insol_err1
koi_insol_err2
koi_model_snr
koi_tce_plnt_num
koi_tce_delivname
koi_steff
koi_steff_err1
koi_steff_err2
koi_slogg
koi_slogg_err1
koi_slogg_err2
koi_srad
koi_srad_err1
koi_srad_err2
ra_str
dec_str
koi_kepmag
koi_kepmag_err


In [9]:
y = df['koi_disposition']


In [10]:
features = [
    'koi_period', 'koi_duration', 'koi_depth', 'koi_prad',
    'koi_teq', 'koi_insol', 'koi_model_snr', 'koi_score',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_steff', 'koi_slogg', 'koi_srad'
]
X = df[features]

In [11]:
X = X.fillna(-1)

In [12]:
print(X,y)

      koi_period  koi_duration  koi_depth  koi_prad  koi_teq  koi_insol  \
0       9.488036       2.95750      615.8      2.26    793.0      93.59   
1      54.418383       4.50700      874.8      2.83    443.0       9.11   
2      19.899140       1.78220    10829.0     14.60    638.0      39.30   
3       1.736952       2.40641     8079.2     33.46   1395.0     891.96   
4       2.525592       1.65450      603.3      2.75   1406.0     926.16   
...          ...           ...        ...       ...      ...        ...   
9559    0.527699       3.22210     1579.2     29.35   2088.0    4500.53   
9560    1.739849       3.11400       48.5      0.72   1608.0    1585.81   
9561    0.681402       0.86500      103.6      1.07   2218.0    5713.41   
9562  333.486169       3.19900      639.1     19.30    557.0      22.68   
9563    4.856035       3.07800       76.7      1.05   1266.0     607.42   

      koi_model_snr  koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  \
0              35.8 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

     CANDIDATE       0.80      0.76      0.78       405
     CONFIRMED       0.86      0.89      0.87       569
FALSE POSITIVE       0.98      0.99      0.98       939

      accuracy                           0.91      1913
     macro avg       0.88      0.88      0.88      1913
  weighted avg       0.91      0.91      0.91      1913

