In [149]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [150]:
df1 = pd.read_csv("kepler_dataset.csv")

In [151]:
df1.head()

Unnamed: 0,DispositionScore,OrbitalPeriod_days,OrbitalPeriodUpperUnc_days,OrbitalPeriodLowerUnc_days,TransitEpoch_BKJD,TransitEpochUpperUnc_BKJD,TransitEpochLowerUnc_BKJD,ImpactParamete,ImpactParameterUpperUnc,ImpactParameterLowerUnc,...,StellarSurfaceGravityUpperUnc_log10(cm/s**2),StellarSurfaceGravityLowerUnc_log10(cm/s**2),StellarRadius_Solarradii,StellarRadiusUpperUnc_Solarradii,StellarRadiusLowerUnc_Solarradii,RA_decimaldegrees,Dec_decimaldegrees,Kepler-band_mag,ExoplanetCandidate,ExoplanetConfirmed
0,1.0,9.488036,2.78e-05,-2.78e-05,170.53875,0.00216,-0.00216,0.146,0.318,-0.146,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,2
1,0.969,54.418383,0.000248,-0.000248,162.51384,0.00352,-0.00352,0.586,0.059,-0.443,...,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347,1,2
2,0.0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,-0.000581,0.969,5.126,-0.077,...,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436,1,1
3,0.0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,-0.000115,1.276,0.115,-0.092,...,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597,0,0
4,1.0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,-0.00113,0.701,0.235,-0.478,...,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509,1,2


In [152]:
df1.shape

(5599, 39)

In [153]:
df1.isnull().sum()

DispositionScore                                0
OrbitalPeriod_days                              0
OrbitalPeriodUpperUnc_days                      0
OrbitalPeriodLowerUnc_days                      0
TransitEpoch_BKJD                               0
TransitEpochUpperUnc_BKJD                       0
TransitEpochLowerUnc_BKJD                       0
ImpactParamete                                  0
ImpactParameterUpperUnc                         0
ImpactParameterLowerUnc                         0
TransitDuration_hrs                             0
TransitDurationUpperUnc_hrs                     0
TransitDurationLowerUnc_hrs                     0
TransitDepth_ppm                                0
TransitDepthUpperUnc_ppm                        0
TransitDepthLowerUnc_ppm                        0
PlanetaryRadius_Earthradii                      0
PlanetaryRadiusUpperUnc_Earthradii              0
PlanetaryRadiusLowerUnc_Earthradii              0
EquilibriumTemperatureK                         0


##### Dataset is clean

In [154]:
X_temp = df1.drop(['ExoplanetCandidate','ExoplanetConfirmed'],axis='columns')
Y1_temp = df1['ExoplanetCandidate']
Y2_temp = df1['ExoplanetConfirmed']

In [155]:
Y = Y1_temp.to_numpy()
len(Y)

5599

In [156]:
X = X_temp.to_numpy()
X.shape

(5599, 37)

In [157]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=37)

In [158]:
X_train.shape

(4479, 37)

### Neural Network to train the model

In [159]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.activations import relu,linear
from tensorflow.keras.regularizers import l2

##### Trying out a simple Neural Network

In [160]:
tf.random.set_seed(37)
my_model = Sequential(
    [
        Input(shape=(37,)),
        Dense(units=25, activation='relu'),
        Dense(units=15,activation='relu'),
        Dense(units=5,activation='relu'),
        Dense(units=1,activation='sigmoid')
    ]
)

my_model.compile(
    loss=BinaryCrossentropy(),
    optimizer=Adam(learning_rate=0.001)
)

In [161]:
my_model.summary()

In [162]:
# X_train
Y_train

array([0, 1, 1, ..., 1, 0, 0], dtype=int64)

In [163]:
my_model.fit(
    X_train,Y_train,
    epochs = 100)

Epoch 1/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 439us/step - loss: 43.9390
Epoch 2/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - loss: 0.7826
Epoch 3/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374us/step - loss: 0.6729
Epoch 4/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step - loss: 0.6636
Epoch 5/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403us/step - loss: 0.6580
Epoch 6/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - loss: 0.6548
Epoch 7/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401us/step - loss: 0.6522
Epoch 8/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step - loss: 0.6504
Epoch 9/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step - loss: 0.6490
Epoch 10/100
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x26ed6f98b90>

In [166]:
y_pred = my_model.predict(X_train)
y_pred

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 288us/step


array([[2.4574232e-10],
       [9.4486350e-01],
       [7.7188182e-01],
       ...,
       [9.4486350e-01],
       [8.6180228e-01],
       [1.6891377e-01]], dtype=float32)

In [167]:
y_hat = (y_pred >= 0.5).astype(int)
y_hat

array([[0],
       [1],
       [1],
       ...,
       [1],
       [1],
       [0]])

In [168]:
accuracy = np.mean(y_hat==Y_train)
accuracy*100

53.94671300032734

In [169]:
y_pred1=my_model.predict(X_test)
y_hat1 = (y_pred1 >= 0.5).astype(int)
acc = np.mean(y_hat1==Y_test)
acc*100

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323us/step


54.32780612244898

#### High bias problem!!!

##### Let's try incresing complexity of NN

## XG Boost: Tree Ensemble

In [170]:
from xgboost import XGBClassifier

In [171]:
X_train, X_, Y_train, Y_ = train_test_split(X,Y,test_size=0.4,random_state=37)
X_test, X_eval , Y_test, Y_eval = train_test_split(X_,Y_,test_size=0.5,random_state=37)

In [172]:
xgb_model = XGBClassifier(n_estimators = 500, learning_rate = 0.1,verbosity = 1, random_state = 7, early_stopping_rounds = 10)
xgb_model.fit(X_train,Y_train, eval_set = [(X_eval,Y_eval)])

[0]	validation_0-logloss:0.57812
[1]	validation_0-logloss:0.51192
[2]	validation_0-logloss:0.45872
[3]	validation_0-logloss:0.41348
[4]	validation_0-logloss:0.37578
[5]	validation_0-logloss:0.34297
[6]	validation_0-logloss:0.31412
[7]	validation_0-logloss:0.28992
[8]	validation_0-logloss:0.26921
[9]	validation_0-logloss:0.25057
[10]	validation_0-logloss:0.23440
[11]	validation_0-logloss:0.21994
[12]	validation_0-logloss:0.20716
[13]	validation_0-logloss:0.19561
[14]	validation_0-logloss:0.18594
[15]	validation_0-logloss:0.17742
[16]	validation_0-logloss:0.16989
[17]	validation_0-logloss:0.16332
[18]	validation_0-logloss:0.15774
[19]	validation_0-logloss:0.15245
[20]	validation_0-logloss:0.14811
[21]	validation_0-logloss:0.14346
[22]	validation_0-logloss:0.13953
[23]	validation_0-logloss:0.13596
[24]	validation_0-logloss:0.13307
[25]	validation_0-logloss:0.13051
[26]	validation_0-logloss:0.12802
[27]	validation_0-logloss:0.12596
[28]	validation_0-logloss:0.12392
[29]	validation_0-loglos

In [173]:
xgb_model.best_iteration

55

In [174]:
y1_pred_tree = xgb_model.predict(X_train)
y2_pred_tree = xgb_model.predict(X_eval)
y3_pred_tree = xgb_model.predict(X_test)

In [175]:
acc1 = np.mean(Y_train==y1_pred_tree)
acc2 = np.mean(Y_eval==y2_pred_tree)
acc3 = np.mean(Y_test==y3_pred_tree)

In [176]:
print(acc1*100," ",acc2*100," ",acc3*100)

99.49389699315272   96.51785714285714   96.07142857142857


#### XGBoost performs great in our dataset... finding aggregate score of the model using K-fold method

In [177]:
from sklearn.model_selection import KFold

In [178]:
def get_score(X,Y):
    kf = KFold(n_splits=4)
    '''
    uses K-fold method to find average score of the model over 3 folds. 
    '''
    score = 0
    train_score = 0
    for train_index,test_index in kf.split(X):
        #75% train, 12.5% eval, 12.5% test
        X_train, X_, Y_train, Y_ = X[train_index], X[test_index], Y[train_index], Y[test_index]
        X_test, X_eval, Y_test, Y_eval = train_test_split(X_,Y_,test_size=0.5,random_state=37)
        xgb_model = XGBClassifier(n_estimators = 500, learning_rate = 0.1,verbosity = 1, random_state = 37, early_stopping_rounds = 10)
        xgb_model.fit(X_train,Y_train, eval_set = [(X_eval,Y_eval)])
        prediction = xgb_model.predict(X_test)
        prediction2 = xgb_model.predict(X_train)
        accuracy = np.mean(prediction==Y_test) * 100
        accuracy2 = np.mean(prediction2==Y_train) * 100
        score += accuracy
        train_score+= accuracy2
    score = score/4
    train_score = train_score/4
    return score, train_score

In [179]:
score, train_score = get_score(X,Y)

[0]	validation_0-logloss:0.55967
[1]	validation_0-logloss:0.49817
[2]	validation_0-logloss:0.44734
[3]	validation_0-logloss:0.40391
[4]	validation_0-logloss:0.36761
[5]	validation_0-logloss:0.33604
[6]	validation_0-logloss:0.30869
[7]	validation_0-logloss:0.28487
[8]	validation_0-logloss:0.26472
[9]	validation_0-logloss:0.24757
[10]	validation_0-logloss:0.23196
[11]	validation_0-logloss:0.21836
[12]	validation_0-logloss:0.20570
[13]	validation_0-logloss:0.19445
[14]	validation_0-logloss:0.18518
[15]	validation_0-logloss:0.17645
[16]	validation_0-logloss:0.16858
[17]	validation_0-logloss:0.16168
[18]	validation_0-logloss:0.15459
[19]	validation_0-logloss:0.14939
[20]	validation_0-logloss:0.14478
[21]	validation_0-logloss:0.14128
[22]	validation_0-logloss:0.13711
[23]	validation_0-logloss:0.13420
[24]	validation_0-logloss:0.13084
[25]	validation_0-logloss:0.12787
[26]	validation_0-logloss:0.12463
[27]	validation_0-logloss:0.12184
[28]	validation_0-logloss:0.11991
[29]	validation_0-loglos

In [180]:
print("Average Score of model in 4 fold test in train set: ",train_score)

Average Score of model in 4 fold test in train set:  99.2974928837932


In [181]:
print("Average Score of model in 4 fold test in test set: ",score)

Average Score of model in 4 fold test in test set:  96.60606989576947


### Yayy!! 96% accuracy on test_set and 99% in training set