In [1]:
from numpy.random import seed
seed(1)
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# loading data into a dataframe

file_path = '../Resources/exoplanet_data.csv'

df = pd.read_csv(file_path)
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [3]:
# Setting features and target
target = df['koi_disposition']
selected_features = df.drop(['koi_disposition'], axis=1)

# Selected Feature Names
feature_names = selected_features.columns.tolist()

# looking at the shape of the variables
print(target.shape, selected_features.shape)

(6991,) (6991, 40)


In [4]:
# Splitting the data into a training set and testing set and scaling

X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Encoding the target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Converting the encoded target variable into a catagorical form
y_train_catagorical = to_categorical(encoded_y_train)
y_test_catagorical = to_categorical(encoded_y_test)

In [5]:
# Creating the deep model

deep_model = Sequential()
deep_model.add(Dense(units=10, activation='relu', input_dim=40))
deep_model.add(Dense(units=10, activation='relu'))
deep_model.add(Dense(units=3, activation='softmax'))

In [6]:
# Compiling the model

deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
deep_model.fit(X_train_scaled, y_train_catagorical, epochs=100, shuffle=True, verbose=2)

Epoch 1/100
164/164 - 0s - loss: 0.9352 - accuracy: 0.4946
Epoch 2/100
164/164 - 0s - loss: 0.6548 - accuracy: 0.6813
Epoch 3/100
164/164 - 0s - loss: 0.4864 - accuracy: 0.8011
Epoch 4/100
164/164 - 0s - loss: 0.4022 - accuracy: 0.8198
Epoch 5/100
164/164 - 0s - loss: 0.3753 - accuracy: 0.8198
Epoch 6/100
164/164 - 0s - loss: 0.3629 - accuracy: 0.8272
Epoch 7/100
164/164 - 0s - loss: 0.3554 - accuracy: 0.8272
Epoch 8/100
164/164 - 0s - loss: 0.3480 - accuracy: 0.8341
Epoch 9/100
164/164 - 0s - loss: 0.3444 - accuracy: 0.8274
Epoch 10/100
164/164 - 0s - loss: 0.3390 - accuracy: 0.8352
Epoch 11/100
164/164 - 0s - loss: 0.3360 - accuracy: 0.8360
Epoch 12/100
164/164 - 0s - loss: 0.3331 - accuracy: 0.8415
Epoch 13/100
164/164 - 0s - loss: 0.3295 - accuracy: 0.8432
Epoch 14/100
164/164 - 0s - loss: 0.3267 - accuracy: 0.8440
Epoch 15/100
164/164 - 1s - loss: 0.3253 - accuracy: 0.8425
Epoch 16/100
164/164 - 0s - loss: 0.3244 - accuracy: 0.8442
Epoch 17/100
164/164 - 0s - loss: 0.3222 - accura

<tensorflow.python.keras.callbacks.History at 0x7fc8b1103310>

In [7]:
# scoring the model

deep_model_loss, deep_model_accuracy = deep_model.evaluate(X_test_scaled, y_test_catagorical, verbose=2)
print(f"Normal Neural Network - Loss: {deep_model_loss}, Accuracy: {deep_model_accuracy}")

55/55 - 0s - loss: 0.3011 - accuracy: 0.8678
Normal Neural Network - Loss: 0.30113375186920166, Accuracy: 0.8678489923477173


In [8]:
# Creating a model with more neural layers

deep_model2 = Sequential()
deep_model2.add(Dense(units=10, activation='relu', input_dim=40))
deep_model2.add(Dense(units=10, activation='relu'))
deep_model2.add(Dense(units=10, activation='relu'))
deep_model2.add(Dense(units=10, activation='relu'))
deep_model2.add(Dense(units=10, activation='relu'))
deep_model2.add(Dense(units=3, activation='softmax'))


In [9]:
# Compiling the model with more layers

deep_model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
deep_model2.fit(X_train_scaled, y_train_catagorical, epochs=100, shuffle=True, verbose=2)

Epoch 1/100
164/164 - 0s - loss: 0.9224 - accuracy: 0.5056
Epoch 2/100
164/164 - 0s - loss: 0.4432 - accuracy: 0.7459
Epoch 3/100
164/164 - 0s - loss: 0.3937 - accuracy: 0.7589
Epoch 4/100
164/164 - 0s - loss: 0.3834 - accuracy: 0.7952
Epoch 5/100
164/164 - 0s - loss: 0.3746 - accuracy: 0.8056
Epoch 6/100
164/164 - 0s - loss: 0.3646 - accuracy: 0.8198
Epoch 7/100
164/164 - 0s - loss: 0.3587 - accuracy: 0.8209
Epoch 8/100
164/164 - 0s - loss: 0.3604 - accuracy: 0.8159
Epoch 9/100
164/164 - 0s - loss: 0.3489 - accuracy: 0.8291
Epoch 10/100
164/164 - 0s - loss: 0.3424 - accuracy: 0.8331
Epoch 11/100
164/164 - 0s - loss: 0.3388 - accuracy: 0.8348
Epoch 12/100
164/164 - 0s - loss: 0.3419 - accuracy: 0.8337
Epoch 13/100
164/164 - 0s - loss: 0.3393 - accuracy: 0.8306
Epoch 14/100
164/164 - 0s - loss: 0.3332 - accuracy: 0.8442
Epoch 15/100
164/164 - 0s - loss: 0.3306 - accuracy: 0.8411
Epoch 16/100
164/164 - 0s - loss: 0.3333 - accuracy: 0.8365
Epoch 17/100
164/164 - 0s - loss: 0.3270 - accura

<tensorflow.python.keras.callbacks.History at 0x7fc8b1978210>

In [10]:
# scoring the model with more layers

deep_model2_loss, deep_model2_accuracy = deep_model2.evaluate(X_test_scaled, y_test_catagorical, verbose=2)
print(f"Normal Neural Network - Loss: {deep_model2_loss}, Accuracy: {deep_model2_accuracy}")

55/55 - 0s - loss: 0.2643 - accuracy: 0.8959
Normal Neural Network - Loss: 0.2643410265445709, Accuracy: 0.8958809971809387


In [11]:
# Finding most important features

feat_model = RandomForestClassifier()
feat_model.fit(X_train_scaled, y_train_catagorical)
feats = feat_model.feature_importances_
feat_list = sorted(zip(feats,feature_names), reverse=True)
feat_list

[(0.10609241249936005, 'koi_fpflag_nt'),
 (0.10490828383393712, 'koi_fpflag_co'),
 (0.07239708426099763, 'koi_fpflag_ss'),
 (0.053618395316294185, 'koi_prad'),
 (0.052256945280473734, 'koi_model_snr'),
 (0.040349159298171296, 'koi_duration_err2'),
 (0.03522721436727802, 'koi_fpflag_ec'),
 (0.034591183465736794, 'koi_duration_err1'),
 (0.03265592411164437, 'koi_steff_err1'),
 (0.03154042362046396, 'koi_prad_err2'),
 (0.02425896131747935, 'koi_prad_err1'),
 (0.02259951924316135, 'koi_steff_err2'),
 (0.022265663734593964, 'koi_duration'),
 (0.022214998514511888, 'koi_depth'),
 (0.0213132689302937, 'koi_period'),
 (0.020792771869247585, 'koi_insol_err1'),
 (0.020656788233389886, 'koi_time0bk_err1'),
 (0.019599388860085623, 'koi_impact'),
 (0.0192262905932186, 'koi_time0bk_err2'),
 (0.0176505973771433, 'koi_period_err2'),
 (0.017628791439029377, 'koi_period_err1'),
 (0.016183682397578572, 'koi_teq'),
 (0.013929580749234182, 'koi_time0bk'),
 (0.013882238460555052, 'koi_depth_err2'),
 (0.0137

In [12]:
# creating a selected features variable with only the top 15 most important variables!

short_feat_list = feat_list[:15]
short_feat_list = list(zip(*short_feat_list))
short_feat_list = list(short_feat_list[1])
tuned_X = df[short_feat_list]
tuned_X.shape

(6991, 15)

In [13]:
# creating a deep model with the most important variables

# Spliting the tuned data
tuned_X_train, tuned_X_test, y_train, y_test = train_test_split(tuned_X, target, random_state=42)

# Scaling the tuned data
tuned_X_scaler = MinMaxScaler().fit(tuned_X_train)

tuned_X_train_scaled = tuned_X_scaler.transform(tuned_X_train)
tuned_X_test_scaled = tuned_X_scaler.transform(tuned_X_test)

# Retraining the tuned data
tuned_deep_model = Sequential()
tuned_deep_model.add(Dense(units=10, activation='relu', input_dim=15))
tuned_deep_model.add(Dense(units=10, activation='relu'))
tuned_deep_model.add(Dense(units=10, activation='relu'))
tuned_deep_model.add(Dense(units=10, activation='relu'))
tuned_deep_model.add(Dense(units=10, activation='relu'))
tuned_deep_model.add(Dense(units=10, activation='relu'))
tuned_deep_model.add(Dense(units=3, activation='softmax'))

# Compiling the tuned data
tuned_deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
tuned_deep_model.fit(tuned_X_train_scaled, y_train_catagorical, epochs=100, shuffle=True, verbose=2)



Epoch 1/100
164/164 - 0s - loss: 0.7332 - accuracy: 0.5987
Epoch 2/100
164/164 - 0s - loss: 0.4089 - accuracy: 0.7486
Epoch 3/100
164/164 - 0s - loss: 0.3906 - accuracy: 0.7562
Epoch 4/100
164/164 - 0s - loss: 0.3858 - accuracy: 0.7862
Epoch 5/100
164/164 - 0s - loss: 0.3786 - accuracy: 0.7961
Epoch 6/100
164/164 - 0s - loss: 0.3638 - accuracy: 0.8230
Epoch 7/100
164/164 - 0s - loss: 0.3565 - accuracy: 0.8261
Epoch 8/100
164/164 - 0s - loss: 0.3500 - accuracy: 0.8291
Epoch 9/100
164/164 - 0s - loss: 0.3491 - accuracy: 0.8297
Epoch 10/100
164/164 - 0s - loss: 0.3475 - accuracy: 0.8308
Epoch 11/100
164/164 - 0s - loss: 0.3390 - accuracy: 0.8392
Epoch 12/100
164/164 - 0s - loss: 0.3296 - accuracy: 0.8472
Epoch 13/100
164/164 - 0s - loss: 0.3328 - accuracy: 0.8428
Epoch 14/100
164/164 - 0s - loss: 0.3258 - accuracy: 0.8486
Epoch 15/100
164/164 - 0s - loss: 0.3173 - accuracy: 0.8554
Epoch 16/100
164/164 - 0s - loss: 0.3151 - accuracy: 0.8566
Epoch 17/100
164/164 - 0s - loss: 0.3117 - accura

<tensorflow.python.keras.callbacks.History at 0x7fc8b3b4ee90>

In [14]:
# Scoring the tuned data
tuned_deep_model_loss, tuned_deep_model_accuracy = tuned_deep_model.evaluate(tuned_X_test_scaled, y_test_catagorical, verbose=2)
print(f"Normal Neural Network - Loss: {tuned_deep_model_loss}, Accuracy: {tuned_deep_model_accuracy}")

55/55 - 0s - loss: 0.2682 - accuracy: 0.8913
Normal Neural Network - Loss: 0.26819220185279846, Accuracy: 0.8913043737411499


In [15]:
# Compare the diffent models

print(f'deep_model accuracy: {deep_model_accuracy}')
print(f'deep_model2 accuracy: {deep_model2_accuracy}')
print(f'tuned_deep_model accuracy: {tuned_deep_model_accuracy}')

model_list = ['deep_model', 'deep_model2', 'tuned_deep_model']
accuracy_list = [deep_model_accuracy ,deep_model2_accuracy, tuned_deep_model_accuracy]

summary_df = pd.DataFrame({'Model Name': model_list, 'Model Accuracy': accuracy_list})
summary_df

deep_model accuracy: 0.8678489923477173
deep_model2 accuracy: 0.8958809971809387
tuned_deep_model accuracy: 0.8913043737411499


Unnamed: 0,Model Name,Model Accuracy
0,deep_model,0.867849
1,deep_model2,0.895881
2,tuned_deep_model,0.891304


In [16]:
# choosing model with highest accuracy
best_model = summary_df.max()
best_model

Model Name        tuned_deep_model
Model Accuracy            0.895881
dtype: object

In [17]:
# Saving that model

best_model_name = best_model['Model Name']
best_model_name

'tuned_deep_model'

In [18]:
# Saving the model
tuned_deep_model.save('Models/Neural_Newtork_Model.h5')