In [1]:
# Data manipulation and visualization packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-processing and setup functions
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.utils import to_categorical

# Algorithms
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

# Report and model validation
from sklearn.metrics import classification_report

# Model persistence
from joblib import dump, load

# Allow up to 500 columns to be shown when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Read the CSV stored at Data/cumulative.csv into a DataFrame.
exo = pd.read_csv('Data/cumulative.csv')

# First rows of the frame.
exo.head()

# 15 randomly sampled rows (no random_state is passed, so the rows differ between runs).
exo.sample(15)

# Column names, non-null counts and dtypes.
exo.info()

## Load Data

In [2]:
# Read Data/exoplanet_data.csv; `exo2` is the frame used by every modelling section below.
exo2 = pd.read_csv('Data/exoplanet_data.csv')

In [3]:
# Show 20 randomly sampled rows (no random_state is passed, so the rows differ between runs).
exo2.sample(20)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,koi_impact,koi_impact_err1,koi_impact_err2,koi_duration,koi_duration_err1,koi_duration_err2,koi_depth,koi_depth_err1,koi_depth_err2,koi_prad,koi_prad_err1,koi_prad_err2,koi_teq,koi_insol,koi_insol_err1,koi_insol_err2,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_steff_err1,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
5881,FALSE POSITIVE,1,0,0,0,2.305097,1.07e-06,-1.07e-06,133.705069,0.000369,-0.000369,1.286,6.59,-0.14,1.9378,0.0209,-0.0209,5880.0,50.3,-50.3,89.22,25.07,-22.57,2753,13581.41,11287.73,-7347.81,133.4,1,8851,246,-422,4.085,0.155,-0.155,2.178,0.612,-0.551,297.39349,48.410751,13.866
6614,FALSE POSITIVE,1,0,0,0,362.85801,0.03497,-0.03497,152.6104,0.0928,-0.0928,0.626,0.344,-0.409,17.03,2.8,-2.8,151.6,19.4,-19.4,1.9,0.55,-0.45,353,3.66,3.03,-1.82,10.1,1,6740,163,-245,4.165,0.204,-0.167,1.471,0.425,-0.348,291.34402,49.577091,13.503
779,CONFIRMED,0,0,0,0,15.866248,6.81e-05,-6.81e-05,141.08734,0.00323,-0.00323,0.858,0.096,-0.594,4.25,0.0941,-0.0941,77.3,2.9,-2.9,1.31,0.06,-0.05,879,141.18,19.99,-18.5,28.4,1,6099,79,-85,4.193,0.016,-0.018,1.308,0.054,-0.054,285.47659,41.632717,11.049
6344,FALSE POSITIVE,0,1,0,0,18.527824,5.6e-07,-5.6e-07,139.257607,2.5e-05,-2.5e-05,0.171,0.036,-0.021,5.88678,0.00152,-0.00152,76231.0,16.2,-16.2,64.66,10.67,-23.09,1080,322.19,163.93,-202.77,5547.2,1,6154,165,-165,3.793,0.315,-0.105,2.321,0.383,-0.829,287.67276,42.059158,12.893
4812,CANDIDATE,0,0,0,0,54.595746,0.0005848,-0.0005848,156.83764,0.00878,-0.00878,0.332,0.1177,-0.3319,2.21,0.232,-0.232,135.8,19.8,-19.8,2.0,0.9,-0.71,649,41.81,56.23,-26.58,8.7,1,6014,177,-181,3.989,0.341,-0.219,1.699,0.756,-0.609,295.8591,42.642189,13.67
5157,CONFIRMED,0,0,0,0,21.090791,0.0001467,-0.0001467,133.96316,0.00581,-0.00581,0.832,0.022,-0.574,4.751,0.159,-0.159,417.6,24.1,-24.1,1.88,0.48,-0.16,606,31.86,23.55,-8.17,19.7,1,5723,154,-154,4.546,0.055,-0.165,0.818,0.205,-0.073,285.47504,48.034931,14.223
5228,FALSE POSITIVE,0,1,0,0,3.319523,1.28e-06,-1.28e-06,131.923046,0.0003,-0.0003,1.162,0.442,-0.159,3.3465,0.0209,-0.0209,38832.0,181.0,-181.0,38.37,9.36,-3.11,1108,356.42,256.95,-86.46,249.4,1,5761,155,-155,4.581,0.04,-0.16,0.791,0.193,-0.064,297.35962,46.343552,15.101
1264,CONFIRMED,0,0,0,0,5.03728,1.57e-05,-1.57e-05,134.1094,0.00258,-0.00258,0.96,0.001,-0.665,2.0619,0.0628,-0.0628,76.3,4.3,-4.3,1.55,0.28,-0.17,1321,720.22,362.66,-201.6,20.9,1,6387,115,-140,4.271,0.066,-0.123,1.381,0.253,-0.156,294.46854,49.33102,12.189
1809,CONFIRMED,0,0,0,0,3.300408,2.21e-05,-2.21e-05,133.56723,0.00602,-0.00602,0.008,0.446,-0.008,4.462,0.164,-0.164,98.8,5.9,-5.9,1.43,0.26,-0.19,1543,1341.9,621.65,-382.51,19.3,1,6333,75,-82,4.229,0.099,-0.121,1.438,0.261,-0.19,296.29355,43.403271,13.749
5417,CANDIDATE,0,0,0,0,5.704593,4.44e-05,-4.44e-05,134.40102,0.00666,-0.00666,0.285,0.182,-0.285,1.149,0.231,-0.231,129.9,22.7,-22.7,0.95,0.2,-0.08,900,154.92,98.02,-39.91,8.5,1,5523,149,-166,4.566,0.034,-0.145,0.832,0.176,-0.07,287.04843,49.02861,14.572


In [4]:
# Column names, non-null counts and dtypes of the modelling frame.
exo2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6991 entries, 0 to 6990
Data columns (total 41 columns):
koi_disposition      6991 non-null object
koi_fpflag_nt        6991 non-null int64
koi_fpflag_ss        6991 non-null int64
koi_fpflag_co        6991 non-null int64
koi_fpflag_ec        6991 non-null int64
koi_period           6991 non-null float64
koi_period_err1      6991 non-null float64
koi_period_err2      6991 non-null float64
koi_time0bk          6991 non-null float64
koi_time0bk_err1     6991 non-null float64
koi_time0bk_err2     6991 non-null float64
koi_impact           6991 non-null float64
koi_impact_err1      6991 non-null float64
koi_impact_err2      6991 non-null float64
koi_duration         6991 non-null float64
koi_duration_err1    6991 non-null float64
koi_duration_err2    6991 non-null float64
koi_depth            6991 non-null float64
koi_depth_err1       6991 non-null float64
koi_depth_err2       6991 non-null float64
koi_prad             6991 non-null float64

In [None]:
#exo2['koi_disposition']

## Separate X and y

In [None]:
# Features: every column except the target label.
exo2_features = exo2.drop(columns='koi_disposition')
# Target: the disposition label of each row.
y = exo2['koi_disposition']

In [None]:
#exo2_features.sample(15)

## Pre-processing (Scaling)

In [None]:
# Fit the scaler on the training rows only, so that the min/max of the
# held-out rows do not leak into pre-processing.
# NOTE: the row selection below must mirror the train_test_split call in the
# "Train, Test Split" cell (same random_state and stratify); with identical
# arguments train_test_split selects the same rows, so `train_rows` are exactly
# the rows that end up in X_train.
train_rows, _ = train_test_split(
    np.arange(len(exo2_features)),
    random_state=42,
    stratify=y,
)

scaler = MinMaxScaler()
scaler.fit(exo2_features.iloc[train_rows])

# Transform every row with the training-set statistics. Test rows may fall
# slightly outside [0, 1]; that is expected and harmless.
scaled_features = scaler.transform(exo2_features)

## Train, Test Split

In [None]:
# Stratified split (default 75/25) of the scaled features and the labels.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features,
    y,
    stratify=y,
    random_state=42,
)

## Parameters

In [None]:
# SVM
# `gamma` is only used by the 'rbf', 'poly' and 'sigmoid' kernels. With a
# linear kernel it is ignored, so searching over it (the original list also
# repeated 0.001) only refits identical models. Search over C alone.
svm = SVC(kernel='linear')
svm_grid = {'C': [1, 5, 10]}

# Random Forest
# A fixed random_state makes the search reproducible between runs.
forest = RandomForestClassifier(random_state=42)
forest_grid = {
    'n_estimators': [25, 50, 100, 200, 400, 800],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

## SVM

In [None]:
# Exhaustive search over the SVM grid using the default cross-validation setup.
grid = GridSearchCV(estimator=svm, param_grid=svm_grid, verbose=3)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
print(grid.best_params_)

In [None]:
# Mean cross-validated score of the best parameter combination.
print(grid.best_score_)

In [None]:
# Score the refitted best SVM on the held-out split.
predictions = grid.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

## Random Forest

In [None]:
# 10-fold, accuracy-scored search over the Random Forest grid on all cores.
grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_grid,
    cv=10,
    scoring='accuracy',
    verbose=3,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated accuracy.
print(grid.best_params_)

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
print(grid.best_score_)

In [None]:
# Score the refitted best Random Forest on the held-out split.
predictions = grid.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

## GBM

In [None]:
# Base estimator for the gradient-boosting search.
gbm_search = GradientBoostingClassifier()

# 3 learning rates x 5 ensemble sizes x 5 tree depths = 75 candidates.
gbm_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [25, 50, 100, 200, 400],
    'max_depth': [1, 2, 3, 4, 5],
}

# 5-fold, accuracy-scored search on all cores; fitting happens in the next cell.
grid = GridSearchCV(
    estimator=gbm_search,
    param_grid=gbm_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the GBM grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated accuracy.
print(grid.best_params_)

In [None]:
# Mean cross-validated accuracy of the best parameter combination.
print(grid.best_score_)

In [None]:
# Score the refitted best GBM on the held-out split.
predictions = grid.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))

### GBM with Selected Parameters

In [None]:
# Train a single GBM with fixed hyperparameters; verbose=3 prints training progress.
gbm_final = GradientBoostingClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.1,
    verbose=3,
)

gbm_final.fit(X_train, y_train)


In [None]:
# Evaluate the model trained in the previous cell. The original code called
# grid.predict here, which scored the grid-search estimator rather than
# gbm_final.
predictions = gbm_final.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
# Display the predicted labels produced by the most recently run prediction cell.
predictions

In [None]:
# Importance of each input column according to the fitted GBM.
feature_importances = pd.Series(gbm_final.feature_importances_, index = exo2_features.columns)
# Sort first, then keep the top 15. The original sliced the first 15 columns
# and sorted only those, which hides any important feature after position 15.
feature_importances.sort_values(ascending=False).head(15)

In [None]:
#dump(gbm_final, 'gbm_model_exoplanet.joblib')

## Deep Neural Network

In [5]:
# Start again from the raw frame: this section builds its own split and scaler.
y = exo2['koi_disposition']
X = exo2.drop(['koi_disposition'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

# The scaler is fitted on the training split only and then applied to both splits.
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
# Pin the one-hot width to the number of classes known to the encoder.
# Otherwise to_categorical infers the width from the largest label present
# in each array, and train/test could end up with different shapes.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

  return self.partial_fit(X, y)


In [6]:
# Sanity check: (rows, features) of the inputs and (rows, classes) of the one-hot targets.
X_train_scaled.shape, y_train_categorical.shape

((5243, 40), (5243, 3))

In [7]:
# Two hidden ReLU layers of 10 units followed by a 3-way softmax output.
# The input width is taken from the training matrix instead of the hard-coded
# 40, so the model still builds if the feature set changes.
model = Sequential()
model.add(Dense(units=10, activation='relu', input_dim=X_train_scaled.shape[1]))
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [8]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                410       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 33        
Total params: 553
Trainable params: 553
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Multi-class setup: categorical cross-entropy on one-hot targets, Adam optimizer.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 100 epochs, reshuffling each epoch; verbose=2 logs one line per epoch.
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 0s - loss: 0.9822 - acc: 0.5010
Epoch 2/100
 - 0s - loss: 0.7418 - acc: 0.5211
Epoch 3/100
 - 0s - loss: 0.5824 - acc: 0.7366
Epoch 4/100
 - 0s - loss: 0.5251 - acc: 0.7917
Epoch 5/100
 - 0s - loss: 0.4939 - acc: 0.7904
Epoch 6/100
 - 0s - loss: 0.4731 - acc: 0.7932
Epoch 7/100
 - 0s - loss: 0.4572 - acc: 0.7967
Epoch 8/100
 - 0s - loss: 0.4446 - acc: 0.8104
Epoch 9/100
 - 0s - loss: 0.4331 - acc: 0.8144
Epoch 10/100
 - 0s - loss: 0.4238 - acc: 0.8205
Epoch 11/100
 - 0s - loss: 0.4149 - acc: 0.8259
Epoch 12/100
 - 0s - loss: 0.4057 - acc: 0.8293
Epoch 13/100
 - 0s - loss: 0.3982 - acc: 0.8282
Epoch 14/100
 - 0s - loss: 0.3907 - acc: 0.8339
Epoch 15/100
 - 0s - loss: 0.3829 - acc: 0.8379
Epoch 16/100
 - 0s - loss: 0.3778 - acc: 0.8282
Epoch 17/100
 - 0s - loss: 0.3722 - acc: 0.8354
Epoch 18/100
 - 0s - loss: 0.3666 - acc: 0.8365
Epoch 19/100
 - 0s - loss: 0.3618 - acc: 0.8348
Epoch 20/100
 - 0s - loss: 0.3596 - acc: 0.8323
E

<tensorflow.python.keras.callbacks.History at 0x17e60f719b0>

In [10]:
# Loss and accuracy on the held-out split.
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.2978 - acc: 0.8839
Normal Neural Network - Loss: 0.2978257047911803, Accuracy: 0.8838672637939453


In [11]:
# Sequential.predict_classes is deprecated and removed in newer TensorFlow
# releases. Taking the argmax of the softmax output gives the same class
# indices and works on both old and new versions.
class_probabilities = model.predict(X_test_scaled[:10])
encoded_predictions = np.argmax(class_probabilities, axis=-1)
# Map the integer class indices back to the original string labels.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

print(f'Predicted classes: {prediction_labels}')
print(f'Actual labels: {list(y_test[:10])}')

Predicted classes: ['CANDIDATE' 'FALSE POSITIVE' 'FALSE POSITIVE' 'CANDIDATE'
 'FALSE POSITIVE' 'CONFIRMED' 'FALSE POSITIVE' 'FALSE POSITIVE'
 'CONFIRMED' 'FALSE POSITIVE']
Actual labels: ['CANDIDATE', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CANDIDATE', 'FALSE POSITIVE', 'CONFIRMED', 'FALSE POSITIVE', 'FALSE POSITIVE', 'CONFIRMED', 'CONFIRMED']
