# f_neuralNetwork_tree_features
----

Written in the Python 3.7.9 Environment with the following package versions

    * joblib 1.0.1
    * numpy 1.19.5
    * pandas 1.3.1
    * scikit-learn 0.24.2
    * tensorflow 2.5.0

By Nicole Lund 

This Jupyter Notebook tunes a neural network model for Exoplanet classification from Kepler Exoplanet study data.

Column descriptions can be found at https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html 

**Source Data**

The source data used was provided by University of Arizona's Data Analytics homework assignment. Their data was derived from https://www.kaggle.com/nasa/kepler-exoplanet-search-results?select=cumulative.csv

The full data set was released by NASA at
https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=koi

In [1]:
# Import Dependencies

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt

# Data manipulation
import numpy as np
import pandas as pd
from statistics import mean
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Parameter Selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Model Development
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Model Metrics
from sklearn.metrics import classification_report

# Save/load files
from tensorflow.keras.models import load_model
import joblib

# # Ignore deprecation warnings
# import warnings
# warnings.simplefilter('ignore', FutureWarning)

In [2]:
# Set the seed value for the notebook, so the results are reproducible
from numpy.random import seed
seed(1)

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Import data
df = pd.read_csv("../b_source_data/exoplanet_data.csv")
# print(df.info())

# Drop columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop rows containing null values
df = df.dropna()

# Display data info
print(df.info())
print(df.head())
print(df.koi_disposition.unique())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6991 entries, 0 to 6990
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   koi_disposition    6991 non-null   object 
 1   koi_fpflag_nt      6991 non-null   int64  
 2   koi_fpflag_ss      6991 non-null   int64  
 3   koi_fpflag_co      6991 non-null   int64  
 4   koi_fpflag_ec      6991 non-null   int64  
 5   koi_period         6991 non-null   float64
 6   koi_period_err1    6991 non-null   float64
 7   koi_period_err2    6991 non-null   float64
 8   koi_time0bk        6991 non-null   float64
 9   koi_time0bk_err1   6991 non-null   float64
 10  koi_time0bk_err2   6991 non-null   float64
 11  koi_impact         6991 non-null   float64
 12  koi_impact_err1    6991 non-null   float64
 13  koi_impact_err2    6991 non-null   float64
 14  koi_duration       6991 non-null   float64
 15  koi_duration_err1  6991 non-null   float64
 16  koi_duration_err2  6991 

In [4]:
# Rename "FALSE POSITIVE" disposition values
df.koi_disposition = df.koi_disposition.str.replace(' ','_')
print(df.koi_disposition.unique())

['CONFIRMED' 'FALSE_POSITIVE' 'CANDIDATE']


# Select features


In [5]:
# Split dataframe into X and y
tree_features = ['koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ss', 'koi_model_snr']
forest_features = ['koi_fpflag_co', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_model_snr', 'koi_prad']
X = df[set(tree_features + forest_features)]
y = df["koi_disposition"]
print(X.shape, y.shape)

(6991, 5) (6991,)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
# Split X and y into training and testing groups
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [7]:
# Display training data
X_train.head()

Unnamed: 0,koi_fpflag_co,koi_model_snr,koi_fpflag_ss,koi_prad,koi_fpflag_nt
4954,0,7.8,0,2.29,0
4235,1,12.7,0,1.82,0
848,0,17.9,0,2.31,0
2874,1,43.3,1,23.81,0
3016,0,17.4,0,1.4,0


# Pre-processing

In [8]:
# Scale the data with MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# One-Hot-Encode the y data

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Step 2: Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [10]:
print('Unique KOI Disposition Values')
print(y.unique())
print('-----------')
print('Sample KOI Disposition Values and Encoding')
print(y_test[:5])
print(y_test_categorical[:5])

Unique KOI Disposition Values
['CONFIRMED' 'FALSE_POSITIVE' 'CANDIDATE']
-----------
Sample KOI Disposition Values and Encoding
4982    FALSE_POSITIVE
4866         CANDIDATE
2934    FALSE_POSITIVE
5007    FALSE_POSITIVE
3869    FALSE_POSITIVE
Name: koi_disposition, dtype: object
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [11]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Function to create model, required for KerasClassifier
def create_model(neurons=5):
	# create model
	model = Sequential()
	model.add(Dense(neurons, input_dim=X_train_scaled.shape[1], activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(units=y_train_categorical.shape[1], activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [24]:
# Code was modified from sample code presented on
# https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

# Use scikit-learn to grid search the batch size and epochs

# create model
grid_model = KerasClassifier(build_fn=create_model, verbose=0)

# define the grid search parameters
batch_size = [10, 20]
epochs = [100, 1000]
neurons = [3, 4 , 5]
param_grid = dict(batch_size=batch_size, epochs=epochs, neurons=neurons)

# Apply GridSearchCV
grid = GridSearchCV(estimator=grid_model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train_scaled, y_train_categorical)

# summarize results
print("--------------------------")
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

--------------------------
Best: 0.894543 using {'batch_size': 20, 'epochs': 1000, 'neurons': 20}
0.801553 (0.018024) with: {'batch_size': 10, 'epochs': 100, 'neurons': 5}
0.821582 (0.007559) with: {'batch_size': 10, 'epochs': 100, 'neurons': 10}
0.816881 (0.004758) with: {'batch_size': 10, 'epochs': 100, 'neurons': 15}
0.827713 (0.002294) with: {'batch_size': 10, 'epochs': 100, 'neurons': 20}
0.822399 (0.005039) with: {'batch_size': 10, 'epochs': 1000, 'neurons': 5}
0.847537 (0.024228) with: {'batch_size': 10, 'epochs': 1000, 'neurons': 10}
0.849785 (0.031933) with: {'batch_size': 10, 'epochs': 1000, 'neurons': 15}
0.891273 (0.006797) with: {'batch_size': 10, 'epochs': 1000, 'neurons': 20}
0.805641 (0.011514) with: {'batch_size': 20, 'epochs': 100, 'neurons': 5}
0.819743 (0.003045) with: {'batch_size': 20, 'epochs': 100, 'neurons': 10}
0.819743 (0.007947) with: {'batch_size': 20, 'epochs': 100, 'neurons': 15}
0.822195 (0.005902) with: {'batch_size': 20, 'epochs': 100, 'neurons': 20}
0

# Create and Train the Model - Neural Network

In [27]:
# Create model
nn_model = Sequential()

# Define first layer
nn_model.add(Dense(units=5,
                activation='relu', input_dim=X_train_scaled.shape[1]))

# Define output layer
nn_model.add(Dense(units=y_train_categorical.shape[1], activation='softmax'))    

# Review Model
print(nn_model.summary())

# Compile Model
nn_model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train model
nn_model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    batch_size=10,
    shuffle=True,
    verbose=0
)

# Evaluate the model using the testing data
model_loss, model_accuracy = nn_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 20)                420       
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 63        
Total params: 483
Trainable params: 483
Non-trainable params: 0
_________________________________________________________________
None
66/66 - 0s - loss: 0.2576 - accuracy: 0.8942
Loss: 0.25760945677757263, Accuracy: 0.894184947013855


# Option 3: Model Results when using selected features from Decision Tree and Random Forest Classifiers
* Grid Definition: 
    * batch_size = [10, 20]
    * epochs = [100, 1000]
    * neurons = [3, 4 , 5]
* Grid Best Result: 
* Tuned Model Results: 

# Save the Model

In [29]:
# Save the model results
nn_model.save("./f_neuralNetwork_tree_Features_model.h5")

# Model Discussion

The model score using the neural network method is one of the best for predicting exoplanet observations. However, the hyperparameter tuning is very slow.