In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head(10)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
5,CONFIRMED,0,0,0,0,2.566589,1.78e-05,-1.78e-05,179.55437,0.00461,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714
6,CONFIRMED,0,0,0,0,16.068647,1.09e-05,-1.09e-05,173.621937,0.000517,...,-83,4.485,0.083,-0.028,0.848,0.033,-0.072,286.99948,48.37579,15.841
7,CONFIRMED,0,0,0,0,2.470613,2.7e-08,-2.7e-08,122.763305,9e-06,...,-78,4.457,0.024,-0.024,0.964,0.038,-0.038,286.80847,49.316399,11.338
8,CONFIRMED,0,1,0,0,2.204735,4.3e-08,-4.3e-08,121.358542,1.6e-05,...,-89,4.019,0.033,-0.027,1.952,0.099,-0.11,292.24728,47.969521,10.463
9,CONFIRMED,0,0,0,0,3.522498,1.98e-07,-1.98e-07,121.119423,4.7e-05,...,-137,4.169,0.055,-0.045,1.451,0.11,-0.11,281.28812,42.45108,13.563


# Select your features (columns)

In [3]:
X = df.drop(["koi_disposition"], axis=1)
y = df["koi_disposition"]
print(X.shape, y.shape)

(6991, 40) (6991,)


# Create a Train Test Split

Use `koi_disposition` for the y values

In [4]:
# Return to this step to alter features as needed
#X = X.drop("koi_fpflag_nt", axis=1)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [6]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
1238,0,0,0,0,37.815416,0.000267,-0.000267,141.10346,0.00568,-0.00568,...,-106,4.624,0.012,-0.048,0.705,0.049,-0.025,297.29242,49.981628,12.879
3671,0,0,0,0,65.427645,0.001087,-0.001087,176.2733,0.014,-0.014,...,-198,4.504,0.052,-0.221,0.926,0.292,-0.097,296.0069,50.281811,14.821
1258,0,0,0,0,9.613753,1.1e-05,-1.1e-05,132.185679,0.000971,-0.000971,...,-199,4.364,0.108,-0.201,1.11,0.33,-0.178,290.35815,44.518749,14.283
5028,0,1,1,0,5.758437,2.1e-05,-2.1e-05,135.11279,0.00299,-0.00299,...,-120,2.194,0.033,-0.03,25.287,0.546,-10.374,291.98096,38.97118,13.819
4081,0,1,0,0,1.120646,1e-06,-1e-06,131.59197,0.001,-0.001,...,-163,3.449,0.483,-0.241,4.034,1.327,-1.99,297.61398,47.811401,13.695


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [7]:
# Scale your data
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
label_encoder.fit(y_test)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [9]:
for label, original_class in zip(encoded_y_train, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CO

Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Enc

------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded L

Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Ori

------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALS

------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
---

------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Origi

Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
-----------

Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIV

Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Ori

In [10]:
for label, original_class in zip(encoded_y_test, y):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CO

Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Clas

------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded La

------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Origi

------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded L

------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CONFIRMED

------------
Original Class: CANDIDATE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 2
------------
Original Class:

In [11]:
from tensorflow.keras.utils import to_categorical
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [12]:
from tensorflow.keras.models import Sequential

model = Sequential()

In [13]:
from tensorflow.keras.layers import Dense
number_inputs = 40
number_hidden_nodes = 10
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=10, activation='relu'))

In [14]:
number_classes = 3
model.add(Dense(units=number_classes, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                410       
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 33        
Total params: 663
Trainable params: 663
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

Epoch 1/1000
164/164 - 0s - loss: 0.9247 - accuracy: 0.4970
Epoch 2/1000
164/164 - 0s - loss: 0.4834 - accuracy: 0.7706
Epoch 3/1000
164/164 - 0s - loss: 0.3844 - accuracy: 0.8083
Epoch 4/1000
164/164 - 0s - loss: 0.3645 - accuracy: 0.8211
Epoch 5/1000
164/164 - 0s - loss: 0.3552 - accuracy: 0.8280
Epoch 6/1000
164/164 - 0s - loss: 0.3451 - accuracy: 0.8282
Epoch 7/1000
164/164 - 0s - loss: 0.3426 - accuracy: 0.8316
Epoch 8/1000
164/164 - 0s - loss: 0.3387 - accuracy: 0.8297
Epoch 9/1000
164/164 - 0s - loss: 0.3351 - accuracy: 0.8341
Epoch 10/1000
164/164 - 0s - loss: 0.3314 - accuracy: 0.8346
Epoch 11/1000
164/164 - 0s - loss: 0.3308 - accuracy: 0.8377
Epoch 12/1000
164/164 - 0s - loss: 0.3286 - accuracy: 0.8434
Epoch 13/1000
164/164 - 0s - loss: 0.3246 - accuracy: 0.8413
Epoch 14/1000
164/164 - 0s - loss: 0.3227 - accuracy: 0.8447
Epoch 15/1000
164/164 - 0s - loss: 0.3209 - accuracy: 0.8488
Epoch 16/1000
164/164 - 0s - loss: 0.3188 - accuracy: 0.8476
Epoch 17/1000
164/164 - 0s - loss

Epoch 135/1000
164/164 - 0s - loss: 0.2533 - accuracy: 0.8934
Epoch 136/1000
164/164 - 0s - loss: 0.2466 - accuracy: 0.8995
Epoch 137/1000
164/164 - 0s - loss: 0.2542 - accuracy: 0.8917
Epoch 138/1000
164/164 - 0s - loss: 0.2512 - accuracy: 0.8930
Epoch 139/1000
164/164 - 0s - loss: 0.2501 - accuracy: 0.8976
Epoch 140/1000
164/164 - 0s - loss: 0.2475 - accuracy: 0.8961
Epoch 141/1000
164/164 - 0s - loss: 0.2436 - accuracy: 0.8985
Epoch 142/1000
164/164 - 0s - loss: 0.2437 - accuracy: 0.8993
Epoch 143/1000
164/164 - 0s - loss: 0.2468 - accuracy: 0.8978
Epoch 144/1000
164/164 - 0s - loss: 0.2449 - accuracy: 0.8987
Epoch 145/1000
164/164 - 0s - loss: 0.2463 - accuracy: 0.8938
Epoch 146/1000
164/164 - 0s - loss: 0.2440 - accuracy: 0.8987
Epoch 147/1000
164/164 - 0s - loss: 0.2502 - accuracy: 0.8947
Epoch 148/1000
164/164 - 0s - loss: 0.2518 - accuracy: 0.8947
Epoch 149/1000
164/164 - 0s - loss: 0.2522 - accuracy: 0.8915
Epoch 150/1000
164/164 - 0s - loss: 0.2533 - accuracy: 0.8928
Epoch 15

164/164 - 0s - loss: 0.2349 - accuracy: 0.9001
Epoch 268/1000
164/164 - 0s - loss: 0.2357 - accuracy: 0.9016
Epoch 269/1000
164/164 - 0s - loss: 0.2398 - accuracy: 0.8991
Epoch 270/1000
164/164 - 0s - loss: 0.2355 - accuracy: 0.9004
Epoch 271/1000
164/164 - 0s - loss: 0.2332 - accuracy: 0.9006
Epoch 272/1000
164/164 - 0s - loss: 0.2347 - accuracy: 0.9033
Epoch 273/1000
164/164 - 0s - loss: 0.2345 - accuracy: 0.9022
Epoch 274/1000
164/164 - 0s - loss: 0.2336 - accuracy: 0.9012
Epoch 275/1000
164/164 - 0s - loss: 0.2308 - accuracy: 0.9039
Epoch 276/1000
164/164 - 0s - loss: 0.2328 - accuracy: 0.9001
Epoch 277/1000
164/164 - 0s - loss: 0.2383 - accuracy: 0.8991
Epoch 278/1000
164/164 - 0s - loss: 0.2427 - accuracy: 0.8951
Epoch 279/1000
164/164 - 0s - loss: 0.2333 - accuracy: 0.9046
Epoch 280/1000
164/164 - 0s - loss: 0.2310 - accuracy: 0.9029
Epoch 281/1000
164/164 - 0s - loss: 0.2359 - accuracy: 0.9001
Epoch 282/1000
164/164 - 0s - loss: 0.2375 - accuracy: 0.8978
Epoch 283/1000
164/164 

Epoch 400/1000
164/164 - 0s - loss: 0.2250 - accuracy: 0.9079
Epoch 401/1000
164/164 - 0s - loss: 0.2350 - accuracy: 0.9029
Epoch 402/1000
164/164 - 0s - loss: 0.2280 - accuracy: 0.9084
Epoch 403/1000
164/164 - 0s - loss: 0.2309 - accuracy: 0.9050
Epoch 404/1000
164/164 - 0s - loss: 0.2265 - accuracy: 0.9046
Epoch 405/1000
164/164 - 0s - loss: 0.2244 - accuracy: 0.9086
Epoch 406/1000
164/164 - 0s - loss: 0.2267 - accuracy: 0.9037
Epoch 407/1000
164/164 - 0s - loss: 0.2271 - accuracy: 0.9062
Epoch 408/1000
164/164 - 0s - loss: 0.2322 - accuracy: 0.9016
Epoch 409/1000
164/164 - 0s - loss: 0.2292 - accuracy: 0.9046
Epoch 410/1000
164/164 - 0s - loss: 0.2271 - accuracy: 0.9064
Epoch 411/1000
164/164 - 0s - loss: 0.2240 - accuracy: 0.9083
Epoch 412/1000
164/164 - 0s - loss: 0.2252 - accuracy: 0.9064
Epoch 413/1000
164/164 - 0s - loss: 0.2310 - accuracy: 0.9031
Epoch 414/1000
164/164 - 0s - loss: 0.2288 - accuracy: 0.9048
Epoch 415/1000
164/164 - 0s - loss: 0.2233 - accuracy: 0.9079
Epoch 41

164/164 - 0s - loss: 0.2212 - accuracy: 0.9083
Epoch 533/1000
164/164 - 0s - loss: 0.2243 - accuracy: 0.9083
Epoch 534/1000
164/164 - 0s - loss: 0.2217 - accuracy: 0.9065
Epoch 535/1000
164/164 - 0s - loss: 0.2239 - accuracy: 0.9086
Epoch 536/1000
164/164 - 0s - loss: 0.2258 - accuracy: 0.9025
Epoch 537/1000
164/164 - 0s - loss: 0.2208 - accuracy: 0.9067
Epoch 538/1000
164/164 - 0s - loss: 0.2228 - accuracy: 0.9094
Epoch 539/1000
164/164 - 0s - loss: 0.2212 - accuracy: 0.9088
Epoch 540/1000
164/164 - 0s - loss: 0.2218 - accuracy: 0.9090
Epoch 541/1000
164/164 - 0s - loss: 0.2210 - accuracy: 0.9090
Epoch 542/1000
164/164 - 0s - loss: 0.2198 - accuracy: 0.9102
Epoch 543/1000
164/164 - 0s - loss: 0.2255 - accuracy: 0.9039
Epoch 544/1000
164/164 - 0s - loss: 0.2221 - accuracy: 0.9077
Epoch 545/1000
164/164 - 0s - loss: 0.2232 - accuracy: 0.9058
Epoch 546/1000
164/164 - 0s - loss: 0.2191 - accuracy: 0.9090
Epoch 547/1000
164/164 - 0s - loss: 0.2214 - accuracy: 0.9046
Epoch 548/1000
164/164 

Epoch 665/1000
164/164 - 0s - loss: 0.2219 - accuracy: 0.9077
Epoch 666/1000
164/164 - 0s - loss: 0.2192 - accuracy: 0.9075
Epoch 667/1000
164/164 - 0s - loss: 0.2153 - accuracy: 0.9117
Epoch 668/1000
164/164 - 0s - loss: 0.2154 - accuracy: 0.9109
Epoch 669/1000
164/164 - 0s - loss: 0.2135 - accuracy: 0.9104
Epoch 670/1000
164/164 - 0s - loss: 0.2174 - accuracy: 0.9094
Epoch 671/1000
164/164 - 0s - loss: 0.2178 - accuracy: 0.9084
Epoch 672/1000
164/164 - 0s - loss: 0.2171 - accuracy: 0.9090
Epoch 673/1000
164/164 - 0s - loss: 0.2184 - accuracy: 0.9105
Epoch 674/1000
164/164 - 0s - loss: 0.2181 - accuracy: 0.9069
Epoch 675/1000
164/164 - 0s - loss: 0.2173 - accuracy: 0.9084
Epoch 676/1000
164/164 - 0s - loss: 0.2148 - accuracy: 0.9092
Epoch 677/1000
164/164 - 0s - loss: 0.2163 - accuracy: 0.9130
Epoch 678/1000
164/164 - 0s - loss: 0.2146 - accuracy: 0.9105
Epoch 679/1000
164/164 - 0s - loss: 0.2198 - accuracy: 0.9092
Epoch 680/1000
164/164 - 0s - loss: 0.2184 - accuracy: 0.9084
Epoch 68

164/164 - 0s - loss: 0.2127 - accuracy: 0.9109
Epoch 798/1000
164/164 - 0s - loss: 0.2128 - accuracy: 0.9142
Epoch 799/1000
164/164 - 0s - loss: 0.2134 - accuracy: 0.9144
Epoch 800/1000
164/164 - 0s - loss: 0.2174 - accuracy: 0.9096
Epoch 801/1000
164/164 - 0s - loss: 0.2133 - accuracy: 0.9105
Epoch 802/1000
164/164 - 0s - loss: 0.2150 - accuracy: 0.9125
Epoch 803/1000
164/164 - 0s - loss: 0.2170 - accuracy: 0.9058
Epoch 804/1000
164/164 - 0s - loss: 0.2116 - accuracy: 0.9107
Epoch 805/1000
164/164 - 0s - loss: 0.2125 - accuracy: 0.9119
Epoch 806/1000
164/164 - 0s - loss: 0.2132 - accuracy: 0.9102
Epoch 807/1000
164/164 - 0s - loss: 0.2121 - accuracy: 0.9136
Epoch 808/1000
164/164 - 0s - loss: 0.2159 - accuracy: 0.9140
Epoch 809/1000
164/164 - 0s - loss: 0.2118 - accuracy: 0.9126
Epoch 810/1000
164/164 - 0s - loss: 0.2156 - accuracy: 0.9094
Epoch 811/1000
164/164 - 0s - loss: 0.2119 - accuracy: 0.9123
Epoch 812/1000
164/164 - 0s - loss: 0.2137 - accuracy: 0.9102
Epoch 813/1000
164/164 

Epoch 930/1000
164/164 - 0s - loss: 0.2097 - accuracy: 0.9140
Epoch 931/1000
164/164 - 0s - loss: 0.2077 - accuracy: 0.9111
Epoch 932/1000
164/164 - 0s - loss: 0.2072 - accuracy: 0.9142
Epoch 933/1000
164/164 - 0s - loss: 0.2098 - accuracy: 0.9111
Epoch 934/1000
164/164 - 0s - loss: 0.2062 - accuracy: 0.9151
Epoch 935/1000
164/164 - 0s - loss: 0.2083 - accuracy: 0.9149
Epoch 936/1000
164/164 - 0s - loss: 0.2099 - accuracy: 0.9100
Epoch 937/1000
164/164 - 0s - loss: 0.2087 - accuracy: 0.9147
Epoch 938/1000
164/164 - 0s - loss: 0.2071 - accuracy: 0.9155
Epoch 939/1000
164/164 - 0s - loss: 0.2094 - accuracy: 0.9136
Epoch 940/1000
164/164 - 0s - loss: 0.2095 - accuracy: 0.9136
Epoch 941/1000
164/164 - 0s - loss: 0.2095 - accuracy: 0.9126
Epoch 942/1000
164/164 - 0s - loss: 0.2085 - accuracy: 0.9105
Epoch 943/1000
164/164 - 0s - loss: 0.2080 - accuracy: 0.9130
Epoch 944/1000
164/164 - 0s - loss: 0.2080 - accuracy: 0.9117
Epoch 945/1000
164/164 - 0s - loss: 0.2072 - accuracy: 0.9155
Epoch 94

<tensorflow.python.keras.callbacks.History at 0x20a82d3f160>

In [18]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

55/55 - 0s - loss: 0.3144 - accuracy: 0.8850
Loss: 0.3144262731075287, Accuracy: 0.8850114345550537


In [19]:
predictions = model.predict(X_test_scaled)
print(f"First 10 Predictions:   {predictions[:20]}")
print(f"First 10 Actual labels: {y_test_categorical[:20].tolist()}")

First 10 Predictions:   [[2.12756931e-05 8.73015021e-18 9.99978781e-01]
 [8.46659916e-08 3.91448839e-06 9.99995947e-01]
 [1.76285741e-16 2.94115621e-10 1.00000000e+00]
 [7.95375407e-01 2.04617277e-01 7.31287264e-06]
 [5.43892839e-12 9.37236848e-08 9.99999881e-01]
 [4.37901258e-01 5.62098742e-01 3.32631345e-08]
 [7.93597543e-09 1.85037851e-02 9.81496274e-01]
 [5.07223187e-03 2.57557560e-12 9.94927764e-01]
 [2.88128319e-07 3.44100408e-03 9.96558726e-01]
 [2.65066717e-07 1.36124409e-05 9.99986172e-01]
 [1.95538560e-05 3.19232853e-08 9.99980450e-01]
 [3.71333539e-01 6.28517568e-01 1.48845371e-04]
 [9.97815728e-01 2.18429556e-03 6.90586421e-10]
 [2.02298069e-20 2.50431774e-19 1.00000000e+00]
 [1.37116871e-07 6.83554935e-10 9.99999881e-01]
 [3.59968059e-02 9.63891268e-01 1.11920126e-04]
 [1.54628713e-21 3.66928210e-10 1.00000000e+00]
 [1.62657827e-07 1.68404751e-03 9.98315811e-01]
 [4.33855169e-02 9.56504345e-01 1.10142108e-04]
 [5.57118028e-07 4.09455830e-03 9.95904863e-01]]
First 10 Actual

# Train the Model



In [109]:
from sklearn.ensemble import RandomForestClassifier

In [110]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X, y)
rf.score(X, y)

1.0

In [111]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [112]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [113]:
model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=20))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=3, activation='softmax'))

In [114]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [115]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2100      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 303       
Total params: 12,503
Trainable params: 12,503
Non-trainable params: 0
_________________________________________________________________


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [16]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
model = LogisticRegression(solver="liblinear")
grid = GridSearchCV(model, param_grid, verbose=3)

In [17]:
# Train the model with GridSearch
grid.fit(X_train_scaled, y_train.ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV 1/5] END ................................C=1, penalty=l1; total time=   2.7s
[CV 2/5] END ................................C=1, penalty=l1; total time=   4.4s
[CV 3/5] END ................................C=1, penalty=l1; total time=   2.9s




[CV 4/5] END ................................C=1, penalty=l1; total time=   2.4s
[CV 5/5] END ................................C=1, penalty=l1; total time=   3.5s
[CV 1/5] END ................................C=1, penalty=l2; total time=   0.0s
[CV 2/5] END ................................C=1, penalty=l2; total time=   0.0s
[CV 3/5] END ................................C=1, penalty=l2; total time=   0.0s
[CV 4/5] END ................................C=1, penalty=l2; total time=   0.0s
[CV 5/5] END ................................C=1, penalty=l2; total time=   0.0s




[CV 1/5] END ................................C=5, penalty=l1; total time=  15.9s




[CV 2/5] END ................................C=5, penalty=l1; total time=  13.1s




[CV 3/5] END ................................C=5, penalty=l1; total time=  18.9s




[CV 4/5] END ................................C=5, penalty=l1; total time=  11.8s




[CV 5/5] END ................................C=5, penalty=l1; total time=  13.2s
[CV 1/5] END ................................C=5, penalty=l2; total time=   0.0s
[CV 2/5] END ................................C=5, penalty=l2; total time=   0.0s
[CV 3/5] END ................................C=5, penalty=l2; total time=   0.0s
[CV 4/5] END ................................C=5, penalty=l2; total time=   0.0s
[CV 5/5] END ................................C=5, penalty=l2; total time=   0.0s


KeyboardInterrupt: 

In [None]:
print(grid.best_params_)
print(grid.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)