In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade



In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [4]:
df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [5]:
# # Determine which columns to use for the model.  Removed the Error ratings
list(df.columns)


['koi_disposition',
 'koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec',
 'koi_period',
 'koi_period_err1',
 'koi_period_err2',
 'koi_time0bk',
 'koi_time0bk_err1',
 'koi_time0bk_err2',
 'koi_impact',
 'koi_impact_err1',
 'koi_impact_err2',
 'koi_duration',
 'koi_duration_err1',
 'koi_duration_err2',
 'koi_depth',
 'koi_depth_err1',
 'koi_depth_err2',
 'koi_prad',
 'koi_prad_err1',
 'koi_prad_err2',
 'koi_teq',
 'koi_insol',
 'koi_insol_err1',
 'koi_insol_err2',
 'koi_model_snr',
 'koi_tce_plnt_num',
 'koi_steff',
 'koi_steff_err1',
 'koi_steff_err2',
 'koi_slogg',
 'koi_slogg_err1',
 'koi_slogg_err2',
 'koi_srad',
 'koi_srad_err1',
 'koi_srad_err2',
 'ra',
 'dec',
 'koi_kepmag']

# Select your features (columns)

In [6]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Set features. This will also be used as your x values.
selected_features = df[[
#  'koi_fpflag_nt',
#  'koi_fpflag_ss',
#  'koi_fpflag_co',
#  'koi_fpflag_ec',
 'koi_disposition',
 'koi_period',
 'koi_time0bk',
 'koi_impact',
 'koi_duration',
 'koi_depth',
 'koi_prad',
 'koi_teq',
 'koi_insol',
 'koi_model_snr',
 'koi_tce_plnt_num',
 'koi_steff',
 'koi_slogg',
 'koi_srad',
 'ra',
 'dec',
 'koi_kepmag']]

selected_features

Unnamed: 0,koi_disposition,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,54.418383,162.513840,0.586,4.50700,874.8,2.83,443,9.11,25.8,2,5455,4.467,0.927,291.93423,48.141651,15.347
1,FALSE POSITIVE,19.899140,175.850252,0.969,1.78220,10829.0,14.60,638,39.30,76.3,1,5853,4.544,0.868,297.00482,48.134129,15.436
2,FALSE POSITIVE,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395,891.96,505.6,1,5805,4.564,0.791,285.53461,48.285210,15.597
3,CONFIRMED,2.525592,171.595550,0.701,1.65450,603.3,2.75,1406,926.16,40.9,1,6031,4.438,1.046,288.75488,48.226200,15.509
4,CONFIRMED,4.134435,172.979370,0.762,3.14020,686.0,2.77,1160,427.65,40.2,2,6046,4.486,0.972,296.28613,48.224670,15.714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986,FALSE POSITIVE,8.589871,132.016100,0.765,4.80600,87.7,1.11,929,176.40,8.4,1,5638,4.296,1.088,298.74921,46.973351,14.478
6987,FALSE POSITIVE,0.527699,131.705093,1.252,3.22210,1579.2,29.35,2088,4500.53,453.3,1,5638,4.529,0.903,297.18875,47.093819,14.082
6988,CANDIDATE,1.739849,133.001270,0.043,3.11400,48.5,0.72,1608,1585.81,10.6,1,6119,4.444,1.031,286.50937,47.163219,14.757
6989,FALSE POSITIVE,0.681402,132.181750,0.147,0.86500,103.6,1.07,2218,5713.41,12.3,1,6173,4.447,1.041,294.16489,47.176281,15.385


In [8]:
target = selected_features['koi_disposition']
data = selected_features.drop('koi_disposition', axis=1)

X=data
y = pd.get_dummies(target)
# y=target

y

Unnamed: 0,CANDIDATE,CONFIRMED,FALSE POSITIVE
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
6986,0,0,1
6987,0,0,1
6988,1,0,0
6989,0,0,1


# Create a Train Test Split

Use `koi_disposition` for the y values

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# X_test

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [11]:
# Scale your data

from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)



In [12]:
# scaler based on training is applied to the test data set

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



# Model 1: Random Forest 



In [13]:
#Using Random Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=200)

model_rf.fit(X_train_scaled, y_train)
model_rf.score(X_train_scaled, y_train)

1.0

In [14]:
# Calculate feature importance & sort
importances = model_rf.feature_importances_
# importances
sorted(zip(model_rf.feature_importances_, selected_features), reverse=True)

[(0.1454281601440294, 'koi_insol'),
 (0.11801007266575513, 'koi_depth'),
 (0.08309872209739975, 'koi_disposition'),
 (0.08160765782387402, 'koi_time0bk'),
 (0.0792555725598481, 'koi_duration'),
 (0.06802166562503706, 'koi_impact'),
 (0.06129626017216969, 'koi_prad'),
 (0.05502859914938828, 'koi_period'),
 (0.05472140905310959, 'koi_teq'),
 (0.046493443163233686, 'koi_srad'),
 (0.04210215773242442, 'koi_tce_plnt_num'),
 (0.038503283184632996, 'ra'),
 (0.03825304640787834, 'dec'),
 (0.03704698609083124, 'koi_slogg'),
 (0.035352058993758574, 'koi_steff'),
 (0.0157809051366298, 'koi_model_snr')]

In [15]:
#Create the predicted model
y_predicted = model_rf.predict(X_test_scaled)
y_predicted



array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 1]], dtype=uint8)

In [16]:
# Calculate classification report for model predictions

from sklearn.metrics import classification_report
print(classification_report(y_test,y_predicted))

              precision    recall  f1-score   support

           0       0.61      0.38      0.47       411
           1       0.81      0.72      0.76       484
           2       0.84      0.81      0.82       853

   micro avg       0.79      0.68      0.73      1748
   macro avg       0.75      0.64      0.69      1748
weighted avg       0.78      0.68      0.72      1748
 samples avg       0.68      0.68      0.68      1748



  _warn_prf(average, modifier, msg_start, len(result))


# Hyperparameter Tuning


Two Step Tuning Process: 
1) Used RandomSearchCV to determine best parameters for the GridSeachCV tuning.
2) Used GridSearchCV to tune the model's parameters.

In [17]:
# Look at parameters used by our current forest
from pprint import pprint
print('Parameters currently in use:\n')
pprint(model_rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [18]:
# Tuning the Random Forest Model

import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [29]:
# Random Search training of the model.  These parameters will also be used for GridSearchCV

from sklearn.ensemble import RandomForestRegressor
# Use the random grid to search for best hyperparameters
# First create the base model to tune
model_rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_rf_random = RandomizedSearchCV(estimator = model_rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# # Fit the random search model
model_rf_random.fit(X_train_scaled, y_train)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [30]:
#Get the best parameters for the GridSearchSVC Tuning
model_rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': False}

In [31]:
#Score the random model
model_rf_random.score(X_train_scaled, y_train)


0.8717412939796795

In [33]:
#Create the predicted model
y_predicted_random = model_rf_random.predict(X_test_scaled)
y_predicted_random


array([[0.29333673, 0.12972109, 0.57694218],
       [0.47962755, 0.05237925, 0.4679932 ],
       [0.27104252, 0.23151701, 0.49744048],
       ...,
       [0.26886905, 0.5697619 , 0.16136905],
       [0.26164796, 0.51786395, 0.2204881 ],
       [0.19552721, 0.21684184, 0.58763095]])

In [35]:
# Evaluate Random Search to determine if a better model was created
from sklearn.ensemble import RandomForestRegressor

def evaluate(model_rf, X_test_scaled, y_test):
    predictions = model_rf.predict(X_test_scaled)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
#     print(y_binary_train)
    print(np.mean(errors))
    print(accuracy)
#     print('Average Error: {:0.4f} degrees.'(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'(accuracy))
#     print(accuracy)
#     print(errors)
    
    return accuracy

# Base model Accuracy
model_rf = RandomForestRegressor(n_estimators = 10, random_state = 42)
model_rf.fit(X_train_scaled, y_train)
base_accuracy = evaluate(model_rf, X_test_scaled, y_test)

from sklearn.model_selection import GridSearchCV

# Random model accuracy
# best_random = model_rf_random.best_estimator_
random_accuracy = evaluate(model_rf_random, X_test_scaled, y_test)


Model Performance
CANDIDATE         0.271739
CONFIRMED         0.181007
FALSE POSITIVE    0.248513
dtype: float64
CANDIDATE        -inf
CONFIRMED        -inf
FALSE POSITIVE   -inf
dtype: float64
Model Performance
CANDIDATE         0.272905
CONFIRMED         0.186649
FALSE POSITIVE    0.246021
dtype: float64
CANDIDATE        -inf
CONFIRMED        -inf
FALSE POSITIVE   -inf
dtype: float64


In [37]:
# Using the paprameters established in Random Search in Grid Search to further hypertune the model

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
model_rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(model_rf, param_grid, verbose = 3)

In [38]:
# Fit the grid search to the data

grid_search.fit(X_train_scaled, y_train)

grid_search.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 80,
 'bootstrap': False}

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_scaled, y_test)


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.0s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=2

[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.5s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.2s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.7s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   1.1s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.5s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.1s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.4s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   9.8s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 2/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 3/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 4/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 1/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=80, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   9.6s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.8s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.4s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.9s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.1s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   6.0s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   1.3s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.5s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   9.6s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.8s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.4s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 2/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 3/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV 1/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=200; total time=   1.5s
[CV 2/5] END bootstrap=True, max_depth=

[CV 5/5] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   6.8s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   1.0s
[CV 2/5] END bootstrap=True, max_

[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.7s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.4s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 1/5] END bootstrap=True, m

[CV 3/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.2s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.4s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.0s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV 5/5] END bootstrap=True,

[CV 2/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.7s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.4s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.9s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   6.4s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.8s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.8s
[CV 4/5] END bootstrap=Tru

[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.3s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.1s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.3s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   6.8s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   6.9s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 3/5] END bootstrap=T

[CV 5/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.1s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.2s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.3s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.2s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   7.4s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV 2/5] END bootstrap=

[CV 4/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 1/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   7.2s
[CV 2/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   7.0s
[CV 3/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   7.2s
[CV 4/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   6.8s
[CV 5/5] END bootstrap=True, max_depth=100, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   7.0s
[CV 1/5] END bootstrap

[CV 3/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.7s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.6s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 5/5] END bootstrap=

[CV 2/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   1.5s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.4s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=4, min_samples_split=12, n_estimators=1000; total time=   5.5s
[CV 4/5] END bootstrap=T

[CV 1/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   1.5s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   1.6s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   1.5s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   1.7s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.4s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=2, min_samples_leaf=5, min_samples_split=12, n_estimators=1000; total time=   5.0s
[CV 3/5] END bootstrap=Tr

[CV 5/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=200; total time=   1.3s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   7.1s
[CV 2/5] END bootstrap=Tru

[CV 4/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=200; total time=   1.6s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=200; total time=   1.5s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.4s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.4s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.4s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.3s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=4, min_samples_split=12, n_estimators=300; total time=   2.4s
[CV 1/5] END bootstrap=True

[CV 3/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=200; total time=   1.4s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=200; total time=   1.3s
[CV 5/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=200; total time=   1.3s
[CV 1/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 2/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 3/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.1s
[CV 4/5] END bootstrap=True, max_depth=110, max_features=3, min_samples_leaf=5, min_samples_split=12, n_estimators=300; total time=   2.0s
[CV 5/5] END bootstrap=True

# Model 2: Neural Network Model 


In [41]:
X=data

y_nn=target


In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_nn_train, y_nn_test = train_test_split(X, y_nn, random_state=42)

# X_test

In [53]:
# Once Hot Encoding of Y

from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder_y_nn_train = LabelEncoder()
label_encoder_y_nn_test = LabelEncoder()
label_encoder_y_nn_train.fit(y_nn_train)
label_encoder_y_nn_test.fit(y_nn_test)

encoded_y_train = label_encoder_y_nn_train.transform(y_nn_train)
encoded_y_test = label_encoder_y_nn_test.transform(y_nn_test)
encoded_y_test


array([2, 0, 2, ..., 1, 1, 1])

In [54]:
for label, original_class in zip(encoded_y_test, y_nn_test):
    print('Original Class: ' + str(original_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Clas

------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CANDIDATE
Encoded Label: 0
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: CONFIRMED
Encoded Label: 1
------------
Original Class: FALSE POSITIVE
Encoded Label: 2
------------
Original Class: FALSE POSITIVE
Encoded Label: 

In [56]:
from keras.utils import to_categorical

# Step 2: One-hot encoding

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

y_test_categorical


array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [57]:
# Defining Model Architecture
from tensorflow.keras.models import Sequential
model_nn = Sequential()

from tensorflow.keras.layers import Dense
number_inputs = 16
number_hidden_nodes = 48
model_nn.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))



In [58]:
# Defining the Output Variables - Candidate, Confirmed, False Positive
number_classes = 3
model_nn.add(Dense(units=number_classes, activation='softmax'))

In [59]:
# Create a Model Summary

model_nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 48)                816       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 147       
Total params: 963
Trainable params: 963
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Compiling the model

model_nn.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [61]:
# Fit (train) the model
model_nn.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2
)

Train on 5243 samples
Epoch 1/1000
5243/5243 - 0s - loss: 1.0230 - accuracy: 0.4816
Epoch 2/1000
5243/5243 - 0s - loss: 0.9452 - accuracy: 0.5337
Epoch 3/1000
5243/5243 - 0s - loss: 0.8986 - accuracy: 0.5712
Epoch 4/1000
5243/5243 - 0s - loss: 0.8735 - accuracy: 0.5825
Epoch 5/1000
5243/5243 - 0s - loss: 0.8561 - accuracy: 0.5880
Epoch 6/1000
5243/5243 - 0s - loss: 0.8453 - accuracy: 0.5941
Epoch 7/1000
5243/5243 - 0s - loss: 0.8351 - accuracy: 0.6035
Epoch 8/1000
5243/5243 - 0s - loss: 0.8280 - accuracy: 0.6031
Epoch 9/1000
5243/5243 - 0s - loss: 0.8220 - accuracy: 0.6061
Epoch 10/1000
5243/5243 - 0s - loss: 0.8154 - accuracy: 0.6149
Epoch 11/1000
5243/5243 - 0s - loss: 0.8129 - accuracy: 0.6151
Epoch 12/1000
5243/5243 - 0s - loss: 0.8049 - accuracy: 0.6241
Epoch 13/1000
5243/5243 - 0s - loss: 0.8018 - accuracy: 0.6239
Epoch 14/1000
5243/5243 - 0s - loss: 0.7992 - accuracy: 0.6237
Epoch 15/1000
5243/5243 - 0s - loss: 0.7943 - accuracy: 0.6292
Epoch 16/1000
5243/5243 - 0s - loss: 0.792

Epoch 131/1000
5243/5243 - 0s - loss: 0.6592 - accuracy: 0.7215
Epoch 132/1000
5243/5243 - 0s - loss: 0.6561 - accuracy: 0.7183
Epoch 133/1000
5243/5243 - 0s - loss: 0.6566 - accuracy: 0.7189
Epoch 134/1000
5243/5243 - 0s - loss: 0.6538 - accuracy: 0.7206
Epoch 135/1000
5243/5243 - 0s - loss: 0.6553 - accuracy: 0.7223
Epoch 136/1000
5243/5243 - 0s - loss: 0.6535 - accuracy: 0.7177
Epoch 137/1000
5243/5243 - 0s - loss: 0.6535 - accuracy: 0.7204
Epoch 138/1000
5243/5243 - 0s - loss: 0.6539 - accuracy: 0.7173
Epoch 139/1000
5243/5243 - 0s - loss: 0.6534 - accuracy: 0.7198
Epoch 140/1000
5243/5243 - 0s - loss: 0.6521 - accuracy: 0.7196
Epoch 141/1000
5243/5243 - 0s - loss: 0.6529 - accuracy: 0.7213
Epoch 142/1000
5243/5243 - 0s - loss: 0.6529 - accuracy: 0.7213
Epoch 143/1000
5243/5243 - 0s - loss: 0.6521 - accuracy: 0.7175
Epoch 144/1000
5243/5243 - 0s - loss: 0.6509 - accuracy: 0.7240
Epoch 145/1000
5243/5243 - 0s - loss: 0.6491 - accuracy: 0.7198
Epoch 146/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.5947 - accuracy: 0.7446
Epoch 260/1000
5243/5243 - 0s - loss: 0.5924 - accuracy: 0.7547
Epoch 261/1000
5243/5243 - 0s - loss: 0.5928 - accuracy: 0.7524
Epoch 262/1000
5243/5243 - 0s - loss: 0.5945 - accuracy: 0.7471
Epoch 263/1000
5243/5243 - 0s - loss: 0.5917 - accuracy: 0.7536
Epoch 264/1000
5243/5243 - 0s - loss: 0.5916 - accuracy: 0.7505
Epoch 265/1000
5243/5243 - 0s - loss: 0.5923 - accuracy: 0.7545
Epoch 266/1000
5243/5243 - 0s - loss: 0.5894 - accuracy: 0.7580
Epoch 267/1000
5243/5243 - 0s - loss: 0.5877 - accuracy: 0.7608
Epoch 268/1000
5243/5243 - 0s - loss: 0.5908 - accuracy: 0.7526
Epoch 269/1000
5243/5243 - 0s - loss: 0.5871 - accuracy: 0.7503
Epoch 270/1000
5243/5243 - 0s - loss: 0.5919 - accuracy: 0.7488
Epoch 271/1000
5243/5243 - 0s - loss: 0.5896 - accuracy: 0.7536
Epoch 272/1000
5243/5243 - 0s - loss: 0.5908 - accuracy: 0.7530
Epoch 273/1000
5243/5243 - 0s - loss: 0.5878 - accuracy: 0.7517
Epoch 274/1000
5243/5243 - 0s - loss: 0.5871 - accuracy

Epoch 388/1000
5243/5243 - 0s - loss: 0.5579 - accuracy: 0.7702
Epoch 389/1000
5243/5243 - 0s - loss: 0.5568 - accuracy: 0.7719
Epoch 390/1000
5243/5243 - 0s - loss: 0.5556 - accuracy: 0.7761
Epoch 391/1000
5243/5243 - 0s - loss: 0.5592 - accuracy: 0.7690
Epoch 392/1000
5243/5243 - 0s - loss: 0.5604 - accuracy: 0.7665
Epoch 393/1000
5243/5243 - 0s - loss: 0.5593 - accuracy: 0.7715
Epoch 394/1000
5243/5243 - 0s - loss: 0.5558 - accuracy: 0.7717
Epoch 395/1000
5243/5243 - 0s - loss: 0.5581 - accuracy: 0.7702
Epoch 396/1000
5243/5243 - 0s - loss: 0.5591 - accuracy: 0.7696
Epoch 397/1000
5243/5243 - 0s - loss: 0.5594 - accuracy: 0.7706
Epoch 398/1000
5243/5243 - 0s - loss: 0.5566 - accuracy: 0.7715
Epoch 399/1000
5243/5243 - 0s - loss: 0.5562 - accuracy: 0.7726
Epoch 400/1000
5243/5243 - 0s - loss: 0.5599 - accuracy: 0.7690
Epoch 401/1000
5243/5243 - 0s - loss: 0.5582 - accuracy: 0.7702
Epoch 402/1000
5243/5243 - 0s - loss: 0.5577 - accuracy: 0.7704
Epoch 403/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.5433 - accuracy: 0.7776
Epoch 517/1000
5243/5243 - 0s - loss: 0.5448 - accuracy: 0.7765
Epoch 518/1000
5243/5243 - 0s - loss: 0.5428 - accuracy: 0.7807
Epoch 519/1000
5243/5243 - 0s - loss: 0.5418 - accuracy: 0.7770
Epoch 520/1000
5243/5243 - 0s - loss: 0.5418 - accuracy: 0.7793
Epoch 521/1000
5243/5243 - 0s - loss: 0.5418 - accuracy: 0.7757
Epoch 522/1000
5243/5243 - 0s - loss: 0.5395 - accuracy: 0.7782
Epoch 523/1000
5243/5243 - 0s - loss: 0.5401 - accuracy: 0.7824
Epoch 524/1000
5243/5243 - 0s - loss: 0.5401 - accuracy: 0.7786
Epoch 525/1000
5243/5243 - 0s - loss: 0.5421 - accuracy: 0.7778
Epoch 526/1000
5243/5243 - 0s - loss: 0.5399 - accuracy: 0.7831
Epoch 527/1000
5243/5243 - 0s - loss: 0.5422 - accuracy: 0.7738
Epoch 528/1000
5243/5243 - 0s - loss: 0.5466 - accuracy: 0.7767
Epoch 529/1000
5243/5243 - 0s - loss: 0.5402 - accuracy: 0.7749
Epoch 530/1000
5243/5243 - 0s - loss: 0.5407 - accuracy: 0.7782
Epoch 531/1000
5243/5243 - 0s - loss: 0.5399 - accuracy

Epoch 645/1000
5243/5243 - 0s - loss: 0.5316 - accuracy: 0.7807
Epoch 646/1000
5243/5243 - 0s - loss: 0.5285 - accuracy: 0.7843
Epoch 647/1000
5243/5243 - 0s - loss: 0.5310 - accuracy: 0.7839
Epoch 648/1000
5243/5243 - 0s - loss: 0.5301 - accuracy: 0.7816
Epoch 649/1000
5243/5243 - 0s - loss: 0.5293 - accuracy: 0.7831
Epoch 650/1000
5243/5243 - 0s - loss: 0.5321 - accuracy: 0.7810
Epoch 651/1000
5243/5243 - 0s - loss: 0.5333 - accuracy: 0.7824
Epoch 652/1000
5243/5243 - 0s - loss: 0.5293 - accuracy: 0.7797
Epoch 653/1000
5243/5243 - 0s - loss: 0.5281 - accuracy: 0.7826
Epoch 654/1000
5243/5243 - 0s - loss: 0.5301 - accuracy: 0.7809
Epoch 655/1000
5243/5243 - 0s - loss: 0.5346 - accuracy: 0.7791
Epoch 656/1000
5243/5243 - 0s - loss: 0.5293 - accuracy: 0.7816
Epoch 657/1000
5243/5243 - 0s - loss: 0.5292 - accuracy: 0.7850
Epoch 658/1000
5243/5243 - 0s - loss: 0.5294 - accuracy: 0.7822
Epoch 659/1000
5243/5243 - 0s - loss: 0.5302 - accuracy: 0.7847
Epoch 660/1000
5243/5243 - 0s - loss: 0.

5243/5243 - 0s - loss: 0.5246 - accuracy: 0.7849
Epoch 774/1000
5243/5243 - 0s - loss: 0.5230 - accuracy: 0.7885
Epoch 775/1000
5243/5243 - 0s - loss: 0.5239 - accuracy: 0.7850
Epoch 776/1000
5243/5243 - 0s - loss: 0.5243 - accuracy: 0.7845
Epoch 777/1000
5243/5243 - 0s - loss: 0.5232 - accuracy: 0.7862
Epoch 778/1000
5243/5243 - 0s - loss: 0.5260 - accuracy: 0.7841
Epoch 779/1000
5243/5243 - 0s - loss: 0.5220 - accuracy: 0.7860
Epoch 780/1000
5243/5243 - 0s - loss: 0.5220 - accuracy: 0.7841
Epoch 781/1000
5243/5243 - 0s - loss: 0.5235 - accuracy: 0.7871
Epoch 782/1000
5243/5243 - 0s - loss: 0.5214 - accuracy: 0.7856
Epoch 783/1000
5243/5243 - 0s - loss: 0.5250 - accuracy: 0.7824
Epoch 784/1000
5243/5243 - 0s - loss: 0.5232 - accuracy: 0.7860
Epoch 785/1000
5243/5243 - 0s - loss: 0.5254 - accuracy: 0.7835
Epoch 786/1000
5243/5243 - 0s - loss: 0.5229 - accuracy: 0.7812
Epoch 787/1000
5243/5243 - 0s - loss: 0.5227 - accuracy: 0.7835
Epoch 788/1000
5243/5243 - 0s - loss: 0.5246 - accuracy

Epoch 902/1000
5243/5243 - 0s - loss: 0.5188 - accuracy: 0.7850
Epoch 903/1000
5243/5243 - 0s - loss: 0.5160 - accuracy: 0.7902
Epoch 904/1000
5243/5243 - 0s - loss: 0.5158 - accuracy: 0.7881
Epoch 905/1000
5243/5243 - 0s - loss: 0.5186 - accuracy: 0.7885
Epoch 906/1000
5243/5243 - 0s - loss: 0.5193 - accuracy: 0.7870
Epoch 907/1000
5243/5243 - 0s - loss: 0.5154 - accuracy: 0.7873
Epoch 908/1000
5243/5243 - 0s - loss: 0.5185 - accuracy: 0.7805
Epoch 909/1000
5243/5243 - 0s - loss: 0.5166 - accuracy: 0.7892
Epoch 910/1000
5243/5243 - 0s - loss: 0.5167 - accuracy: 0.7881
Epoch 911/1000
5243/5243 - 0s - loss: 0.5151 - accuracy: 0.7883
Epoch 912/1000
5243/5243 - 0s - loss: 0.5170 - accuracy: 0.7843
Epoch 913/1000
5243/5243 - 0s - loss: 0.5196 - accuracy: 0.7879
Epoch 914/1000
5243/5243 - 0s - loss: 0.5159 - accuracy: 0.7879
Epoch 915/1000
5243/5243 - 0s - loss: 0.5198 - accuracy: 0.7849
Epoch 916/1000
5243/5243 - 0s - loss: 0.5154 - accuracy: 0.7913
Epoch 917/1000
5243/5243 - 0s - loss: 0.

<tensorflow.python.keras.callbacks.History at 0x2191cf5bda0>

In [62]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model_nn.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1748/1748 - 0s - loss: 0.6197 - accuracy: 0.7414
Loss: 0.6197039609097235, Accuracy: 0.7414187788963318


# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash

import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
#Tuning the Random Forest Model 



In [32]:
# Create the GridSearchCV model

# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
# grid_rf = GridSearchCV(model_rf, param_grid, verbose=3)
grid_nn = GridSearchCV(model_nn, param_grid, verbose=3)

grid_nn

GridSearchCV(estimator=<tensorflow.python.keras.engine.sequential.Sequential object at 0x000001530DB0C518>,
             param_grid={'C': [1, 5, 10, 50],
                         'gamma': [0.0001, 0.0005, 0.001, 0.005]},
             verbose=3)

In [33]:
# Train the model with GridSearch

# This will take the SVC model and try each combination of parameters
# grid_rf.fit(X_train, y_train)
grid_nn.fit(X_train, y_train)


TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <tensorflow.python.keras.engine.sequential.Sequential object at 0x000001530DB0C518> does not.

In [29]:
# List the best parameters for this dataset
print(grid_rf.best_params_)
print(grid_nn.best_params_)



AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
# Make predictions with the hypertuned model
prediction_rf = grid_rf.predict(X_test)
prediction_nn = grid_nn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,prediction_rf))
print(classification_report(y_test,prediction_nn))