# <span style="color:slateblue"><b>Import Dependencies

In [1]:
# Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import os 
import seaborn as sn
import scipy.stats as sp
import itertools
import pickle
from imblearn.ensemble import BalancedRandomForestClassifier


import plotly.express as px
import plotly.graph_objects as go

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline

import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

import sklearn as skl
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, balanced_accuracy_score, accuracy_score, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier, KNeighborsRegressor

from imblearn.metrics import classification_report_imbalanced

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

import joblib
from tensorflow.keras.utils import to_categorical

# <span style="color:slateblue"><b>Data Load</span>

In [2]:
# file paths
file_path_Raw = "./Resources/cumulative.csv"
file_path_Pickles = "./Pickles/"

In [3]:
with open(file_path_Pickles+'kepler_processed.pkl', 'rb') as pickle_file:
    keplerProcessed_df= pickle.load(pickle_file)

In [4]:
keplerProcessed_df

Unnamed: 0,Exoplanet_Archive_Disposition,Not_Transit-Like_FPF,Stellar_Eclipse_FPF,Centroid_Offset_FPF,Ephemeris_Match_Indicates_Contamination_FPF,Orbital_Period_[days],Transit_Epoch_[BKJD],Impact_Parameter,Transit_Duration_[hrs],Transit_Depth_[ppm],...,TCE_Planet_Number,Stellar_Effective_Temperature_[K],Stellar_Surface_Gravity,Stellar_Radius_[Solar_radii],right_ascension,declination,Kepler_band [mag],TCE_Delivery_q1_q16_tce,TCE_Delivery_q1_q17_dr24_tce,TCE_Delivery_q1_q17_dr25_tce
K00752.01,1,0,0,0,0,9.488036,170.538750,0.146,2.95750,615.8,...,1.0,5455.0,4.467,0.927,291.93423,48.141651,15.347,0,0,1
K00752.02,1,0,0,0,0,54.418383,162.513840,0.586,4.50700,874.8,...,2.0,5455.0,4.467,0.927,291.93423,48.141651,15.347,0,0,1
K00753.01,2,0,1,0,0,19.899140,175.850252,0.969,1.78220,10829.0,...,1.0,5853.0,4.544,0.868,297.00482,48.134129,15.436,0,0,1
K00754.01,2,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,...,1.0,5805.0,4.564,0.791,285.53461,48.285210,15.597,0,0,1
K00755.01,1,0,0,0,0,2.525592,171.595550,0.701,1.65450,603.3,...,1.0,6031.0,4.438,1.046,288.75488,48.226200,15.509,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
K07984.01,2,0,0,0,1,8.589871,132.016100,0.765,4.80600,87.7,...,1.0,5638.0,4.296,1.088,298.74921,46.973351,14.478,0,0,1
K07985.01,2,0,1,1,0,0.527699,131.705093,1.252,3.22210,1579.2,...,1.0,5638.0,4.529,0.903,297.18875,47.093819,14.082,0,0,1
K07986.01,0,0,0,0,0,1.739849,133.001270,0.043,3.11400,48.5,...,1.0,6119.0,4.444,1.031,286.50937,47.163219,14.757,0,0,1
K07987.01,2,0,0,1,0,0.681402,132.181750,0.147,0.86500,103.6,...,1.0,6173.0,4.447,1.041,294.16489,47.176281,15.385,0,0,1


### <span style="color:orange"> Preprocessing

### Separate the Features (X) from the Target (y)

In [5]:
y = keplerProcessed_df["Exoplanet_Archive_Disposition"]
X = keplerProcessed_df.drop(["Exoplanet_Archive_Disposition"], axis =1)

### Split our data into training and testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(6708, 23)

### Scale data

In [7]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# <span style="color:slateblue"><b>Supervised ML Logistic Regression Model

In [8]:
# Create the model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [9]:
# Train model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [10]:
# List the features sorted in descending order by feature importance
x = sorted(zip(classifier.coef_[0], X.columns), reverse=True)
featureImp = pd.DataFrame(np.array(x).reshape(len(x),2), columns = list(["Importance","Feature"]))
featureImp

Unnamed: 0,Importance,Feature
0,1.4194370887669014,Transit_Depth_[ppm]
1,0.7182450900781062,Stellar_Radius_[Solar_radii]
2,0.5530591539637524,Impact_Parameter
3,0.3202506668688165,Orbital_Period_[days]
4,0.2748571489915717,Equilibrium_Temperature_[K]
5,0.138097944100539,TCE_Delivery_q1_q17_dr24_tce
6,0.0658106644670683,Stellar_Surface_Gravity
7,0.0487578858763507,TCE_Delivery_q1_q16_tce
8,0.0436907421300663,Stellar_Effective_Temperature_[K]
9,0.0429458856094524,Kepler_band [mag]


In [11]:
# Make predictions
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,1
3,2,2
4,2,2
...,...,...
2232,0,0
2233,2,2
2234,0,1
2235,2,2


### Evaluate the model

In [12]:
# from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.8323647742512293


In [13]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix
confusion_matrix_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Predicted 2"])
confusion_matrix_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,307,219,8
Actual 1,135,426,11
Actual 2,2,0,1129


### Classfication Report

In [14]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.69      0.57      0.63       534
           1       0.66      0.74      0.70       572
           2       0.98      1.00      0.99      1131

    accuracy                           0.83      2237
   macro avg       0.78      0.77      0.77      2237
weighted avg       0.83      0.83      0.83      2237



# <span style="color:slateblue"><b>GRADIENT BOOSTED TREE

In [15]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=.25, max_depth=3, random_state=0)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.870
Accuracy score (validation): 0.857

Learning rate:  0.1
Accuracy score (training): 0.882
Accuracy score (validation): 0.878

Learning rate:  0.25
Accuracy score (training): 0.904
Accuracy score (validation): 0.894

Learning rate:  0.5
Accuracy score (training): 0.916
Accuracy score (validation): 0.898

Learning rate:  0.75
Accuracy score (training): 0.924
Accuracy score (validation): 0.894

Learning rate:  1
Accuracy score (training): 0.935
Accuracy score (validation): 0.896



In [16]:
# Using the learning_rate value from above (is this instance they are all the same), instantiate a model, train it, then create predictions
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=.5, max_depth=3, random_state=0)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

### Evaluate the model

In [17]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.9016540008940546


In [18]:
cm = confusion_matrix(y_test, predictions)
cm_GBT_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1", "Actual 2"],
   columns=["Predicted 0", "Predicted 1", "Predicted 2"]
)
cm_GBT_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,426,101,7
Actual 1,95,465,12
Actual 2,5,0,1126


In [19]:
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       534
           1       0.82      0.81      0.82       572
           2       0.98      1.00      0.99      1131

    accuracy                           0.90      2237
   macro avg       0.87      0.87      0.87      2237
weighted avg       0.90      0.90      0.90      2237



# <span style="color:slateblue"><b>Balanced Random Forest Classifier

In [20]:
# Resample and fitting the training data with the BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
brf_model.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [21]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.8652250607887958

In [22]:
# Calculating the confusion matrix.
cm_brfc = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix
confusion_matrix_brfc_df = pd.DataFrame(cm_brfc, index=["Actual 0", "Actual 1", "Actual 2"], columns=["Predicted 0", "Predicted 1", "Predicted 2"])

confusion_matrix_brfc_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,425,97,12
Actual 1,103,459,10
Actual 2,2,1,1128


### Evaluate the model

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.80      0.80      0.94      0.80      0.86      0.74       534
          1       0.82      0.80      0.94      0.81      0.87      0.74       572
          2       0.98      1.00      0.98      0.99      0.99      0.98      1131

avg / total       0.90      0.90      0.96      0.90      0.93      0.86      2237



# <span style="color:slateblue"><b>MODELS RESHAPING FOR THE WEB APPLICATION

### Cutting down some features and exporting the trained mode
### The scaler parameters is the same that the one export in Neural Network Notebbok

In [24]:
y = keplerProcessed_df["Exoplanet_Archive_Disposition"]
X = keplerProcessed_df.drop(["Exoplanet_Archive_Disposition","TCE_Planet_Number","right_ascension","declination","TCE_Delivery_q1_q16_tce","TCE_Delivery_q1_q17_dr24_tce","TCE_Delivery_q1_q17_dr25_tce"], axis =1)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(6708, 17)

In [26]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### <span style="color:slateblue"><b>Supervised ML Logistic Regression Model (WEBAPP)

In [27]:
# Create the model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [28]:
# Train model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [29]:
# from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))

0.8994188645507376


<b>Small increase in accuracy: 0.8971837282074206 vs. 8292355833705856 previously</b>

In [30]:
# # Exporting the model
# save_location=os.path.join("webapp","model","SLR.pkl")
# joblib.dump(classifier, save_location)

### <span style="color:slateblue"><b>GRADIENT BOOSTED TREE (WEBAPP)

In [31]:
# Create the model
classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.5, max_features=.5, max_depth=3, random_state=0)

classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [32]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.8976307554760841


<b>Small decrease in accuracy: 0.8936075100581136 vs. 0.902101028162718 previously</b>

In [33]:
# # Exporting the model
# save_location=os.path.join("webapp","model","GBT.pkl")
# joblib.dump(classifier, save_location)

### <span style="color:slateblue"><b>Balanced Random Forest Classifier (WEBAPP)

In [34]:
# Resample and fitting the training data with the BalancedRandomForestClassifier
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1) 
brf_model.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [35]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.8651443803516644

<b>Slightly increase in accuracy: 0.863978879186163 vs. 0.8619380855917083 previously</b>

In [36]:
# # Exporting the model
# save_location=os.path.join("webapp","model","BRF.pkl")
# joblib.dump(brf_model, save_location)