In [1]:
# Import packages
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Sklearn Evaluation Metrics
from sklearn import metrics
from sklearn.metrics import mean_squared_error, precision_score, confusion_matrix, accuracy_score

# Visualizes all the columns
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the dataset; the file name is resolved relative to the
# notebook's current working directory.
dataset_file = 'cleaned.csv'
df = pd.read_csv(dataset_file)


In [3]:
# Columns left out of the model inputs (the last one is the prediction target;
# the others look like identifiers / label-related fields — confirm).
excluded_columns = ['row', 'kepid', 'koi_disposition', 'koi_score', 'ExoplanetCandidate']

features = df.drop(columns=excluded_columns)
target = df['ExoplanetCandidate']

In [4]:
# Downcast every float64 column of df to float32.
# NOTE(review): `features` / `target` were derived from df in the previous cell
# (df.drop returns a new frame), so this downcast likely does not reach the
# data that is split and fed to the models below — confirm the intended order.
float64_block = df.select_dtypes(np.float64)
df[float64_block.columns] = float64_block.astype(np.float32)

In [5]:
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=.30,
    random_state=1,
)

In [6]:
# Evaluation function

def evaluation(y_true, y_pred):
    """Print classification metrics and the confusion matrix for predictions.

    All metrics are computed from the ``y_true`` argument. Previously the four
    scalar metrics read the notebook-level ``y_test`` instead, so calling this
    function on any other split (e.g. the training set) produced wrong numbers
    or a length-mismatch error.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    None
        The results are written to stdout only.
    """
    # Accuracy, recall, F1 score and precision, all against the labels passed in.
    print('Evaluation Metrics:')
    print('Accuracy: ' + str(metrics.accuracy_score(y_true, y_pred)))
    print('Recall: ' + str(metrics.recall_score(y_true, y_pred)))
    print('F1 Score: ' + str(metrics.f1_score(y_true, y_pred)))
    print('Precision: ' + str(metrics.precision_score(y_true, y_pred)))

    # Confusion matrix flattened row by row; the printed header matches that
    # order for a binary problem.
    print('\nConfusion Matrix:')
    print(' TN,  FP, FN, TP')
    print(confusion_matrix(y_true, y_pred).ravel())
    
# Helper for reporting the outcome of a hyper-parameter search
def print_results(results):
    """Print the ``best_params_`` attribute of ``results`` followed by a blank line.

    Parameters
    ----------
    results : object
        Any object exposing ``best_params_`` (e.g. a fitted GridSearchCV).
    """
    best = results.best_params_
    summary = 'Best Parameters: {}\n'.format(best)
    print(summary)
    

In [7]:
# Logistic regression with balanced class weights
lr = LogisticRegression(
    C=100,
    max_iter=200,
    class_weight='balanced',
)

# Train on the training split and predict the held-out split in one chain
# (fit returns the estimator itself)
y_pred = lr.fit(X_train, y_train).predict(X_test)

# Report metrics for the held-out predictions
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.5849858356940509
Recall: 0.6760299625468165
F1 Score: 0.7113300492610837
Precision: 0.7505197505197505

Confusion Matrix:
 TN,  FP, FN, TP
[ 52 120 173 361]


In [8]:
# K-nearest-neighbours classifier using Manhattan distance and uniform votes
knn = KNeighborsClassifier(
    leaf_size=8,
    metric='manhattan',
    weights='uniform',
)

# Train on the training split and predict the held-out split in one chain
# (fit returns the estimator itself)
y_pred = knn.fit(X_train, y_train).predict(X_test)

# Report metrics for the held-out predictions
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.7082152974504249
Recall: 0.9063670411985019
F1 Score: 0.8245315161839863
Precision: 0.75625

Confusion Matrix:
 TN,  FP, FN, TP
[ 16 156  50 484]


In [9]:
# Decision tree classifier. random_state is pinned (same seed as the
# train/test split) because scikit-learn permutes the candidate features at
# each split; without a seed the fitted tree and the metrics printed below can
# change from one run to the next.
tree = DecisionTreeClassifier(random_state=1)

# Fitting Model to the train set
tree.fit(X_train, y_train)

# Predicting on the test set
y_pred = tree.predict(X_test)

# Evaluating model
evaluation(y_test, y_pred)

Evaluation Metrics:
Accuracy: 0.6614730878186968
Recall: 0.7827715355805244
F1 Score: 0.7776744186046511
Precision: 0.7726432532347505

Confusion Matrix:
 TN,  FP, FN, TP
[ 49 123 116 418]


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm.notebook import tqdm
from time import sleep

In [11]:
# Forest sizes to evaluate; `forest` holds the last fitted model after the loop.
parameter_n_estimators = [100]
for n_trees in tqdm(parameter_n_estimators):
    # random_state is pinned (same seed as the train/test split) so that the
    # bootstrap samples and feature subsets -- and therefore the model that is
    # saved to disk later -- are reproducible across kernel restarts.
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        criterion='gini',
        random_state=1,
    )
    # Fitting Model to the train set
    forest.fit(X_train, y_train)
    # Predicting on the test set
    y_pred = forest.predict(X_test)

    # Evaluating model
    evaluation(y_test, y_pred)
    print('Tree: %s ' % (n_trees))

  0%|          | 0/1 [00:00<?, ?it/s]

Evaluation Metrics:
Accuracy: 0.6869688385269122
Recall: 0.8426966292134831
F1 Score: 0.8028545941123997
Precision: 0.7666098807495741

Confusion Matrix:
 TN,  FP, FN, TP
[ 35 137  84 450]
Tree: 100 


In [12]:
import joblib

In [17]:
import os

# Persist the fitted random forest. The target directory is created first so
# the cell also works on a fresh checkout where it does not exist yet.
model_path = 'trained_models/ml_model3.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(forest, model_path)

['trained_models/ml_model3.joblib']