# Preprocessing

In [2]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### Simple Cleaning

In [3]:
df = pd.read_csv("Resources/exoplanet_data.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


In [4]:
# find out exactly what candidate means 
df['koi_disposition'].unique()

array(['CONFIRMED', 'FALSE POSITIVE', 'CANDIDATE'], dtype=object)

In [5]:
# Set features
target = df['koi_disposition']
features = df.drop(columns=['koi_disposition'])

In [25]:
# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)
# X_train, X_, y_train, y_ = train_test_split(features, target, test_size=0.4, random_state=0)
# X_validation, X_test, y_validation, y_test = train_test_split(X_, y_, test_size=0.5, random_state=0)

Preprocess the Data

Preprocess the dataset prior to fitting the model.

Perform feature selection and remove unnecessary features.

Use MinMaxScaler to scale the numerical data.

Separate the data into training and testing data.


Tune Model Parameters

Use GridSearch to tune model parameters.

Tune and compare at least two different classifiers.

In [26]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler
X_scale = MinMaxScaler().fit(X_train)

X_train_scaled = X_scale.transform(X_train)
X_test_scaled = X_scale.transform(X_test)
# X_validation_scaled = X_scale.transform(X_validation)

In [27]:
# create decision tree model
from sklearn import tree
dtree = tree.DecisionTreeClassifier()
dtree = dtree.fit(X_train, y_train)

In [28]:
# Test decision tree model
# noooooooooooooooooooooooooo
training_score = dtree.score(X_train, y_train)
testing_score = dtree.score(X_test, y_test)
print(f'training_score: '+str(training_score)+ ' testing_score:' +str(testing_score))

training_score: 1.0 testing_score:0.8466819221967964


In [21]:
# Determine importance of each feature
feature_importance = dtree.feature_importances_
feature_importance

array([0.19386961, 0.16822121, 0.1848859 , 0.03306815, 0.01329742,
       0.00201191, 0.00324464, 0.01229933, 0.00534522, 0.00491867,
       0.03199721, 0.00757726, 0.00599815, 0.01466096, 0.00583335,
       0.00157886, 0.00742689, 0.00274118, 0.00543288, 0.01463891,
       0.00589529, 0.0049473 , 0.00305406, 0.00200403, 0.00510656,
       0.00599038, 0.14663399, 0.00907765, 0.00506754, 0.00890015,
       0.00832139, 0.00675725, 0.00827917, 0.00939449, 0.00814288,
       0.00413685, 0.0028515 , 0.01469183, 0.01176342, 0.00993656])

In [35]:
feature_importance = sorted(zip(feature_importance, features.columns), reverse=True)
feature_importance[0:10]

[((0.19386960974939993, 'koi_fpflag_nt'), 'koi_fpflag_nt'),
 ((0.1848859014579099, 'koi_fpflag_co'), 'koi_fpflag_ss'),
 ((0.16822120580171573, 'koi_fpflag_ss'), 'koi_fpflag_co'),
 ((0.1466339873593977, 'koi_model_snr'), 'koi_fpflag_ec'),
 ((0.03306815098044292, 'koi_fpflag_ec'), 'koi_period'),
 ((0.03199721356795746, 'koi_impact'), 'koi_period_err1'),
 ((0.01469182941842192, 'ra'), 'koi_period_err2'),
 ((0.014660960933375013, 'koi_duration'), 'koi_time0bk'),
 ((0.014638907158986466, 'koi_prad'), 'koi_time0bk_err1'),
 ((0.013297423372836554, 'koi_period'), 'koi_time0bk_err2')]

### Different Split

In [29]:
# First attempt was overfit sooooo hopefully this is better
X_train, X_, y_train, y_ = train_test_split(features, target, test_size=0.4, random_state=0)
X_validation, X_test, y_validation, y_test = train_test_split(X_, y_, test_size=0.5, random_state=0)

In [37]:
# Rescale 
X_scale = MinMaxScaler().fit(X_train)
X_train_scaled = X_scale.transform(X_train)
X_validation_scaled = X_scale.transform(X_validation)
X_test_scaled = X_scale.transform(X_test)

In [32]:
dtree2 = tree.DecisionTreeClassifier()
dtree2 = dtree2.fit(X_train, y_train)

train_score = dtree2.score(X_train, y_train)
validation_score = dtree2.score(X_validation, y_validation)
test_score = dtree2.score(X_test, y_test)
print(f'train_score: '+str(train_score)+ ' validation_score:' +str(validation_score)+ ' test_score:' +str(test_score))

train_score: 1.0 validation_score:0.8562231759656652 test_score:0.8548963545389564


In [34]:
# Determine importance of each feature
feature_importance_2 = dtree2.feature_importances_
feature_importance_2 = sorted(zip(feature_importance_2, features.columns), reverse=True)

feature_importance_2[0:10]

[(0.1915162348666469, 'koi_fpflag_nt'),
 (0.18915032629421094, 'koi_fpflag_co'),
 (0.16874260021907506, 'koi_fpflag_ss'),
 (0.14728246928064673, 'koi_model_snr'),
 (0.03163593064040356, 'koi_fpflag_ec'),
 (0.02921882792069679, 'koi_impact'),
 (0.021932348396797658, 'dec'),
 (0.01785844961879119, 'koi_duration'),
 (0.014519405508687157, 'koi_steff_err1'),
 (0.014200002198135305, 'koi_prad_err1')]

### Another tree

In [38]:
# build and fit
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=250)
rf = rf.fit(X_train, y_train)

In [40]:
# evaluate model
rf_train_score = rf.score(X_train, y_train)
rf_validation_score = rf.score(X_validation, y_validation)
rf_test_score = rf.score(X_test, y_test)
print(f'train_score: '+str(rf_train_score)+ ' validation_score:' +str(rf_validation_score)+ ' test_score:' +str(rf_test_score))

train_score: 1.0 validation_score:0.8855507868383404 test_score:0.8977841315225161


# Create a Train Test Split

Use `koi_disposition` for the y values

In [None]:
X_train.head()

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [None]:
# Scale your data

# Train the Model



In [None]:
print(f"Training Data Score: {model2.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model2.score(X_test_scaled, y_test)}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model

In [None]:
# Train the model with GridSearch

In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)