In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in c:\users\test\anaconda3\envs\may2\lib\site-packages (0.0)


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [3]:
import pandas as pd

 ## Read the CSV and Perform Basic Data Cleaning    

In [4]:
# Load the exoplanet table, discard columns that are entirely null,
# then discard every row that still contains a null value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


## Select your features (columns)     

In [5]:
# Split the frame into the label (y) and every remaining column (X)
# before ranking which features matter most.
y = df['koi_disposition']
X = df.drop('koi_disposition', axis='columns')

In [6]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank every candidate column with a randomized tree ensemble.
# ExtraTreesClassifier is stochastic, so without a fixed random_state the
# importances (and therefore the ranking) change on every run. Seeding it
# keeps the notebook reproducible under Restart & Run All.
# NOTE(review): the ranking is fit on the full data set, before the train/test
# split that happens later in the notebook, so test rows influence which
# features get selected.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, y)

# Pair each importance with its column name and show the 20 largest.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(20)

koi_fpflag_ss        0.168508
koi_fpflag_nt        0.136406
koi_fpflag_co        0.133262
koi_fpflag_ec        0.048675
koi_model_snr        0.033360
koi_duration_err2    0.028831
koi_duration_err1    0.027600
koi_time0bk_err2     0.024244
koi_time0bk_err1     0.022484
koi_depth            0.022301
koi_duration         0.018276
koi_steff_err2       0.017141
koi_steff_err1       0.016667
koi_period           0.016328
koi_period_err2      0.015607
koi_period_err1      0.014810
koi_depth_err1       0.013172
koi_time0bk          0.013058
koi_teq              0.012775
koi_impact_err1      0.012368
dtype: float64

In [None]:
# After testing, keep 15 of the highest-weighted inputs (the list holds 15 names).
Xsel = df[['koi_fpflag_ss', 'koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_model_snr',
           'koi_duration_err1', 'koi_duration_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_steff_err1',
           'koi_depth', 'koi_period', 'koi_duration', 'koi_period_err1', 'koi_period_err2']]
# Preview only the first rows instead of dumping the whole frame into the output.
Xsel.head()

## Create a Train Test Split     
*Use koi_disposition for the y values*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set; stratify on y so both parts keep the same class mix,
# and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Xsel,
    y,
    stratify=y,
    random_state=1,
)

## Pre-processing     
*Scale the data using the MinMaxScaler and perform some feature selection*

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling from the training rows only, then apply that same
# fitted scaler to the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties

from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Candidate neighbor counts, defined once so the loop and both plot calls
# always use the same x values: 1, 11, 21, ..., 91.
k_values = range(1, 100, 10)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Both curves share the axis, so label each one and use a neutral y-axis title.
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()


In [None]:
# Note that k: 33 seems to be the best choice for this dataset.
# Pass it explicitly: KNeighborsClassifier() with no arguments falls back to the
# library default of 5 neighbors, so the accuracy printed under the "k=33" label
# would not actually be for k=33.
knn = KNeighborsClassifier(n_neighbors=33)
knn.fit(X_train_scaled, y_train)
print('k=33 Test Acc: %.3f' % knn.score(X_test_scaled, y_test))

In [None]:
# Fit the current knn estimator on the scaled training features and labels.
knn.fit(X=X_train_scaled, y=y_train)

In [None]:
# Accuracy of the fitted classifier on the data it was trained on and on the held-out data.
training_accuracy = knn.score(X_train_scaled, y_train)
testing_accuracy = knn.score(X_test_scaled, y_test)
print(f"Training Data Score: {training_accuracy}")
print(f"Testing Data Score: {testing_accuracy}")

# Hyperparameter Tuning
#### Use GridSearchCV to tune the model's parameters

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Set parameters. Each hyperparameter lists a single value, so the grid
# contains exactly one candidate combination.
# 0.8622936056877144  <- figure noted by the author next to this grid
#                        (presumably the score it achieved; confirm)
param_grid = {
    'algorithm': ['auto'],
    'leaf_size': [1],
    'n_neighbors': [33],
    'p': [1],
}
grid = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=3)

In [None]:
# Run the grid search on the scaled training features and their labels.
grid.fit(X=X_train_scaled, y=y_train)

In [None]:
# Show the best parameter combination on one line and its score on the next.
print(grid.best_params_, grid.best_score_, sep="\n")
# all 40 columns {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 33, 'p': 1}
#0.8622936056877144

In [None]:
# #List Hyperparameters that we want to tune.
# leaf_size = list(range(1,50))
# n_neighbors = list(range(1,50))
# p=[1,10]
# #Convert to dictionary
# hyperparameters = dict(leaf_size=leaf_size)#, n_neighbors=n_neighbors, p=p)
# #Create new KNN object
# knn_2 = KNeighborsClassifier()
# #Use GridSearch
# clf = GridSearchCV(knn_2, hyperparameters, cv=10)
# #Fit the model
# best_model = clf.fit(X_train_scaled,y_train)
# #Print The value of best Hyperparameters
# print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
# print('Best p:', best_model.best_estimator_.get_params()['p'])
# print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

# Save the Model

In [None]:
import os

import joblib

# Persist the GridSearchCV object to disk.
filename = 'Model_Files/KNN.sav'
# joblib.dump does not create missing folders, so make sure the target
# directory exists first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# NOTE(review): the grid was fit on MinMax-scaled features, but X_scaler is not
# saved here. Inputs must be scaled the same way before the reloaded model is used.
joblib.dump(grid, filename)