In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import pandas as pd

## Read the CSV and Perform Basic Data Cleaning

In [None]:
# Load the raw table, discard columns that are entirely null,
# then discard any row that still contains a null value
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

## Select your features (columns)     

In [None]:
# Split the label column off from the candidate features
# (done before ranking the features by importance)
y = df['koi_disposition']
X = df.drop(columns=['koi_disposition'])

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank every candidate feature by its importance in an extra-trees ensemble.
# The ensemble is randomized, so the seed is fixed to keep the ranking
# reproducible across kernel restarts.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, y)

# Label each importance with its column name and show the five largest
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5)

In [None]:
# After testing, keep the 13 highest-weighted inputs as the model features
Xsel = df[[
    'koi_fpflag_ss', 'koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_model_snr',
    'koi_duration_err1', 'koi_duration_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_steff_err1',
    'koi_depth', 'koi_period', 'koi_duration',
]]
# Preview the first rows only instead of rendering the whole frame
Xsel.head()

## Create a Train Test Split     
*Use koi_disposition for the y values*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set; stratifying on y keeps the class proportions
# the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    Xsel,
    y,
    stratify=y,
    random_state=1,
)

## Pre-processing     
*Scale the data using the MinMaxScaler and perform some feature selection*

In [None]:
# Min-max scale the features: fit the scaler on the training split only,
# then apply that same fitted scaler to both splits
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Loop through different k values to see which has the highest accuracy
# Note: We only use odd numbers because we don't want any ties

from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Single definition of the candidate k values, shared by the loop and the plot
k_values = list(range(1, 20, 2))
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    # Mean accuracy on the data the model was fitted on vs. the held-out split
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Both curves share one axis, so label each one and show a legend
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# NOTE(review): the "best k" placeholder in this template was never filled in.
# The model below uses k=5, which is scikit-learn's default n_neighbors and is
# what this cell used implicitly before. Replace K_NEIGHBORS with the k chosen
# from the accuracy plot.
K_NEIGHBORS = 5
knn = KNeighborsClassifier(n_neighbors=K_NEIGHBORS)
# fit() returns the estimator itself, so `gs` is the same object as `knn` here
gs = knn.fit(X_train_scaled, y_train)
print('k=%d Test Acc: %.3f' % (K_NEIGHBORS, knn.score(X_test_scaled, y_test)))

In [None]:
# Report the fitted model's mean accuracy on each split
train_accuracy = knn.score(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

## Hyperparameter Tuning
*Use GridSearchCV to tune the model's parameters*

In [None]:
# Build the grid search around the existing KNN estimator
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = dict(
    leaf_size=[1, 30],
    n_neighbors=[5, 11, 31],
    p=[1, 2],
)
grid = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=3)

In [None]:
# Run the grid search on the scaled training data.
# fit() returns the search object itself, so `gs` and `grid` are the same object.
grid.fit(X_train_scaled, y_train)
gs = grid

In [None]:
# Show the winning parameter combination, then its mean cross-validated score
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
# Classification report for the tuned model on the held-out test split
from sklearn.metrics import classification_report

# Predicted labels for the scaled test features
predictions = gs.predict(X_test_scaled)

report = classification_report(y_test, predictions)
print(report)