In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

 ## Read the CSV and Perform Basic Data Cleaning   

In [None]:
# Load the exoplanet table from the working directory and preview the first rows.
df = pd.read_csv("exoplanet_data.csv")
df.head()

In [None]:
# Separate the target column (y) from the remaining feature columns (X)
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])
print(X.shape, y.shape)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank the input columns by importance using an extra-trees ensemble.
# The ensemble is randomised, so the seed is fixed to keep the ranking
# (and the feature list chosen from it) reproducible between runs.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, y)

# Label each importance with its column name and show the five largest.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(5)

In [None]:
# After testing, keep the 13 highest-weighted inputs.
# ('koi_period_err1' was previously part of this list and is currently excluded.)
Xsel = df[[
    'koi_fpflag_ss', 'koi_fpflag_nt', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_model_snr',
    'koi_duration_err1', 'koi_duration_err2', 'koi_time0bk_err1', 'koi_time0bk_err2', 'koi_steff_err1',
    'koi_depth', 'koi_period', 'koi_duration',
]]
# Preview only the first rows instead of dumping the whole frame.
Xsel.head()

## Create a Train Test Split     
*Use koi_disposition for the y values*

In [None]:
from sklearn.model_selection import train_test_split

# Split the selected features and the target into train/test parts.
# The fixed seed keeps the partition identical on every run.
split_parts = train_test_split(Xsel, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

## Pre-processing     
*Scale the data using the MinMaxScaler and perform some feature selection*

In [None]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Train the Model     
---------------

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic-regression classifier with the iteration cap set to 10000.
classifier = LogisticRegression(
    max_iter=10000,
)

In [None]:
classifier.fit(X_train, y_train)

In [None]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

## Hyperparameter Tuning
*Use GridSearchCV to tune the model's parameters*

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of C, each evaluated with the newton-cg solver.
param_grid = {
    'C': [1, 5, 10, 25],
    'solver': ['newton-cg'],
}
grid = GridSearchCV(estimator=classifier, param_grid=param_grid, verbose=3)

In [None]:
# Run the search on the scaled training features: the fitted search is later
# used to predict on X_test_scaled, so training must use the same feature range.
gs = grid.fit(X_train_scaled, y_train)

In [None]:
# Report the winning parameter combination, then its cross-validated score.
for best_value in (grid.best_params_, grid.best_score_):
    print(best_value)

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the scaled test features with the fitted grid search,
# then print the per-class report against the true test labels.
predictions = gs.predict(X_test_scaled)
report_text = classification_report(y_test, predictions)
print(report_text)