In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
import pandas as pd

 ## Read the CSV and Perform Basic Data Cleaning     

In [None]:
# Load the exoplanet CSV and discard every column that holds no data at all.
df = pd.read_csv("exoplanet_data.csv").dropna(axis="columns", how="all")
# Display the resulting frame as the cell output.
df

## Select your features (columns)     

In [None]:
# Separate the target column (koi_disposition) from the features:
# y is the target, X is every remaining column.
y = df['koi_disposition']
X = df.drop('koi_disposition', axis='columns')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Rank the features with an extra-trees ensemble. The ensemble is randomized, so
# the seed is fixed (same value the train/test split uses) to make the ranking
# reproducible across kernel restarts.
# NOTE(review): the ranking is fit on the full data set, before the train/test
# split, so test rows influence which features look important — confirm this is
# acceptable for the analysis.
model = ExtraTreesClassifier(random_state=1)
model.fit(X, y)

# Pair each importance with its column name and show the 20 highest.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(20)

In [None]:
# Build the reduced feature set by removing the columns listed here.
# NOTE(review): presumably these are the features judged least useful from the
# importance ranking — confirm against that output.
Xsel = X.drop(
    columns=[
        'koi_impact_err2',
        'koi_depth_err1',
        'koi_depth_err2',
        'koi_insol_err2',
        'koi_prad_err2',
        'koi_insol',
        'koi_prad',
        'koi_prad_err1',
        'koi_insol_err1',
        'koi_kepmag',
        'koi_time0bk',
    ]
)
# Display the reduced frame as the cell output.
Xsel

## Create a Train Test Split     
*Use koi_disposition for the y values*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the selected features and labels into train and test sets using the
# default split size. The split is stratified on y and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Xsel,
    y,
    stratify=y,
    random_state=1,
)

## Pre-processing     
*Scale the data using the MinMaxScaler and perform some feature selection*

In [None]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both the training and the test split.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Train the Model     
---------------

In [None]:
from sklearn.svm import SVC

# Create a linear-kernel support vector classifier and fit it on the scaled
# training data; fit() returns the estimator itself.
svm_model = SVC(kernel='linear').fit(X_train_scaled, y_train)
# Show the fitted estimator as the cell output.
svm_model

In [None]:
# Score the fitted SVM on the training split and on the held-out test split,
# then report both values.
train_score = svm_model.score(X_train_scaled, y_train)
test_score = svm_model.score(X_test_scaled, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

## Hyperparameter Tuning     
*Use GridSearchCV to tune the model's parameters*

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate settings to search: four values of C with a single fixed tol.
param_grid = {
    'C': [1, 5, 10, 2500],
    'tol': [.02],
}
# Wrap the SVM in a grid search over those settings; verbose=3 makes the search
# log its progress while fitting.
grid = GridSearchCV(estimator=svm_model, param_grid=param_grid, verbose=3)

In [None]:
# Train the model with GridSearch on the scaled training data.
# fit() returns the search object itself, so gs is simply another name for grid.
grid.fit(X_train_scaled, y_train)
gs = grid

In [None]:
# Show the best parameter combination and its cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

In [None]:
from sklearn.metrics import classification_report

# Predict the test split with the tuned grid-search model, then print the
# classification report comparing those predictions with the true labels.
predictions = gs.predict(X_test_scaled)
print(classification_report(y_true=y_test, y_pred=predictions))

## Save the Model     
----------------

In [None]:
import os

import joblib

filename = 'Model_Files/SVM_final.sav'
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Persist the fitted grid-search object (the whole search, not only its best estimator).
# NOTE(review): the fitted X_scaler is not saved alongside the model; anyone
# loading this file will presumably need to apply the same scaling — confirm.
joblib.dump(grid, filename)