In [1]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1321 sha256=d9bd67ab545ca0fd580f3dd0faedc9bc92f159db2f20e30c277d7ef33db3d49c
  Stored in directory: C:\Users\Saranya\AppData\Local\pip\Cache\wheels\76\03\bb\589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [2]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Read the CSV and Perform Basic Data Cleaning

In [75]:
exoplanet_df = pd.read_csv("exoplanet_data.csv")
# Drop the null columns where all values are null
exoplanet_df = exoplanet_df.dropna(axis='columns', how='all')
# Drop the null rows
exoplanet_df = exoplanet_df.dropna()
exoplanet_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Select your features (columns)

In [76]:
# Set features. This will also be used as your x values.
sf_exoplanet_df = exoplanet_df
sf_exoplanet_df.head()

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,CONFIRMED,0,0,0,0,54.418383,0.0002479,-0.0002479,162.51384,0.00352,...,-81,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,FALSE POSITIVE,0,1,0,0,19.89914,1.49e-05,-1.49e-05,175.850252,0.000581,...,-176,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
2,FALSE POSITIVE,0,1,0,0,1.736952,2.63e-07,-2.63e-07,170.307565,0.000115,...,-174,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.28521,15.597
3,CONFIRMED,0,0,0,0,2.525592,3.76e-06,-3.76e-06,171.59555,0.00113,...,-211,4.438,0.07,-0.21,1.046,0.334,-0.133,288.75488,48.2262,15.509
4,CONFIRMED,0,0,0,0,4.134435,1.05e-05,-1.05e-05,172.97937,0.0019,...,-232,4.486,0.054,-0.229,0.972,0.315,-0.105,296.28613,48.22467,15.714


# Create a Train Test Split

Use `koi_disposition` for the y values

In [77]:
#Assigning X & Y Values 

X = exoplanet_df.drop("koi_disposition", axis=1)
y = exoplanet_df["koi_disposition"]

#Split Training and Testing Data

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=115, stratify=y)

In [78]:
X_train.head()

Unnamed: 0,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,koi_time0bk_err2,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
2700,0,0,0,0,31.80476,9.1e-05,-9.1e-05,190.25413,0.00224,-0.00224,...,-128,4.235,0.143,-0.117,1.356,0.212,-0.259,287.62082,43.141647,13.178
21,0,0,0,0,4.280964,6e-06,-6e-06,171.89659,0.00115,-0.00115,...,-169,4.564,0.032,-0.168,0.831,0.207,-0.069,294.26581,49.314091,15.356
5040,0,1,1,0,0.829019,2e-06,-2e-06,131.46667,0.00401,-0.00401,...,-255,4.271,0.124,-0.186,1.328,0.408,-0.22,296.03485,45.490822,14.661
2795,0,0,1,0,1.387838,6e-06,-6e-06,132.11351,0.00318,-0.00318,...,-219,4.442,0.054,-0.216,1.044,0.335,-0.112,295.64212,48.773071,15.167
1765,0,0,0,0,14.383227,8.6e-05,-8.6e-05,140.57648,0.0053,-0.0053,...,-135,4.249,0.137,-0.112,1.291,0.216,-0.216,290.08276,50.86348,13.312


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [79]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



## 1) SVM Model 

In [80]:
# Training The SVM Model

from sklearn.svm import SVC 
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_scaled, y_train)
predictions = svm_model.predict(X_test)

In [81]:
#score
print(f"Training Data Score: {svm_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {svm_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8498950982262063
Testing Data Score: 0.8329519450800915


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [82]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
svm_param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
svm_grid = GridSearchCV(svm_model, svm_param_grid, verbose=3)

In [83]:
# Train the model with GridSearch
svm_grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.846, total=   0.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.838, total=   0.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.844, total=   0.6s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.846, total=   0.5s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.838, total=   0.7s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.844, total=   0.7s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.846, total=   0.6s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.838, total=   0.7s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.844, total=   0.7s
[CV] C=5, gamma=0.0001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   18.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [84]:
#Best param & score - SVM
print(svm_grid.best_params_)
print(svm_grid.best_score_)

{'C': 10, 'gamma': 0.0001}
0.8706847224871257


In [85]:
#Training & Test Score - SVM
print(svm_grid.score(X_train_scaled, y_train))
print(svm_grid.score(X_test_scaled, y_test))

0.8775510204081632
0.8655606407322655


In [86]:
# Make predictions with the hypertuned model
svm_predictions = svm_grid.predict(X_test_scaled)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, svm_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.83      0.58      0.68       422
     CONFIRMED       0.70      0.88      0.78       450
FALSE POSITIVE       0.99      1.00      0.99       876

      accuracy                           0.87      1748
     macro avg       0.84      0.82      0.82      1748
  weighted avg       0.87      0.87      0.86      1748



## 2) Random Forest Model

In [87]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model = rf_model.fit(X_train_scaled, y_train)
print(rf_model.score(X_train_scaled, y_train))
print(rf_model.score(X_test_scaled, y_test))



0.9959946595460614
0.8724256292906178


In [88]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
importances

array([0.09402749, 0.04520277, 0.11317398, 0.03522461, 0.01643225,
       0.01212607, 0.01587736, 0.01352613, 0.02361713, 0.0199786 ,
       0.02289346, 0.01142494, 0.01122559, 0.02065096, 0.02334779,
       0.04836973, 0.03272287, 0.01413617, 0.01135553, 0.0630962 ,
       0.03348144, 0.02287483, 0.02474232, 0.01642256, 0.0166422 ,
       0.00961153, 0.05035787, 0.00227459, 0.01051266, 0.03485174,
       0.03453577, 0.00758918, 0.00974172, 0.00937767, 0.00775814,
       0.01497602, 0.00717575, 0.01554759, 0.01153567, 0.01158113])

In [89]:
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, sf_exoplanet_df), reverse=True)

[(0.11317398195084771, 'koi_fpflag_ss'),
 (0.0940274866671785, 'koi_disposition'),
 (0.063096204425158, 'koi_depth_err2'),
 (0.050357870304360196, 'koi_insol_err2'),
 (0.04836973276946972, 'koi_duration_err1'),
 (0.04520276861235594, 'koi_fpflag_nt'),
 (0.035224607589705775, 'koi_fpflag_co'),
 (0.03485173579576202, 'koi_steff'),
 (0.034535770271109156, 'koi_steff_err1'),
 (0.03348144073600211, 'koi_prad'),
 (0.03272286691797811, 'koi_duration_err2'),
 (0.024742317401547566, 'koi_prad_err2'),
 (0.023617127437950282, 'koi_time0bk'),
 (0.023347787892337095, 'koi_duration'),
 (0.022893460313568363, 'koi_time0bk_err2'),
 (0.022874829219772557, 'koi_prad_err1'),
 (0.02065096255141456, 'koi_impact_err2'),
 (0.01997860354459286, 'koi_time0bk_err1'),
 (0.01664220438773422, 'koi_insol'),
 (0.01643224806780981, 'koi_fpflag_ec'),
 (0.01642255760954929, 'koi_teq'),
 (0.015877364793531617, 'koi_period_err1'),
 (0.01554758769447231, 'koi_srad_err2'),
 (0.014976021721872925, 'koi_srad'),
 (0.014136171

In [90]:
rf_param_grid = {'n_estimators': [200, 250, 300],
              'max_depth': [125, 150, 175]}
rf_grid = GridSearchCV(rf_model, rf_param_grid, verbose=3)

In [91]:
# Train the model with GridSearch
rf_grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=125, n_estimators=200 .................................
[CV] ..... max_depth=125, n_estimators=200, score=0.892, total=   5.4s
[CV] max_depth=125, n_estimators=200 .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.3s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=200, score=0.898, total=   5.3s
[CV] max_depth=125, n_estimators=200 .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.6s remaining:    0.0s


[CV] ..... max_depth=125, n_estimators=200, score=0.890, total=   5.4s
[CV] max_depth=125, n_estimators=250 .................................
[CV] ..... max_depth=125, n_estimators=250, score=0.888, total=   6.6s
[CV] max_depth=125, n_estimators=250 .................................
[CV] ..... max_depth=125, n_estimators=250, score=0.899, total=   6.5s
[CV] max_depth=125, n_estimators=250 .................................
[CV] ..... max_depth=125, n_estimators=250, score=0.894, total=   6.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.887, total=   9.0s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.898, total=   8.4s
[CV] max_depth=125, n_estimators=300 .................................
[CV] ..... max_depth=125, n_estimators=300, score=0.887, total=   7.7s
[CV] max_depth=150, n_estimators=200 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  3.0min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [92]:
#Best param & score - SVM
print(rf_grid.best_params_)
print(rf_grid.best_score_)

{'max_depth': 125, 'n_estimators': 250}
0.8935723822239175


In [93]:
#Training & Test Score - SVM
print(rf_grid.score(X_train_scaled, y_train))
print(rf_grid.score(X_test_scaled, y_test))

1.0
0.8901601830663616


In [94]:
# Make predictions with the hypertuned model
rf_predictions = rf_grid.predict(X_test_scaled)

# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, rf_predictions))

                precision    recall  f1-score   support

     CANDIDATE       0.82      0.72      0.77       422
     CONFIRMED       0.78      0.85      0.81       450
FALSE POSITIVE       0.98      1.00      0.99       876

      accuracy                           0.89      1748
     macro avg       0.86      0.85      0.85      1748
  weighted avg       0.89      0.89      0.89      1748



# Save the Model

In [97]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib

In [98]:
#SVM Model

svm_filename = 'svm_model_sara.sav'
joblib.dump(svm_model, svm_filename)

['svm_model_sara.sav']

In [99]:
#Random Forest Model 

rf_filename = 'rf_model_sara.sav'
joblib.dump(rf_model, rf_filename)


['rf_model_sara.sav']