# Support Vector Machine Linear Classifier

In [1]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os 


In [2]:
# Read the csv file and clead the data
df = pd.read_csv("data/cumulative.csv")
df = df.dropna(axis='columns', how='all')
pd.set_option('display.max_columns', None)

# Drop unnecesary columns 
data= df.drop(['rowid','kepid','kepoi_name','kepler_name','koi_pdisposition','koi_period_err1','koi_period_err2',
        'koi_time0bk_err1','koi_time0bk_err2','koi_impact_err1','koi_impact_err2','koi_duration_err1',
       'koi_duration_err2','koi_depth_err1','koi_depth_err2','koi_prad_err1','koi_prad_err2','koi_insol_err1','koi_insol_err2','koi_tce_delivname','koi_steff_err1','koi_steff_err2',
         'koi_slogg_err1','koi_slogg_err2','koi_srad_err1','koi_srad_err2'], axis=1)

# Drop the null rows
data = data.dropna()
data

Unnamed: 0,koi_disposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_time0bk,koi_impact,koi_duration,koi_depth,koi_prad,koi_teq,koi_insol,koi_model_snr,koi_tce_plnt_num,koi_steff,koi_slogg,koi_srad,ra,dec,koi_kepmag
0,CONFIRMED,1.000,0,0,0,0,9.488036,170.538750,0.146,2.95750,615.8,2.26,793.0,93.59,35.8,1.0,5455.0,4.467,0.927,291.93423,48.141651,15.347
1,CONFIRMED,0.969,0,0,0,0,54.418383,162.513840,0.586,4.50700,874.8,2.83,443.0,9.11,25.8,2.0,5455.0,4.467,0.927,291.93423,48.141651,15.347
2,FALSE POSITIVE,0.000,0,1,0,0,19.899140,175.850252,0.969,1.78220,10829.0,14.60,638.0,39.30,76.3,1.0,5853.0,4.544,0.868,297.00482,48.134129,15.436
3,FALSE POSITIVE,0.000,0,1,0,0,1.736952,170.307565,1.276,2.40641,8079.2,33.46,1395.0,891.96,505.6,1.0,5805.0,4.564,0.791,285.53461,48.285210,15.597
4,CONFIRMED,1.000,0,0,0,0,2.525592,171.595550,0.701,1.65450,603.3,2.75,1406.0,926.16,40.9,1.0,6031.0,4.438,1.046,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,FALSE POSITIVE,0.000,0,0,0,1,8.589871,132.016100,0.765,4.80600,87.7,1.11,929.0,176.40,8.4,1.0,5638.0,4.296,1.088,298.74921,46.973351,14.478
9560,FALSE POSITIVE,0.000,0,1,1,0,0.527699,131.705093,1.252,3.22210,1579.2,29.35,2088.0,4500.53,453.3,1.0,5638.0,4.529,0.903,297.18875,47.093819,14.082
9561,CANDIDATE,0.497,0,0,0,0,1.739849,133.001270,0.043,3.11400,48.5,0.72,1608.0,1585.81,10.6,1.0,6119.0,4.444,1.031,286.50937,47.163219,14.757
9562,FALSE POSITIVE,0.021,0,0,1,0,0.681402,132.181750,0.147,0.86500,103.6,1.07,2218.0,5713.41,12.3,1.0,6173.0,4.447,1.041,294.16489,47.176281,15.385


In [3]:
# Assign X (data) and y (target)
y = data["koi_disposition"]
X = data.drop("koi_disposition", axis=1)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
encoded_y

array([1, 1, 2, ..., 0, 2, 2])

In [5]:
# Split the train and test data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, random_state=1)       
print(y_train)

[2 1 1 ... 2 1 2]


In [6]:
# Import dependencies
from sklearn.preprocessing import LabelEncoder, MinMaxScaler,StandardScaler
#from tensorflow.keras.utils import to_categorical

# scale the data with StandardScaler
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train_scaled,y_train)
predictions = model.predict(X_test_scaled)
model

SVC(kernel='linear')

In [8]:
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")


Training Data Score: 0.8245204336947456
Testing Data Score: 0.8199099549774888


In [9]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                           target_names =["confirmed", "false positive", "candidate"]))


                precision    recall  f1-score   support

     confirmed       0.82      0.31      0.45       457
false positive       0.63      0.93      0.75       575
     candidate       0.99      1.00      0.99       967

      accuracy                           0.82      1999
     macro avg       0.81      0.74      0.73      1999
  weighted avg       0.84      0.82      0.80      1999



# Grid Search and Hyper-Parameter Tuning



In [10]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [11]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.827, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.827, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.5s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.816, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.813, total=   0.3s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=0.825, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.827, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.827, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.816, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=0.813, total=   0.3s
[CV] C=1, gamma=0.001 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   24.3s finished


GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             verbose=3)

In [15]:
# List the best parameters for this dataset
print(grid.best_params_) 
print(grid.best_score_)


{'C': 10, 'gamma': 0.0001}
0.8248540450375312


In [16]:
# Make predictions with the hypertuned model
predictions = grid.predict(X_test_scaled) 

In [17]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, y_test)) 

Test Acc: 0.823
