In [326]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# 1. Pre-processing

In [327]:
# Read the dataset from the CSV file next to the notebook and preview it.
csv_path = "neo_v2.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [328]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(90836, 10)

## 1.2 Removing some features

- **remove "id" column**
- **remove "name" column so the classifiers cannot learn a spurious correlation between the name and the label**
- **remove "orbiting_body" and "sentry_object" since they are the same for all rows**

In [329]:
# Discard the identifier columns and the two columns that the notes above
# describe as identical for every row, then preview the remaining frame.
unused_columns = ["id", "name", "orbiting_body", "sentry_object"]
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,54839740.0,16.73,False
1,0.2658,0.594347,73588.726663,61438130.0,20.0,True
2,0.72203,1.614507,114258.692129,49798720.0,17.83,False
3,0.096506,0.215794,24764.303138,25434970.0,22.2,False
4,0.255009,0.570217,42737.733765,46275570.0,20.09,True


## 1.3 Handle missing values

In [330]:
# Replace NaNs in every feature column (all but the trailing label column)
# with that column's mean.
# NOTE(review): the imputer is fit on the full frame, i.e. before the
# train/test split performed in section 1.7.
sip = SimpleImputer(missing_values=np.nan, strategy="mean")
feature_values = data.iloc[:, :-1].to_numpy()
data.iloc[:, :-1] = sip.fit_transform(feature_values)

## 1.4 Detect outliers with z-score

In [331]:
# Remove every row whose absolute z-score exceeds Z_THRESHOLD in at least one
# feature column (all columns except the trailing label column).
#
# The z-scores are computed column-wise in a single vectorised call instead of
# a Python loop over every row of every column, and the rows are selected with
# a positional boolean mask, so the result no longer relies on the frame's
# index labels coinciding with row positions.
Z_THRESHOLD = 3
feature_z = stats.zscore(data.iloc[:, :-1].to_numpy(dtype=float), axis=0)
outlier_mask = (np.abs(feature_z) > Z_THRESHOLD).any(axis=1)

# Positions of the removed rows, kept under the original name for inspection.
rows_to_drop = np.flatnonzero(outlier_mask).tolist()

# Keep the non-outlier rows and renumber the index from 0.
data = data.loc[~outlier_mask].reset_index(drop=True)

In [332]:
# (rows, columns) left after the outlier rows were removed.
data.shape

(89021, 6)

## 1.5 set features and label

In [333]:
# Every column but the last is a feature; the last column is the label.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [334]:
# (rows, feature columns) of the feature matrix.
X.shape

(89021, 5)

## 1.6 Label Encoding

In [335]:
# Learn the label classes, then map each label to its integer code.
lb_encoder = LabelEncoder()
lb_encoder.fit(y)
y = lb_encoder.transform(y)

## 1.7 Split data sets

In [336]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## 1.8 Feature Scaling

In [337]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# 2. Logistic Regression Classifying

In [338]:
# Build and train the logistic-regression model in one step (fit returns the
# estimator itself), then show the fitted estimator as the cell output.
lr_classif = LogisticRegression(random_state=0).fit(X_train, y_train)
lr_classif

LogisticRegression(random_state=0)

In [339]:
# Predicted labels for the held-out test features.
y_pred_lr_classif = lr_classif.predict(X_test)

In [340]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm_lr_classif = confusion_matrix(y_true=y_test, y_pred=y_pred_lr_classif)
cm_lr_classif

array([[20023,   190],
       [ 1870,   173]])

In [341]:
# Share of correctly classified test rows, rounded to three decimals.
acc_lr_classif = round(accuracy_score(y_test, y_pred_lr_classif), 3)
print("Accuracy for Logistic Regression Classifier: {}".format(acc_lr_classif))

Accuracy for Logistic Regression Classifier: 0.907


# 3. K-Nearest Neighbors

In [342]:
# 70-neighbour KNN with the Minkowski metric at p=2 (Euclidean distance).
# fit returns the estimator itself, so it can be chained and then displayed.
knn_classif = KNeighborsClassifier(
    n_neighbors=70,
    metric="minkowski",
    p=2,
).fit(X_train, y_train)
knn_classif

KNeighborsClassifier(n_neighbors=70)

In [343]:
# Predicted labels for the held-out test features.
y_pred_knn_classif = knn_classif.predict(X_test)

In [344]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm_knn_classif = confusion_matrix(y_true=y_test, y_pred=y_pred_knn_classif)
cm_knn_classif

array([[20098,   115],
       [ 1758,   285]])

In [345]:
# Share of correctly classified test rows, rounded to three decimals.
acc_knn_classif = round(accuracy_score(y_test, y_pred_knn_classif), 3)
print("Accuracy for KNN Classifier: {}".format(acc_knn_classif))

Accuracy for KNN Classifier: 0.916


# 4. SVC

In [346]:
# Support vector classifier with a linear kernel; fit returns the estimator
# itself, so it can be chained and then displayed as the cell output.
svc_classif = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
svc_classif

SVC(kernel='linear', random_state=0)

In [347]:
# Predicted labels for the held-out test features.
y_pred_svc_classif = svc_classif.predict(X_test)

In [348]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
# NOTE(review): in the recorded output the second column is all zeros, i.e.
# this model predicted the first class for every test row, so its accuracy is
# simply the share of that class in the test set.
cm_svc_classif = confusion_matrix(y_true=y_test, y_pred=y_pred_svc_classif)
cm_svc_classif

array([[20213,     0],
       [ 2043,     0]])

In [349]:
# Share of correctly classified test rows, rounded to three decimals.
acc_svc_classif = round(accuracy_score(y_test, y_pred_svc_classif), 3)
print("Accuracy for Support Vector Classifier: {}".format(acc_svc_classif))

Accuracy for Support Vector Classifier: 0.908


# 5. Kernel SVC

In [352]:
# Support vector classifier with the RBF kernel; fit returns the estimator
# itself, so it can be chained and then displayed as the cell output.
svc_kernel_classif = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
svc_kernel_classif

SVC(random_state=0)

In [353]:
# Predicted labels for the held-out test features.
y_pred_kernel_svc_classif = svc_kernel_classif.predict(X_test)

In [354]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm_kernel_svc_classif = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred_kernel_svc_classif,
)
cm_kernel_svc_classif

array([[20120,    93],
       [ 1784,   259]])

In [355]:
# Share of correctly classified test rows, rounded to three decimals.
acc_kernel_svc_classif = round(
    accuracy_score(y_test, y_pred_kernel_svc_classif), 3
)
print("Accuracy for Kernel Support Vector Classifier: {}".format(acc_kernel_svc_classif))

Accuracy for Kernel Support Vector Classifier: 0.916
