In [100]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import collections
from sklearn.naive_bayes import GaussianNB

# 1. Pre-processing

In [81]:
# Read the dataset from the CSV file in the working directory and preview the first rows.
data = pd.read_csv("neo_v2.csv")
data.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [82]:
# (rows, columns) of the freshly loaded frame.
data.shape

(90836, 10)

## 1.2 Removing some features

- **remove "id" column**
- **remove "name" column so the classifiers cannot pick up a spurious correlation between the name and the label**
- **remove "orbiting_body" and "sentry_object" since they are the same for all rows**

In [83]:
# Discard the identifier and constant-valued columns; the remaining columns
# are the numeric features followed by the "hazardous" label.
columns_to_remove = ["id", "name", "orbiting_body", "sentry_object"]
data = data.drop(columns=columns_to_remove)
data.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,54839740.0,16.73,False
1,0.2658,0.594347,73588.726663,61438130.0,20.0,True
2,0.72203,1.614507,114258.692129,49798720.0,17.83,False
3,0.096506,0.215794,24764.303138,25434970.0,22.2,False
4,0.255009,0.570217,42737.733765,46275570.0,20.09,True


## 1.3 Handle missing values

In [84]:
# Replace NaNs in every feature column (all columns except the last one, the
# label) with that column's mean.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test rows influence the fill values — consider fitting
# it on the training split only.
sip = SimpleImputer(missing_values=np.nan, strategy="mean")
data.iloc[:, :-1] = sip.fit_transform(data.iloc[:, :-1].values)

## 1.4 Detect and remove outliers with z-score

In [85]:
# Drop every row in which at least one feature lies more than
# Z_SCORE_THRESHOLD standard deviations away from its column mean.
# The z-scores of all feature columns are computed in a single vectorised
# call rather than with a Python loop over every element of every column.
# NOTE: this cell rebinds `data`, so running it twice removes further rows;
# re-run from the loading cell instead of re-executing it in isolation.
Z_SCORE_THRESHOLD = 3

feature_values = data.iloc[:, :-1].to_numpy(dtype=float)
z_scores = np.abs(stats.zscore(feature_values, axis=0))
outlier_mask = (z_scores > Z_SCORE_THRESHOLD).any(axis=1)

# Positional indices of the outlier rows, kept as a list under the original name.
rows_to_drop = np.flatnonzero(outlier_mask).tolist()

# Translate positions into index labels before dropping, so the right rows are
# removed even if the frame does not carry a default 0..n-1 index.
data = data.drop(index=data.index[rows_to_drop])
data.reset_index(drop=True, inplace=True)

In [86]:
# Shape after the outlier rows were removed.
data.shape

(89021, 6)

## 1.5 Set features and label

In [87]:
# Features: every column except the last; label: the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [88]:
# Dimensions of the feature matrix.
X.shape

(89021, 5)

## 1.6 Label Encoding

In [89]:
# Turn the label column into integer class codes; the fitted encoder is kept
# in `lb_encoder` so the codes can be mapped back to the original values.
lb_encoder = LabelEncoder()
lb_encoder.fit(y)
y = lb_encoder.transform(y)

## 1.7 Split data sets and Base Accuracy

In [99]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Baseline accuracy = share of the most frequent class in the test labels,
# i.e. the score of a model that always predicts that class.
# Counter.most_common also works when the test labels contain a single class,
# a case in which indexing a second label would raise IndexError.
counters = collections.Counter(y_test)
labels = list(set(y_test))
majority_count = counters.most_common(1)[0][1]
base_line_for_accuracy = round(majority_count / len(y_test), 3)
print("Base Accuracy: {}".format(str(base_line_for_accuracy)))

Base Accuracy: 0.908


## 1.8 Feature Scaling

In [91]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# 2. Logistic Regression Classifying

In [92]:
def run_Logistic_Reg_Classifier(X_train, X_test, y_train, y_test, base_acc):
    """Fit a logistic-regression model and print its test-set evaluation.

    Trains ``LogisticRegression(random_state=0)`` on the training split,
    predicts the test split, then prints the confusion matrix, the accuracy
    rounded to three decimals and the supplied baseline accuracy.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        base_acc: Baseline accuracy, printed for comparison only.

    Returns:
        None. All results are printed.
    """
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Confusion Matrix", confusion_matrix(y_test, predictions), sep="\n")

    accuracy = round(accuracy_score(y_test, predictions), 3)
    print()
    print("Accuracy for Logistic Regression Classifier: {}".format(str(accuracy)))
    print("Base Line Accuracy is: {}".format(str(base_acc)))

In [93]:
# Evaluate logistic regression against the baseline. The call is enabled so
# that a fresh "Restart & Run All" regenerates the output shown for this cell.
run_Logistic_Reg_Classifier(X_train, X_test, y_train, y_test, base_line_for_accuracy)

Confusion Matrix
[[20023   190]
 [ 1870   173]]

Accuracy for Logistic Regression Classifier: 0.907
Base Line Accuracy is: 0.908


# 3. K-Nearest Neighbors

In [94]:
def run_KNN_Classifier(X_train, X_test, y_train, y_test, base_acc):
    """Fit a k-nearest-neighbours model and print its test-set evaluation.

    Uses 70 neighbours with the Minkowski metric and ``p=2``. Prints the
    confusion matrix, the accuracy rounded to three decimals and the supplied
    baseline accuracy.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        base_acc: Baseline accuracy, printed for comparison only.

    Returns:
        None. All results are printed.
    """
    model = KNeighborsClassifier(n_neighbors=70, metric="minkowski", p=2)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Confusion Matrix", confusion_matrix(y_test, predictions), sep="\n")

    accuracy = round(accuracy_score(y_test, predictions), 3)
    print()
    print("Accuracy for KNN Classifier: {}".format(str(accuracy)))
    print("Base Line Accuracy is: {}".format(str(base_acc)))

In [95]:
# Evaluate the KNN model against the baseline. The call is enabled so that a
# fresh "Restart & Run All" regenerates the output shown for this cell.
run_KNN_Classifier(X_train, X_test, y_train, y_test, base_line_for_accuracy)

Confusion Matrix
[[20098   115]
 [ 1758   285]]

Accuracy for KNN Classifier: 0.916
Base Line Accuracy is: 0.908


# 4. SVC

In [96]:
def run_svc_Classifier(X_train, X_test, y_train, y_test, kernel, base_acc):
    """Fit a support-vector classifier and print its test-set evaluation.

    Trains ``SVC(kernel=kernel, random_state=0)`` on the training split,
    predicts the test split, then prints the confusion matrix, the accuracy
    rounded to three decimals and the supplied baseline accuracy.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        kernel: Kernel passed straight to ``SVC`` (e.g. ``'linear'``, ``'rbf'``).
        base_acc: Baseline accuracy, printed for comparison only.

    Returns:
        None. All results are printed.
    """
    svc_classif = SVC(kernel=kernel, random_state=0)
    svc_classif.fit(X_train, y_train)
    y_pred_svc_classif = svc_classif.predict(X_test)

    cm_svc_classif = confusion_matrix(y_test, y_pred_svc_classif)
    print("Confusion Matrix")
    print(cm_svc_classif)

    acc_svc_classif = round(accuracy_score(y_test, y_pred_svc_classif), 3)
    print()
    # Format the kernel through a placeholder rather than string concatenation:
    # concatenation raises TypeError for a non-string kernel (SVC also accepts
    # a callable), and it would do so only after the whole fit had finished.
    print("Accuracy for Support Vector Classifier with {} kernel: {}".format(kernel, str(acc_svc_classif)))
    print("Base Line Accuracy is: {}".format(str(base_acc)))

In [97]:
# Evaluate the linear-kernel SVC against the baseline. The call is enabled so
# that a fresh "Restart & Run All" regenerates the output shown for this cell
# (fitting an SVC on the full training split can take a while).
run_svc_Classifier(X_train, X_test, y_train, y_test, 'linear', base_line_for_accuracy)

Confusion Matrix
[[20213     0]
 [ 2043     0]]

Accuracy for Support Vector Classifier with linear kernel: 0.908
Base Line Accuracy is: 0.908


# 5. Kernel SVC

In [98]:
# Evaluate the RBF-kernel SVC against the baseline. The call is enabled so
# that a fresh "Restart & Run All" regenerates the output shown for this cell
# (fitting an SVC on the full training split can take a while).
run_svc_Classifier(X_train, X_test, y_train, y_test, 'rbf', base_line_for_accuracy)

Confusion Matrix
[[20120    93]
 [ 1784   259]]

Accuracy for Support Vector Classifier with rbf kernel: 0.916
Base Line Accuracy is: 0.908


# 6. Naive Bayes

In [None]:
def run_naive_bayes(X_train, X_test, y_train, y_test, base_acc):
    """Fit a Gaussian naive-Bayes model and print its test-set evaluation.

    Trains ``GaussianNB()`` on the training split, predicts the test split,
    then prints the confusion matrix, the accuracy rounded to three decimals
    and the supplied baseline accuracy.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        base_acc: Baseline accuracy, printed for comparison only.

    Returns:
        None. All results are printed.
    """
    model = GaussianNB().fit(X_train, y_train)
    predictions = model.predict(X_test)

    print("Confusion Matrix", confusion_matrix(y_test, predictions), sep="\n")

    accuracy = round(accuracy_score(y_test, predictions), 3)
    print()
    print("Accuracy for Naive Bayes Classifier: {}".format(str(accuracy)))
    print("Base Line Accuracy is: {}".format(str(base_acc)))

In [None]:
# Evaluate the Gaussian naive-Bayes model against the baseline accuracy.
run_naive_bayes(X_train, X_test, y_train, y_test, base_line_for_accuracy)