In [109]:
# import libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# 1. Pre-processing

In [110]:
# Read the dataset from the CSV file and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="neo_v2.csv")
data.head(n=5)

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [111]:
# Report the (rows, columns) dimensions of the freshly loaded frame.
data.shape

(90836, 10)

## 1.2 Removing some features

- **remove "id" column**
- **remove the "name" column so the classifiers cannot pick up a spurious correlation between the name and the label**
- **remove "orbiting_body" and "sentry_object" since they are the same for all rows**

In [112]:
# Discard the identifier columns and the two columns described above as
# constant, then preview the remaining columns.
data = data.drop(columns=["id", "name", "orbiting_body", "sentry_object"])
data.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,54839740.0,16.73,False
1,0.2658,0.594347,73588.726663,61438130.0,20.0,True
2,0.72203,1.614507,114258.692129,49798720.0,17.83,False
3,0.096506,0.215794,24764.303138,25434970.0,22.2,False
4,0.255009,0.570217,42737.733765,46275570.0,20.09,True


## 1.3 Handle missing values

In [113]:
# Fill NaNs in every feature column (all columns except the trailing label)
# with that column's mean.
# NOTE(review): the imputer is fit on the full dataset, before the train/test
# split performed later in the notebook, so test rows contribute to the means.
sip = SimpleImputer(strategy="mean", missing_values=np.nan)
features_filled = sip.fit_transform(data.iloc[:, :-1].values)
data.iloc[:, :-1] = features_filled

## 1.4 Detect and remove outliers with z-score

In [114]:
# Drop every row in which at least one feature lies more than Z_THRESHOLD
# standard deviations from that feature's mean, then renumber the index.
Z_THRESHOLD = 3

# z-score all feature columns (everything except the trailing label) in one
# vectorised call; axis=0 standardises each column independently, which matches
# computing stats.zscore column by column.
feature_z = np.abs(stats.zscore(data.iloc[:, :-1].to_numpy(dtype=float), axis=0))

# Boolean mask aligned with the frame's rows by position, so the filter does
# not depend on the index being a default 0..n-1 range. NaN z-scores compare
# False and therefore never flag a row.
is_outlier = (feature_z > Z_THRESHOLD).any(axis=1)

# Kept for inspection: index labels of the rows being removed.
rows_to_drop = data.index[is_outlier].tolist()

data = data.loc[~is_outlier].reset_index(drop=True)

In [115]:
# Re-check the dimensions after the outlier rows were dropped.
data.shape

(89021, 6)

## 1.5 Set features and label

In [116]:
# Features are every column except the last; the last column is the label.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [117]:
# Dimensions of the feature matrix (all columns except the trailing label).
X.shape

(89021, 5)

## 1.6 Split data sets

In [118]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

## 1.7 Feature Scaling

In [119]:
# Learn the scaling parameters from the training features only, then apply the
# same transformation to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 2. Logistic Regression Classification

In [120]:
# Train a logistic-regression classifier on the scaled training features.
# The fit call is left as the last expression so the estimator is displayed.
lr_classif = LogisticRegression(random_state=0)
lr_classif.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0)

In [121]:
# Predict the label of every held-out test row with the fitted classifier.
y_pred_lr_classif = lr_classif.predict(X=X_test)

In [122]:
# Cross-tabulate true labels (rows) against predicted labels (columns) on the
# test set, and display the resulting matrix.
cm_lr_classif = confusion_matrix(y_true=y_test, y_pred=y_pred_lr_classif)
cm_lr_classif

array([[20023,   190],
       [ 1870,   173]])

In [123]:
# Fraction of test rows classified correctly, rounded to three decimals.
acc_lr_classif = round(accuracy_score(y_test, y_pred_lr_classif), 3)
print(f"Accuracy for Logistic Regression Classifier: {acc_lr_classif}")

Accuracy for Logistic Regression Classifier: 0.9074406901509705
