In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

In [34]:
data_df = pd.read_csv('neo.csv')

In [35]:
useless = ['id','name','orbiting_body']
data_df = data_df.drop(useless,axis=1)
data_df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,False,16.73,False
1,0.265800,0.594347,73588.726663,6.143813e+07,False,20.00,True
2,0.722030,1.614507,114258.692129,4.979872e+07,False,17.83,False
3,0.096506,0.215794,24764.303138,2.543497e+07,False,22.20,False
4,0.255009,0.570217,42737.733765,4.627557e+07,False,20.09,True
...,...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,False,25.00,False
90832,0.016771,0.037501,46114.605073,5.432121e+07,False,26.00,False
90833,0.031956,0.071456,7566.807732,2.840077e+07,False,24.60,False
90834,0.007321,0.016370,69199.154484,6.869206e+07,False,27.80,False


In [36]:
cat_cols = ['sentry_object','hazardous']
data_df[cat_cols] = data_df[cat_cols].astype(int)
data_df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,0,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,0,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,0,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,0,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,0,20.09,1
...,...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,0,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,0,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,0,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,0,27.80,0


In [37]:
lencoder = LabelEncoder()
y = pd.DataFrame(lencoder.fit_transform(data_df['hazardous']), columns=['hazardous'])
y = y.to_numpy()
y

array([[0],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [38]:
X = pd.DataFrame(data_df.drop("hazardous", axis = 1))
X

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,sentry_object,absolute_magnitude
0,1.198271,2.679415,13569.249224,5.483974e+07,0,16.73
1,0.265800,0.594347,73588.726663,6.143813e+07,0,20.00
2,0.722030,1.614507,114258.692129,4.979872e+07,0,17.83
3,0.096506,0.215794,24764.303138,2.543497e+07,0,22.20
4,0.255009,0.570217,42737.733765,4.627557e+07,0,20.09
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,0,25.00
90832,0.016771,0.037501,46114.605073,5.432121e+07,0,26.00
90833,0.031956,0.071456,7566.807732,2.840077e+07,0,24.60
90834,0.007321,0.016370,69199.154484,6.869206e+07,0,27.80


In [40]:
# Train & Test Splitting
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

In [41]:
# XG Boost Model
XGBC = XGBClassifier()
XGBC.fit(X_train, y_train)
XGBC_pred = XGBC.predict(X_test)
Acc_XGBC = round(accuracy_score(XGBC_pred, y_test) * 100,2)

# KNN Model
KNN = KNeighborsClassifier(n_neighbors = 3)
KNN.fit(X_train, y_train)
KNN_pred = KNN.predict(X_test)
Acc_KNN = round(accuracy_score(KNN_pred, y_test) * 100, 2)

# RandomForst Model
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
RF_pred = RF.predict(X_test)
Acc_RF = round(accuracy_score(RF_pred, y_test) * 100, 2)

DTC = DecisionTreeClassifier()
DTC.fit(X_train, y_train)
DTC_pred = DTC.predict(X_test)
Acc_DTC = round(accuracy_score(DTC_pred, y_test) * 100, 2)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  from ipykernel import kernelapp as app


In [42]:
# Print all models

models = pd.DataFrame({
    'Model': ['Random Forest', 'XG Boost', 'KNeighborsClassifier', 'DecisionTreeClassifier'],
    'Score': [Acc_RF, Acc_XGBC, Acc_KNN, Acc_DTC]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Random Forest,92.07
1,XG Boost,91.23
3,DecisionTreeClassifier,89.33
2,KNeighborsClassifier,88.06
