In [50]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [46]:
class Bayes:
    def fit(self,X,y):
        n_samples , n_features = X.shape;
        self._classes = np.unique(y);
        n_classes = len(self._classes);

        #calculate mean,variance,prior for each class
        self._mean = np.zeros((n_classes,n_features), dtype=np.float64);
        self._var = np.zeros((n_classes,n_features), dtype=np.float64);
        self._prior = np.zeros((n_classes), dtype=np.float64);

        for idx,c in enumerate(self._classes):
            X_c = X[y==c];
            self._mean[idx,:] = X_c.mean(axis=0);  
            self._var[idx,:] = X_c.var(axis=0);  
            self._prior[idx] = X_c.shape[0]/float(n_samples);
        
    def predict(self,X):
        y_pred = [self._predict(x) for x in X];
        return np.array(y_pred);

    def _predict(self,X):
        posteriors =[];


        #Calculate posterior probability for each class
        for idx,c in enumerate(self._classes):
            prior = np.log(self._prior[idx]);
            posterior = np.sum(np.log(self._pdf(idx,c)));
            posterior = prior + posterior
            posteriors.append(posterior);
        
        #Return class with highest probability
        return self._classes[np.argmax(posteriors)];

    def _pdf(self,class_idx,x):
        mean = self._mean[class_idx];
        var = self._var[class_idx];
        numerator = np.exp(-((x-mean)**2)/(2*var));
        denominator = np.sqrt(2*np.pi*var);
        return numerator/denominator;

In [38]:
data_df = pd.read_csv('neo.csv');

In [39]:
useless = ['id','orbiting_body','name','sentry_object'];
data_df = data_df.drop(useless,axis=1)

In [40]:
data_df['hazardous'] = data_df['hazardous'].astype(int)
data_df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


In [41]:
target = data_df['hazardous']
data_df = data_df.drop('hazardous',axis=1)

In [51]:
scaler = StandardScaler()

data_df = scaler.fit_transform(data_df)

In [52]:
X_train,X_test,y_train,y_test = train_test_split(data_df,target,test_size=0.2,random_state=69)

In [53]:
nb = Bayes()
nb.fit(X_train,y_train);
predictions = nb.predict(X_test)

In [54]:
def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

In [58]:
accuracy(y_test,predictions)*100

90.41721708498459