In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter
%matplotlib inline

In [2]:
# Load the train / validation / test splits of the sonar data set.
# header=None tells pandas not to take column names from the first row, so the
# columns get integer labels instead.
# NOTE(review): valid_data is not used by any of the cells visible here.
train_data = pd.read_csv("./data/sonar_train.data", header=None)
valid_data = pd.read_csv("./data/sonar_valid.data", header=None)
test_data = pd.read_csv('./data/sonar_test.data', header=None)

In [3]:
# Preview the first five training rows.
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0392,0.0108,0.0267,0.0257,0.041,0.0491,0.1053,0.169,0.2105,0.2471,...,0.0083,0.008,0.0026,0.0079,0.0042,0.0071,0.0044,0.0022,0.0014,2
1,0.1021,0.083,0.0577,0.0627,0.0635,0.1328,0.0988,0.1787,0.1199,0.1369,...,0.0709,0.0317,0.0309,0.0252,0.0087,0.0177,0.0214,0.0227,0.0106,2
2,0.0025,0.0309,0.0171,0.0228,0.0434,0.1224,0.1947,0.1661,0.1368,0.143,...,0.0149,0.0077,0.0036,0.0114,0.0085,0.0101,0.0016,0.0028,0.0014,1
3,0.0335,0.0134,0.0696,0.118,0.0348,0.118,0.1948,0.1607,0.3036,0.4372,...,0.0244,0.0232,0.0093,0.0159,0.0193,0.0032,0.0377,0.0126,0.0156,2
4,0.01,0.0194,0.0155,0.0489,0.0839,0.1009,0.1627,0.2071,0.2696,0.299,...,0.013,0.0073,0.0077,0.0075,0.006,0.008,0.0019,0.0053,0.0019,1


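Bayes' rule gives the class posterior up to a normalising constant (the evidence $P(x_1, x_2, \dots)$ is the same for every class, so it can be dropped when comparing classes):

$$P(\text{class} \mid x_1, x_2, x_3, \dots) \propto P(x_1, x_2, x_3, \dots \mid \text{class}) \cdot P(\text{class})$$

Under the *naive* assumption that the features are conditionally independent given the class, the likelihood factorises as $P(x_1, x_2, x_3, \dots \mid \text{class}) = \prod_i P(x_i \mid \text{class})$.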

In [4]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier fitted on a pandas DataFrame.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and (unbiased) standard deviation are estimated from the
    training rows of that class. Class scores are accumulated in log space so
    that products over many features neither underflow to 0 nor overflow.

    Attributes set by ``fit``:
        data_split: dict mapping class label -> feature-only DataFrame of that class.
        data_split_stats: dict mapping class label -> {"mean": ndarray, "std": ndarray}.
        classes: list of class labels, in ``value_counts`` order.
        y_prob: dict mapping class label -> prior (class frequency in the training data).

    Attributes set by ``predict``:
        prediction_log_scores: list (one dict per row) of class label -> log joint score.
        prediction_scores: list (one dict per row) of class label -> joint score on the
            linear scale; may be 0.0 or inf when the value is not representable.
    """

    # Lower bound applied to standard deviations at scoring time. It keeps the
    # density finite when a feature is constant within a class (std == 0) or
    # when a class has a single sample (std is NaN with ddof=1).
    _MIN_STD = 1e-9

    def __init__(self, y_column):
        """Store the label of the target column.

        Args:
            y_column: Column label (in the training DataFrame) holding the class.
        """
        self.y_column = y_column

    @staticmethod
    def _log_gaussian(x, mean, stdev):
        """Return the element-wise log of the normal density N(x; mean, stdev)."""
        x = np.asarray(x, dtype=np.float64)
        mean = np.asarray(mean, dtype=np.float64)
        stdev = np.asarray(stdev, dtype=np.float64)
        # `stdev > floor` is False for NaN as well, so NaN is replaced too.
        safe_std = np.where(stdev > NaiveBayes._MIN_STD, stdev, NaiveBayes._MIN_STD)
        z = (x - mean) / safe_std
        return -0.5 * z ** 2 - np.log(safe_std) - 0.5 * math.log(2 * math.pi)

    @staticmethod
    def calculate_probability(x, mean, stdev):
        """Return the normal probability density of scalar ``x``.

        Args:
            x: Observed value.
            mean: Mean of the distribution.
            stdev: Standard deviation; values at or below ``_MIN_STD`` (and NaN)
                are replaced by ``_MIN_STD`` instead of dividing by zero.

        Returns:
            The density as a Python float.
        """
        return math.exp(float(NaiveBayes._log_gaussian(x, mean, stdev)))

    def fit(self, train_df):
        """Estimate per-class feature statistics and class priors.

        Args:
            train_df: DataFrame containing the feature columns and ``self.y_column``.
        """
        assert isinstance(train_df, pd.DataFrame)
        self.data_split = {}
        self.data_split_stats = {}
        self.classes = []
        self.y_prob = {}
        total_rows = len(train_df)
        class_counts = train_df[self.y_column].value_counts()
        for class_num, count in class_counts.items():
            class_rows = train_df.loc[train_df[self.y_column] == class_num]
            features = class_rows.reset_index(drop=True).drop(labels=self.y_column, axis=1)
            self.data_split[class_num] = features
            self.data_split_stats[class_num] = {
                "mean": features.mean(axis=0).values,
                "std": features.std(axis=0, ddof=1).values,  # unbiased stddev
            }
            self.classes.append(class_num)
            self.y_prob[class_num] = count / total_rows

    def predict(self, data):
        """Predict the most probable class for every row of ``data``.

        Args:
            data: Iterable of feature rows (e.g. a 2-D array), each with as many
                values as there were feature columns at fit time.

        Returns:
            1-D ``np.int32`` array with the predicted class label of each row.

        Raises:
            AssertionError: If a row's length differs from the number of features.
        """
        all_log_scores = []
        all_scores = []
        for test_row in data:
            row = np.asarray(test_row, dtype=np.float64)
            log_scores = {}
            for class_num, stats in self.data_split_stats.items():
                assert len(row) == len(stats["mean"])
                log_likelihood = NaiveBayes._log_gaussian(row, stats["mean"], stats["std"]).sum()
                # log prior + sum of per-feature log densities == log of the original product.
                log_scores[class_num] = math.log(self.y_prob[class_num]) + float(log_likelihood)
            all_log_scores.append(log_scores)
            # Linear-scale scores are kept for backward compatibility; they may
            # legitimately underflow to 0.0 or overflow to inf.
            with np.errstate(over="ignore", under="ignore"):
                all_scores.append({label: float(np.exp(score)) for label, score in log_scores.items()})
        self.prediction_log_scores = all_log_scores
        self.prediction_scores = all_scores
        # The decision uses the log scores, which stay distinguishable even when
        # the linear-scale scores are all 0.0.
        best_labels = [max(scores, key=scores.get) for scores in all_log_scores]
        return np.array(best_labels).astype(np.int32)

    @staticmethod
    def accuracy(y_true, y_pred):
        """Return the fraction of positions where ``y_pred`` equals ``y_true``.

        Both inputs are converted to ``np.int32`` arrays before comparison, so
        lists and float-valued label arrays are accepted.
        """
        y_true = np.asarray(y_true).astype(np.int32)
        y_pred = np.asarray(y_pred).astype(np.int32)
        return np.mean(np.equal(y_true, y_pred))

In [5]:
# 60 is the label of the target column: the data was read with header=None, so
# columns carry integer labels, and column 60 is the last one in the preview.
model = NaiveBayes(60)

In [6]:
# Estimate per-class feature means / standard deviations and class priors.
model.fit(train_data)

# Run predictions on test data

In [7]:
# Pass every column except the last one (the label) as features.
predictions = model.predict(test_data.values[:,:-1])

In [8]:
# Display the predicted class label for each test row.
predictions

array([1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 1, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1,
       1, 2, 1, 2, 1, 2, 2, 2])

In [9]:
# Fraction of test rows whose prediction equals the label in the last column.
model.accuracy(y_pred=predictions, y_true=test_data.values[:,-1])

0.6923076923076923