# Naive Bayes Classifeir

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import math
from statistics import mean, stdev
from sklearn.metrics import accuracy_score
from sklearn import preprocessing, cross_validation
from sklearn.naive_bayes import GaussianNB

In [2]:
df = pd.read_csv('pima-indians-diabetes.csv')
df.head()

Unnamed: 0,NTP,PGC,DBP,TSFT,SI,BMI,DPI,A,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Data Set Information:

https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage. ADAP is an adaptive learning routine that generates and executes digital analogs of perceptron-like devices. It is a unique algorithm; see the paper for details.

Attribute Information:

1. Number of times pregnant
2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
3. Diastolic blood pressure (mm Hg)
4. Triceps skin fold thickness (mm)
5. 2-Hour serum insulin (mu U/ml)
6. Body mass index (weight in kg/(height in m)^2)
7. Diabetes pedigree function
8. Age (years)
9. Class variable (0 or 1)


In [3]:
X = np.array(df.drop('Class', axis=1))
X = preprocessing.scale(X)
y = np.array(df['Class'])

In [4]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

In [5]:
clf = GaussianNB()
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)
print(accuracy)

0.785714285714


# My implementation of Naive Bayes Classifier

In [6]:
class NaiveBayesClassifier:
        
    def separate_by_class(self):
        separated = {}
        for i in range(len(self.labels)):
            if (self.labels[i] not in separated):
                separated[self.labels[i]] = []
            separated[self.labels[i]].append(self.features[i])
        return separated
    
    def mean_and_std_by_class(self):
        separated = self.separate_by_class()
        summaries = {}
        for key, values in separated.items():
            summaries[key] = self.summarize(values)
        return summaries
    
    def summarize(self, values):
        summaries = [(mean(feature), stdev(feature)) for feature in list(zip(*values))]
        return summaries
    
    def norm(self, x, m, s):
        e = math.exp(-(math.pow(x - m, 2) / (2 * math.pow(s, 2))))
        return (1 / (math.sqrt(2 * math.pi) * s)) * e
    
    def class_probability(self, summaries, features):
        probabilities = {}
        for key, values in summaries.items():
            probabilities[key] = 1
            for i in range(len(values)):
                m, s = values[i]
                x = features[i]
                probabilities[key] *= self.norm(x, m, s)
        return probabilities
    
    def predict_class(self, X):
        probabilities = self.class_probability(self.summaries, X)
        label, probability = None, -1
        for key, value in probabilities.items():
            if label is None or value > probability:
                probability = value
                label = key
        return label
        
    def predict(self, X):
        predictions = []
        for i in range(len(X)):
            result = self.predict_class(X[i])
            predictions.append(result)
        return predictions
        
    def fit(self, features, labels):
        self.features = features
        self.labels = labels
        self.summaries = self.mean_and_std_by_class()

In [7]:
my_clf = NaiveBayesClassifier()

my_clf.fit(X_train, y_train)
predictions = my_clf.predict(X_test)

accuracy_of_mine = accuracy_score(y_test, predictions)
print(accuracy_of_mine)

0.746753246753
