# Notebook cell In [29]:
from csv import reader
from math import ceil, exp, pi, sqrt
from random import randrange

class NaiveBayes():
    """Gaussian Naive Bayes classifier written with plain Python containers.

    ``fit`` groups the training rows by class label and stores, for every
    feature column of every class, the tuple ``(mean, stdev, count)``.
    ``predict`` multiplies the class prior by the Gaussian density of each
    feature value and returns the label with the largest product.
    """

    # Lower bound applied to a feature's standard deviation before it is used
    # in the Gaussian density. A feature that is constant within a class (or
    # a class with a single training row) has a standard deviation of 0,
    # which would otherwise cause a division by zero.
    _MIN_STDEV = 1e-9

    def __init__(self):
        """Create an unfitted model with no training data or summaries."""
        # Training rows and their labels, as handed to ``fit``.
        self.X_train = []
        self.y_train = []
        # Maps class label -> list of (mean, stdev, count), one per feature.
        self.summarize = {}

    def train_test_split(self, X, y, split):
        """Randomly partition ``X`` and ``y`` into training and test parts.

        Rows are drawn without replacement using ``random.randrange``; the
        same index is removed from both sequences so rows stay aligned with
        their labels. The inputs themselves are not modified.

        Args:
            X: Sequence of feature rows.
            y: Sequence of labels, aligned with ``X``.
            split: Fraction of rows (between 0 and 1 inclusive) to place in
                the training part. The row count is rounded up.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)`` of lists.

        Raises:
            ValueError: If ``split`` is outside the range 0..1.
        """
        if not 0 <= split <= 1:
            raise ValueError(
                "split must be between 0 and 1, got %r" % (split,)
            )

        X_test = list(X)
        y_test = list(y)

        # Round before taking the ceiling so that floating-point noise in the
        # product (a result a hair above a whole number) does not add an
        # extra training row.
        train_size = ceil(round(split * len(X_test), 9))

        X_train = []
        y_train = []
        while len(X_train) < train_size:
            index = randrange(len(X_test))
            X_train.append(X_test.pop(index))
            y_train.append(y_test.pop(index))

        return X_train, X_test, y_train, y_test

    def score(self, X_test, y_test):
        """Return the fraction of rows in ``X_test`` predicted correctly.

        Args:
            X_test: Sequence of feature rows to classify.
            y_test: Expected labels, aligned with ``X_test``.

        Returns:
            Accuracy as a float between 0 and 1.

        Raises:
            ZeroDivisionError: If ``y_test`` is empty.
        """
        predicted = [self.predict(row) for row in X_test]
        correct = sum(
            1 for i, actual in enumerate(y_test) if actual == predicted[i]
        )

        return correct / float(len(y_test))

    def separate_by_class(self):
        """Group the stored training rows by their class label.

        Returns:
            A dict mapping each label in ``self.y_train`` to the list of rows
            from ``self.X_train`` that carry that label.
        """
        separated = {}

        for i, vector in enumerate(self.X_train):
            separated.setdefault(self.y_train[i], []).append(vector)

        return separated

    def mean(self, numbers):
        """Return the arithmetic mean of ``numbers``."""
        return sum(numbers) / float(len(numbers))

    def stdev(self, numbers):
        """Return the sample standard deviation (n - 1 denominator).

        A single value has no spread to estimate, so 0.0 is returned for it
        instead of dividing by zero.
        """
        avg = self.mean(numbers)
        count = len(numbers)
        if count < 2:
            return 0.0

        variance = sum((x - avg) ** 2 for x in numbers) / float(count - 1)

        return sqrt(variance)

    def summarize_dataset(self, dataset):
        """Summarize every column of ``dataset``.

        Args:
            dataset: Sequence of equally long rows.

        Returns:
            A list with one ``(mean, stdev, count)`` tuple per column.
        """
        return [
            (self.mean(column), self.stdev(column), len(column))
            for column in zip(*dataset)
        ]

    def summarize_by_class(self):
        """Compute the per-column summaries separately for each class.

        Returns:
            A dict mapping class label to the result of
            ``summarize_dataset`` for that class's training rows.
        """
        return {
            class_value: self.summarize_dataset(rows)
            for class_value, rows in self.separate_by_class().items()
        }

    def calculate_probability(self, x, mean, stdev):
        """Return the Gaussian probability density of ``x``.

        Args:
            x: Value to evaluate.
            mean: Mean of the distribution.
            stdev: Standard deviation of the distribution. Values below
                ``_MIN_STDEV`` (including 0) are raised to that floor so the
                density stays defined.
        """
        stdev = max(stdev, self._MIN_STDEV)
        exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))

        return (1 / (sqrt(2 * pi) * stdev)) * exponent

    def calculate_class_probabilities(self, summaries, row):
        """Score ``row`` against every class in ``summaries``.

        Each score is the class prior (class row count divided by the total
        row count) multiplied by the Gaussian density of every feature value.
        The scores are not normalized to sum to 1.

        Args:
            summaries: Dict as produced by ``summarize_by_class``.
            row: Feature values, indexed like the per-class summaries.

        Returns:
            A dict mapping class label to its score.
        """
        # The row count is stored with every column; column 0 is used.
        total_rows = sum(
            class_summaries[0][2] for class_summaries in summaries.values()
        )
        probabilities = {}

        for class_value, class_summaries in summaries.items():
            likelihood = class_summaries[0][2] / float(total_rows)

            for i, (mean, stdev, _) in enumerate(class_summaries):
                likelihood *= self.calculate_probability(row[i], mean, stdev)

            probabilities[class_value] = likelihood

        return probabilities

    def predict(self, row):
        """Return the class label with the highest score for ``row``.

        Ties keep the class encountered first. If the model has no summaries
        (``fit`` has not been called), ``None`` is returned.
        """
        probabilities = self.calculate_class_probabilities(self.summarize, row)
        best_label, best_prob = None, -1

        for class_value, probability in probabilities.items():
            if best_label is None or probability > best_prob:
                best_label, best_prob = class_value, probability

        return best_label

    def fit(self, X_train, y_train):
        """Store the training data and compute the per-class summaries.

        Args:
            X_train: Sequence of feature rows.
            y_train: Sequence of labels, aligned with ``X_train``.
        """
        self.X_train = X_train
        self.y_train = y_train

        self.summarize = self.summarize_by_class()
  
    
if __name__ == "__main__":
    # Demo: train on a random 60% of the iris data, classify one
    # hand-written sample and print the accuracy on the remaining rows.
    from sklearn.datasets import load_iris

    iris = load_iris()

    # Feature matrix (X) and response vector (y).
    X, y = iris.data, iris.target

    model = NaiveBayes()
    X_train, X_test, y_train, y_test = model.train_test_split(X, y, 0.6)
    model.fit(X_train, y_train)

    sample = [4.8, 3.4, 1.6, 0.2]
    print(model.predict(sample))
    print(model.score(X_test, y_test))

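# Example output of the cell above (the score varies between runs because
# the train/test split is random):
# 0
# 0.9833333333333333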
