# Coding Assignment 4: Implement a Naive Bayes classifier for the Iris dataset

# Libraries

In [7]:
##Necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the dataset

In [8]:
# Fetch the Iris dataset through scikit-learn's loader.
data = load_iris()

# Convert the feature matrix and the label vector to plain Python lists so
# that the resulting DataFrame has one 'data' entry (a list of feature
# values) and one 'target' entry (the class label) per sample.
temp = {
    'data': data['data'].tolist(),
    'target': data['target'].tolist(),
}
df = pd.DataFrame(temp)

In [9]:
# Split the frame into the feature column (X) and the label column (y).
X, y = df['data'], df['target']

# Create a class named "NaiveBaiyes" that divides the whole algorithm into a number of methods

In [18]:
class NaiveBaiyes:
    """
    Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and standard deviation are estimated in `fit`.
    `predict` then picks, for each sample, the class with the highest
    posterior score (prior times the product of per-feature densities,
    evaluated in log space).
    """

    # Lower bound applied to a standard deviation before it is used as a
    # divisor. A feature that is constant within a class has stdev == 0,
    # which would otherwise produce NaN/inf densities.
    _MIN_STDEV = 1e-9

    def __init__(self):
        # No configuration is needed; the model state (`class_summary`)
        # is created by `fit`.
        pass

    def separate_classes(self, X, y):
        """
        Groups the samples by class label.

        X is an iterable of samples (each sample a sequence of feature
        values) and y the matching sequence of labels. Samples and labels
        are paired by position, so pandas Series with an arbitrary index
        (e.g. after a train/test split) are handled as well.

        Returns a dictionary with the labels as keys and, as values, the
        list of samples assigned to that label.
        Raises ValueError when X and y have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got %d and %d"
                % (len(X), len(y))
            )
        separated_classes = {}
        for feature_values, class_name in zip(X, y):
            # setdefault creates the bucket the first time a label is seen
            separated_classes.setdefault(class_name, []).append(feature_values)
        return separated_classes

    def summarize(self, X):
        """
        Calculates mean and standard deviation for each column of X.

        X is a collection of samples; zip(*X) transposes it so that every
        iteration sees all values of one feature. Yields one dictionary
        per feature with the keys 'stdev' (population standard deviation,
        numpy's default) and 'mean'.
        """
        for feature in zip(*X):
            yield {
                'stdev': np.std(feature),
                'mean': np.mean(feature)
            }

    def fit(self, X, y):
        """
        Trains the model.

        Stores in `self.class_summary`, and returns, a dictionary that maps
        every class label to
          - 'prior_proba': share of the training samples with that label
          - 'summary': list of {'stdev', 'mean'} dictionaries, one per feature
        """
        separated_classes = self.separate_classes(X, y)
        total_samples = len(X)
        self.class_summary = {}
        for class_name, feature_values in separated_classes.items():
            # Prior probability, mean and std deviation for each target category
            self.class_summary[class_name] = {
                'prior_proba': len(feature_values) / total_samples,
                'summary': list(self.summarize(feature_values))
            }
        return self.class_summary

    def gauss_distribution_function(self, x, mean, stdev):
        """
        Gaussian probability density of x for the given mean and stdev.

        stdev is floored at `_MIN_STDEV` so that a zero standard deviation
        does not cause a division by zero.
        """
        stdev = np.maximum(stdev, self._MIN_STDEV)
        exponent = np.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return exponent / (np.sqrt(2 * np.pi) * stdev)

    def _log_gauss(self, x, mean, stdev):
        """
        Natural logarithm of the Gaussian density, computed directly so
        that very small densities do not underflow to zero.
        """
        stdev = np.maximum(stdev, self._MIN_STDEV)
        return (
            -0.5 * np.log(2 * np.pi)
            - np.log(stdev)
            - (x - mean) ** 2 / (2 * stdev ** 2)
        )

    def predict(self, X):
        """
        Predicts the class of every sample in X.

        X is an iterable of samples, each providing at least as many
        feature values as the model was trained on. Returns a list with
        one predicted label per sample. `fit` must have been called first.
        """
        predictions = []
        for row in X:
            joint_log_proba = {}
            for class_name, features in self.class_summary.items():
                # Sum log densities instead of multiplying densities: the
                # ranking of the classes is the same, but the product of
                # many values below 1 would underflow to 0.0 for all classes.
                log_likelihood = 0.0
                for idx, stats in enumerate(features['summary']):
                    log_likelihood += self._log_gauss(
                        row[idx], stats['mean'], stats['stdev']
                    )
                # log(prior) + log(likelihood) is the unnormalised log posterior
                joint_log_proba[class_name] = (
                    np.log(features['prior_proba']) + log_likelihood
                )
            # The class with the highest score wins
            predictions.append(max(joint_log_proba, key=joint_log_proba.get))
        return predictions

In [19]:
model = NaiveBaiyes() # Create a classifier instance
model.fit(X, y) # Train on the samples; the returned per-class summary is the cell output

{0: {'prior_proba': 0.3333333333333333,
  'summary': [{'stdev': 0.3489469873777391, 'mean': 5.006},
   {'stdev': 0.37525458025186054, 'mean': 3.428},
   {'stdev': 0.17191858538273283, 'mean': 1.4620000000000002},
   {'stdev': 0.1043264108459598, 'mean': 0.24599999999999997}]},
 1: {'prior_proba': 0.3333333333333333,
  'summary': [{'stdev': 0.5109833656783751, 'mean': 5.936},
   {'stdev': 0.31064449134018135, 'mean': 2.7700000000000005},
   {'stdev': 0.4651881339845203, 'mean': 4.26},
   {'stdev': 0.19576516544063705, 'mean': 1.3259999999999998}]},
 2: {'prior_proba': 0.3333333333333333,
  'summary': [{'stdev': 0.6294886813914926, 'mean': 6.587999999999998},
   {'stdev': 0.3192553836664309, 'mean': 2.974},
   {'stdev': 0.546347874526844, 'mean': 5.5520000000000005},
   {'stdev': 0.2718896835115301, 'mean': 2.0260000000000002}]}}

# Test the model for given samples

In [20]:
# Three hand-picked samples, each with four feature values in the same
# order as the training data.
testSet = [
    [5.0, 3.1, 2.4, 1.6],
    [5.0, 3.1, 2.1, 0.6],
    [6.0, 3.1, 5.5, 2.0],
]
# Predict one label per sample; the bare name below displays the result.
y_pred = model.predict(testSet)
y_pred

[1, 0, 2]