In [30]:
import numpy as np
import pandas as pd

## Load Iris Data

In [31]:
# Read the Iris dataset from a CSV file; the path is relative to the notebook's working directory.
iris_data = pd.read_csv("./data/IrisDataset.csv")

# Preview the first 15 rows as the cell output.
iris_data.head(15)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [32]:
# Count the rows per species to check how the classes are balanced.
iris_data["species"].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [33]:
def encode_species(specie):
    """
    Translate an Iris species name into its integer label.

    Parameters:
    ------------
    specie- str, one of "setosa", "versicolor" or "virginica"
    Returns:
    1, 2 or 3 respectively; None when the name matches none of the three.
    """
    label_by_name = (("setosa", 1), ("versicolor", 2), ("virginica", 3))
    for name, label in label_by_name:
        if specie == name:
            return label
    return None


def decode_predict(specie_encode):
    """
    Translate an integer class label back into the Iris species name.

    Parameters:
    ------------
    specie_encode- int, label 1, 2 or 3
    Returns:
    "setosa", "versicolor" or "virginica" respectively; None for any other value.
    """
    name_by_label = ((1, "setosa"), (2, "versicolor"), (3, "virginica"))
    for label, name in name_by_label:
        if specie_encode == label:
            return name
    return None



In [34]:
# Add an integer label column by applying encode_species to every species name.
# Note: this mutates iris_data in place; re-running the cell just overwrites the column.
iris_data["species_encode"] = iris_data["species"].apply(encode_species)

In [35]:
# Preview the first five rows, now including the species_encode column.
iris_data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_encode
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1


In [36]:
# Features: every column except the species name and its integer encoding.
trainX = iris_data.drop(columns=["species", "species_encode"]).values
# Target: the integer-encoded species.
trainY = iris_data["species_encode"].values

## Create NaiveBayesClassifier

In [37]:

# Naive Bayes Classifier Class

class NaiveBayesClassifier:
    """
    Gaussian Naive Bayes classifier for numeric features.

    For every class, each feature is modelled as an independent normal
    distribution whose mean and standard deviation are estimated in ``fit``.
    ``predict`` returns, for each row, the class with the largest posterior
    score (maximum a posteriori).
    """

    def __init__(self, min_std=1e-9):
        """
        Parameters:
        ------------
        min_std- float (> 0), lower bound applied to a feature's standard
                 deviation while scoring in ``predict``. It only matters for
                 features that are (nearly) constant within a class, where a
                 standard deviation of 0 would otherwise cause a division by zero.
        """
        self.min_std = min_std
        # Filled in by fit(); None marks the model as not fitted yet.
        self.class_summary = None

    # Separate the dataset into a subset of data for each class

    def separate_classes(self, X, y):
        """
        Separates the dataset into a subset of data for each class.
        Parameters:
        ------------
        X- array, list of feature rows
        y- list, target (y[i] is the class of X[i])
        Returns:
        A dictionary with the distinct values of y as keys and the list of
        rows of X belonging to that class as values.
        """
        separated_classes = {}
        for i in range(len(X)):
            separated_classes.setdefault(y[i], []).append(X[i])
        return separated_classes

    # Standard deviation and mean are required for the (Gaussian) distribution function

    def stat_info(self, X):
        """
        Calculates standard deviation and mean of every feature (column).
        Parameters:
        ------------
        X- array, list of feature rows
        Returns:
        A generator yielding one dictionary per feature, in column order, with
        the keys 'std' and 'mean'. The standard deviation is np.std's default
        (population, ddof=0).
        """
        # zip(*X) transposes the rows so that each item is one feature column.
        for feature in zip(*X):
            yield {
                'std': np.std(feature),
                'mean': np.mean(feature)
            }

    # Required fit method, to train the model

    def fit(self, X, y):
        """
        Trains the model.
        Parameters:
        ----------
        X: array-like, training features
        y: list, target variable
        Returns:
        Dictionary with the prior probability, mean, and standard deviation of each class
        (also stored on the instance as ``class_summary``).
        """

        separated_classes = self.separate_classes(X, y)
        self.class_summary = {}

        for class_name, feature_values in separated_classes.items():
            self.class_summary[class_name] = {
                # Prior = share of the training rows that belong to this class.
                'prior_proba': len(feature_values) / len(X),
                'summary': list(self.stat_info(feature_values)),
            }
        return self.class_summary

    # Gaussian distribution function

    def distribution(self, x, mean, std):
        """
        Gaussian Distribution Function
        Parameters:
        ----------
        x: float, value of feature
        mean: float, the average value of feature
        std: float, the standard deviation of feature (must be > 0)
        Returns:
        The value of the normal probability density at x
        """

        exponent = np.exp(-((x - mean) ** 2 / (2 * std ** 2)))

        return exponent / (np.sqrt(2 * np.pi) * std)

    # Logarithm of the Gaussian density, used by predict for numerical stability

    def _log_distribution(self, x, mean, std):
        """
        Natural logarithm of the Gaussian density, computed directly in log
        space so that very small densities do not underflow to zero.
        The standard deviation is floored at ``self.min_std`` so a feature that
        is constant within a class does not trigger a division by zero.
        """
        variance = max(std, self.min_std) ** 2
        return -0.5 * np.log(2 * np.pi * variance) - (x - mean) ** 2 / (2 * variance)

    # Required predict method, to predict the class

    def predict(self, X):
        """
        Predicts the class.
        Parameters:
        ----------
        X: array-like, test data set (one row of feature values per sample)
        Returns:
        -----------
        List of predicted class for each row of data set
        Raises:
        -----------
        AttributeError if the model has not been fitted yet.
        """
        if getattr(self, 'class_summary', None) is None:
            raise AttributeError(
                "NaiveBayesClassifier is not fitted yet; call fit(X, y) before predict(X)."
            )

        # Maximum a posteriori (MAP)

        MAPs = []

        for row in X:
            # Work with log(prior) + sum(log(likelihood)) instead of the raw
            # product: the logarithm is monotonic, so the winning class is the
            # same, but the score cannot underflow to 0.0 for every class.
            log_joint_proba = {}

            for class_name, features in self.class_summary.items():
                log_score = np.log(features['prior_proba'])

                for idx, stats in enumerate(features['summary']):
                    log_score += self._log_distribution(row[idx], stats['mean'], stats['std'])

                log_joint_proba[class_name] = log_score

            MAPs.append(max(log_joint_proba, key=log_joint_proba.get))

        return MAPs




## Use NaiveBayesClassifier

In [38]:
# Instantiate the hand-written classifier defined above.
model = NaiveBayesClassifier()

In [39]:
# Train on the full dataset; fit returns the per-class summary, shown as the cell output.
model.fit(trainX, trainY)

{1: {'prior_proba': 0.3333333333333333,
  'summary': [{'std': 0.3489469873777391, 'mean': 5.006},
   {'std': 0.37719490982779713, 'mean': 3.418},
   {'std': 0.17176728442867112, 'mean': 1.464},
   {'std': 0.10613199329137281, 'mean': 0.244}]},
 2: {'prior_proba': 0.3333333333333333,
  'summary': [{'std': 0.5109833656783751, 'mean': 5.936},
   {'std': 0.31064449134018135, 'mean': 2.7700000000000005},
   {'std': 0.4651881339845203, 'mean': 4.26},
   {'std': 0.19576516544063705, 'mean': 1.3259999999999998}]},
 3: {'prior_proba': 0.3333333333333333,
  'summary': [{'std': 0.6294886813914926, 'mean': 6.587999999999998},
   {'std': 0.3192553836664309, 'mean': 2.974},
   {'std': 0.546347874526844, 'mean': 5.5520000000000005},
   {'std': 0.2718896835115301, 'mean': 2.0260000000000002}]}}

In [40]:
# Measurements of the single flower to classify.
sepal_length = 5.7
sepal_width = 3.2
petal_length = 5.1
petal_width = 1.6

# predict iterates over rows, so the single sample is wrapped in an outer list;
# the feature order matches the column order used to build trainX.
test_data = [[sepal_length, sepal_width, petal_length, petal_width]]

pred_y = model.predict(test_data)

# pred_y holds one encoded label per row; decode the only one back to a species name.
print(f"Sepal Length: {sepal_length}, Sepal Width: {sepal_width}, Petal Length: {petal_length}, Petal Width: {petal_width}, \nClassification Result : {decode_predict(pred_y[0])}")

Sepal Length: 5.7, Sepal Width: 3.2, Petal Length: 5.1, Petal Width: 1.6, 
Classification Result : virginica


## Use Sklearn Naive Bayes Classifier

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
# Reference implementation: scikit-learn's Gaussian Naive Bayes with default settings,
# trained on the same arrays as the hand-written model.
clf = GaussianNB()

clf.fit(trainX, trainY)

GaussianNB()

In [43]:
# Same measurements as used for the hand-written classifier, so both predictions can be compared.
sepal_length = 5.7
sepal_width = 3.2
petal_length = 5.1
petal_width = 1.6

# A single sample wrapped in an outer list (one row, four features).
test_data = [[sepal_length, sepal_width, petal_length, petal_width]]

pred_clf = clf.predict(test_data)

In [44]:
# Decode the first (only) predicted label from scikit-learn back to a species name and report it.
print(f"Sepal Length: {sepal_length}, Sepal Width: {sepal_width}, Petal Length: {petal_length}, Petal Width: {petal_width}, \nClassification Result : {decode_predict(pred_clf[0])}")

Sepal Length: 5.7, Sepal Width: 3.2, Petal Length: 5.1, Petal Width: 1.6, 
Classification Result : virginica
