# EXPERIMENT 3

### SHIVAM SINGHAL 2K18/CO/340

AIM - Write a program to implement the Naïve Bayesian classifier for appropriate dataset and compute
the performance measures of the model.

In [7]:
import numpy as np
import pandas as pd
import math

In [19]:
# gaussClf will be the class that will have the Gaussian naive bayes classifier implimentation
class gaussClf:
    def separate_by_classes(self, X, y):
        ''' This function separates our dataset in subdatasets by classes '''
        self.classes = np.unique(y)
        classes_index = {}
        subdatasets = {}
        cls, counts = np.unique(y, return_counts=True)
        self.class_freq = dict(zip(cls, counts))
        print(self.class_freq)
        for class_type in self.classes:
            classes_index[class_type] = np.argwhere(y==class_type)
            subdatasets[class_type] = X[classes_index[class_type], :]
            self.class_freq[class_type] = self.class_freq[class_type]/sum(list(self.class_freq.values()))
        return subdatasets
    def fit(self, X, y):
        ''' The fitting function '''
        separated_X = self.separate_by_classes(X, y)
        self.means = {}
        self.std = {}
        for class_type in self.classes:
            # Here we calculate the mean and the standart deviation from datasets
            self.means[class_type] = np.mean(separated_X[class_type], axis=0)[0]
            self.std[class_type] = np.std(separated_X[class_type], axis=0)[0]
    def calculate_probability(self, x, mean, stdev):
        ''' This function calculates the class probability using gaussian distribution '''
        exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
    def predict_proba(self, X):
        ''' This function predicts the probability for every class '''
        self.class_prob = {cls:math.log(self.class_freq[cls], math.e) for cls in self.classes}
        for cls in self.classes:
            for i in range(len(self.means)):
                self.class_prob[cls]+=math.log(self.calculate_probability(X[i], self.means[cls][i], self.std[cls][i]), math.e)
        self.class_prob = {cls: math.e**self.class_prob[cls] for cls in self.class_prob}
        return self.class_prob
    def predict(self, X):
        ''' This funtion predicts the class of a sample '''
        pred = []
        for x in X:
            pred_class = None
            max_prob = 0
            for cls, prob in self.predict_proba(x).items():
                if prob>max_prob:
                    max_prob = prob
                    pred_class = cls
            pred.append(pred_class)
        return pred

In [20]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
header = ['sepal length in cm' , 'sepal width in cm' , 'petal length in cm' , 'petal width in cm' , 'class']

In [21]:
df = pd.read_csv(url , names = header)
df = df.replace(df['class'].unique() , [0,1,2])

In [22]:
df.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [23]:
X = np.array(df)[: , 0:4]
Y = np.array(df)[: , 4]

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, Y,
                                                    shuffle=True, stratify=Y,
                                                    test_size=0.3, random_state=2)
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((105, 4), (45, 4), (105,), (45,))

In [25]:
naive_bayes = gaussClf()

In [26]:
naive_bayes.fit(X = X_train, y=y_train)

{0.0: 35, 1.0: 35, 2.0: 35}


In [27]:
y_predict = naive_bayes.predict(X_valid)

In [30]:
len(y_predict)

45

In [31]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_valid , y_predict)

array([[15,  0,  0],
       [ 0, 13,  2],
       [ 0,  2, 13]], dtype=int64)

In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_valid , y_predict))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        15
         1.0       0.87      0.87      0.87        15
         2.0       0.87      0.87      0.87        15

    accuracy                           0.91        45
   macro avg       0.91      0.91      0.91        45
weighted avg       0.91      0.91      0.91        45

