submitted by, Tarang Ranpara (202011057)

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv("https://raw.githubusercontent.com/TarangRanpara/Machine-learning-algos/master/Iris-data-KNN/Iris.csv")

In [3]:
dataset.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
dataset = dataset.drop(['Id'], axis=1)
dataset

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
dataset['Species'].value_counts()

Iris-setosa        50
Iris-virginica     50
Iris-versicolor    50
Name: Species, dtype: int64

In [6]:
class NaiveBayes:
    '''
    Bayes Theorem form
    P(y|X) = P(X|y) * P(y) / P(X)
    '''
    def calc_prior(self, features, target):
        '''
        prior probability P(y)
        '''
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()

        return self.prior
    
    def calc_statistics(self, features, target):
        '''
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        '''
        calculate probability from gaussian density function (normally distributed)
        we will assume that probability of specific target value given specific class is normally distributed 
        
        probability density function:
        (1/√2pi*σ) * exp((-1/2)*((x-μ)^2)/(2*σ²)), where μ is mean, σ² is variance, σ is square root of variance (standard deviation)
        '''
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []
        # calculate posterior probability for each class
        for i in range(self.count):
            
            # using the log to make it more numerically stable
            prior = np.log(self.prior[i]) 
            
            # use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) 
            
            posterior = prior + conditional
            posteriors.append(posterior)
            
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

In [7]:
from sklearn.model_selection import train_test_split

acc_list =[]
for i in range(10):
    grouped = dataset.groupby(dataset.Species)
    sentosa = grouped.get_group("Iris-setosa")
    virginica = grouped.get_group("Iris-virginica")
    versicolor = grouped.get_group("Iris-versicolor")

    x_sentosa, y_sentosa = sentosa.iloc[:, :-1], sentosa.iloc[:, -1]
    x_virginica, y_virginica = virginica.iloc[:, :-1], virginica.iloc[:, -1]
    x_versicolor, y_versicolor = versicolor.iloc[:, :-1], versicolor.iloc[:, -1]

    x_train_sentosa, x_test_sentosa, y_train_sentosa, y_test_sentosa = train_test_split(x_sentosa,y_sentosa, test_size=0.20)
    x_train_virginica, x_test_virginica, y_train_virginica, y_test_virginica = train_test_split(x_virginica,y_virginica, test_size=0.20)
    x_train_versicolor, x_test_versicolor, y_train_versicolor, y_test_versicolor = train_test_split(x_versicolor,y_versicolor, test_size=0.20)

    x_train = pd.concat([x_train_sentosa, x_train_virginica, x_train_versicolor])
    y_train = pd.concat([y_train_sentosa, y_train_virginica, y_train_versicolor])

    x_test = pd.concat([x_test_sentosa, x_test_virginica, x_test_versicolor])
    y_test = pd.concat([y_test_sentosa, y_test_virginica, y_test_versicolor])

    nb = NaiveBayes()
    nb.fit(x_train, y_train)
    acc_list.append(nb.accuracy(y_test, nb.predict(x_test)))

for i in range(10):
    print(f'iter: {i+1} - {round(acc_list[i] * 100, 2)}%')
    
print(f'\nAvg Accuracy: {round(sum(acc_list)/10*100, 2)}%')

iter: 1 - 100.0%
iter: 2 - 96.67%
iter: 3 - 96.67%
iter: 4 - 96.67%
iter: 5 - 96.67%
iter: 6 - 93.33%
iter: 7 - 100.0%
iter: 8 - 96.67%
iter: 9 - 96.67%
iter: 10 - 100.0%

Avg Accuracy: 97.33%
