In [201]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split 
import scipy.stats
from sklearn.naive_bayes import GaussianNB 
from sklearn import metrics 

In [202]:
iris = load_iris() 
data_list = iris.data 
target_list = iris.target 

In [203]:
# function for finding the gassian pdf
def gauss_pdf(i, mean, variance) : 
  return (1/(np.sqrt(variance) * np.sqrt(2 * np.pi)) * np.exp( - (i - mean)**2 / (2 * variance)))

# function for obtaining naive bayes classifier if we have 12 pdfs for each feature in each class
def bayes_dec(test_set, result_list) : 
  
  p1_f1 = gauss_pdf(i[0], f1_c1_mean, f1_c1_std**2)
  p1_f2 = gauss_pdf(i[1], f2_c1_mean, f2_c1_std**2)
  p1_f3 = gauss_pdf(i[2], f3_c1_mean, f3_c1_std**2)
  p1_f4 = gauss_pdf(i[3], f4_c1_mean, f4_c1_std**2)

  p2_f1 = gauss_pdf(i[0], f1_c2_mean, f1_c2_std**2)
  p2_f2 = gauss_pdf(i[1], f2_c2_mean, f2_c2_std**2)
  p2_f3 = gauss_pdf(i[2], f3_c2_mean, f3_c2_std**2)
  p2_f4 = gauss_pdf(i[3], f4_c2_mean, f4_c2_std**2)

  p3_f1 = gauss_pdf(i[0], f1_c3_mean, f1_c3_std**2)
  p3_f2 = gauss_pdf(i[1], f2_c3_mean, f2_c3_std**2)
  p3_f3 = gauss_pdf(i[2], f3_c3_mean, f3_c3_std**2)
  p3_f4 = gauss_pdf(i[3], f4_c3_mean, f4_c3_std**2)

  p1 = p1_f1*p1_f2*p1_f3*p1_f4
  p2 = p2_f1*p2_f2*p2_f3*p2_f4
  p3 = p3_f1*p3_f2*p3_f3*p3_f4

  if max(p1, p2, p3) == p1:
    result_list.append(0)
  elif max(p1,p2,p3) == p2:
    result_list.append(1)
  else:
    result_list.append(2)

# function for gaussian naive bayes using sklearn
def gaussian_nb(X_train, X_test, Y_train, Y_test, result_list) : 
  gnb = GaussianNB() 
  gnb.fit(X_train, Y_train) 
  y_pred = gnb.predict(X_test) 
  res.append(metrics.accuracy_score(Y_test, y_pred)*100)

In [204]:
print("Experiment with Naive Bayes Classification from scratch")
print("=======================================================\n")

itr = 10
accuracy1 = []
accuracy2 = []

for k in range(itr) : 

  X1_train, X1_test, Y1_train, Y1_test = train_test_split(data_list[0:50], target_list[0:50], test_size = 0.2)
  X2_train, X2_test, Y2_train, Y2_test = train_test_split(data_list[50:100], target_list[50:100], test_size = 0.2)
  X3_train, X3_test, Y3_train, Y3_test = train_test_split(data_list[100:150], target_list[100:150], test_size = 0.2)

  # calculating mean across Columns of all three classes
  column_mean1 = np.mean(X1_train, axis=0) 
  f1_c1_mean = round(column_mean1[0],2)   
  f2_c1_mean = round(column_mean1[1],2)  
  f3_c1_mean = round(column_mean1[2],2) 
  f4_c1_mean = round(column_mean1[3],2) 

  column_mean2 = np.mean(X2_train, axis=0) 
  f1_c2_mean = round(column_mean2[0],2)   
  f2_c2_mean = round(column_mean2[1],2)  
  f3_c2_mean = round(column_mean2[2],2) 
  f4_c2_mean = round(column_mean2[3],2) 

  column_mean3 = np.mean(X3_train, axis=0) 
  f1_c3_mean = round(column_mean3[0],2)   
  f2_c3_mean = round(column_mean3[1],2)  
  f3_c3_mean = round(column_mean3[2],2) 
  f4_c3_mean = round(column_mean3[3],2) 

  # Calculating standard deviation across Columns of all three classes
  column_std1 = np.std(X1_train, axis=0) 
  f1_c1_std = round(column_std1[0],2)
  f2_c1_std = round(column_std1[1],2)  
  f3_c1_std = round(column_std1[2],2) 
  f4_c1_std = round(column_std1[3],2) 

  column_std2 = np.std(X2_train, axis=0) 
  f1_c2_std = round(column_std2[0],2)   
  f2_c2_std = round(column_std2[1],2)  
  f3_c2_std = round(column_std2[2],2) 
  f4_c2_std = round(column_std2[3],2)

  column_std3 = np.std(X3_train, axis=0) 
  f1_c3_std = round(column_std3[0],2)   
  f2_c3_std = round(column_std3[1],2)  
  f3_c3_std = round(column_std3[2],2) 
  f4_c3_std = round(column_std3[3],2)

  # initialising the result array for all three class train sets
  res1 = []
  res2 = []
  res3 = []
  acc = 0

  # running naive bayes for each test class
  for i in X1_test : 
    bayes_dec(X1_test, res1)
  for i in X2_test : 
    bayes_dec(X2_test, res2)
  for i in X3_test : 
    bayes_dec(X3_test, res3)

  # comparing result with target set 
  for i in range(10) : 
    if res1[i] == Y1_test[i] : 
      acc += 1
  for i in range(10) :
    if res2[i] == Y2_test[i] : 
      acc += 1
  for i in range(10) : 
    if res3[i] == Y3_test[i] :
      acc += 1

  accuracy1.append(acc/30*100)
  print("Accuracy using own approach for iteration no.", k+1, "-->",(round(acc/30*100,2)))

  # performing gaussian naive bayes using sklearn on each class separately
  res = []
  gaussian_nb(X1_train, X1_test, Y1_train, Y1_test, res)
  gaussian_nb(X2_train, X2_test, Y2_train, Y2_test, res)
  gaussian_nb(X3_train, X3_test, Y3_train, Y3_test, res)

  accuracy2.append(np.average(res))
  print("Accuracy using Scikit-learn for iteration no.", k+1, "-->", np.average(res), "\n")

print("================================================================")
print("Average Accuracy using own approach over", itr, "iterations -->", round(np.average(accuracy1),2))
print("Average Accuracy using Scikit-learn over", itr, "iterations -->", round(np.average(accuracy2),2))

Experiment with Naive Bayes Classification from scratch

Accuracy using own approach for iteration no. 1 --> 100.0
Accuracy using Scikit-learn for iteration no. 1 --> 100.0 

Accuracy using own approach for iteration no. 2 --> 96.67
Accuracy using Scikit-learn for iteration no. 2 --> 100.0 

Accuracy using own approach for iteration no. 3 --> 100.0
Accuracy using Scikit-learn for iteration no. 3 --> 100.0 

Accuracy using own approach for iteration no. 4 --> 90.0
Accuracy using Scikit-learn for iteration no. 4 --> 100.0 

Accuracy using own approach for iteration no. 5 --> 100.0
Accuracy using Scikit-learn for iteration no. 5 --> 100.0 

Accuracy using own approach for iteration no. 6 --> 93.33
Accuracy using Scikit-learn for iteration no. 6 --> 100.0 

Accuracy using own approach for iteration no. 7 --> 96.67
Accuracy using Scikit-learn for iteration no. 7 --> 100.0 

Accuracy using own approach for iteration no. 8 --> 96.67
Accuracy using Scikit-learn for iteration no. 8 --> 100.0 

