In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [2]:
class NaiveBayesClassifier():
    def calc_prior(self, features, target):
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()
        return self.prior
    
    def calc_statistics(self, features, target):      
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def gaussian_density(self, class_idx, x):     
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (2 * var))
        #numerator = np.exp(-((x-mean)**2 / (2 * var)))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob
    
    def calc_posterior(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for i in range(self.count):
            prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
            posterior = prior + conditional
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
     

    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds

    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy
 
    def visualize(self, y_true, y_pred, target):
        
        tr = pd.DataFrame(data=y_true, columns=[target])
        pr = pd.DataFrame(data=y_pred, columns=[target])
        
        
        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))
        
        sns.countplot(x=target, data=tr, ax=ax[0], palette='viridis', alpha=0.7, hue=target, dodge=False)
        sns.countplot(x=target, data=pr, ax=ax[1], palette='viridis', alpha=0.7, hue=target, dodge=False)
        

        fig.suptitle('True vs Predicted Comparison', fontsize=20)

        ax[0].tick_params(labelsize=12)
        ax[1].tick_params(labelsize=12)
        ax[0].set_title("True values", fontsize=18)
        ax[1].set_title("Predicted values", fontsize=18)
        plt.show()

In [3]:
mydata = pd.read_csv("D:\Datasets\Dry_Bean_Dataset\Dry_Bean_Dataset.csv")
mydata.head()
mydata["Class"] = mydata["Class"].astype('category')
mydata.dtypes
mydata["Class"] = mydata["Class"].cat.codes
mydata.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,5
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,5
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,5
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,5
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,5


In [5]:
x=mydata.iloc[:, :-1]
y=mydata.iloc[:, -1]

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)
# train the model
x = NaiveBayesClassifier()
x.fit(X_train, y_train)
pred = x.predict(X_test)
x.accuracy(y_test, pred)

0.8942343004039662

In [7]:
y_test.value_counts(normalize=True)

3    0.269923
6    0.192802
4    0.143224
5    0.138083
2    0.128535
0    0.093647
1    0.033786
Name: Class, dtype: float64

In [8]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
acc=accuracy_score(y_test,pred)
print("Accuracy is")
print(acc)
print("Confusion Matrix")
cm=confusion_matrix(y_test,pred)
print(cm)
cr=classification_report(y_test,pred)
print("classification report:")
print(cr)

Accuracy is
0.8942343004039662
Confusion Matrix
[[184   0  51   0   2   2  16]
 [  0  92   0   0   0   0   0]
 [ 12   0 328   0   5   1   4]
 [  0   0   0 648   2  16  69]
 [  0   0  10   3 366   0  11]
 [  1   0   0   2   0 355  18]
 [  0   0   2  43  11   7 462]]
classification report:
              precision    recall  f1-score   support

           0       0.93      0.72      0.81       255
           1       1.00      1.00      1.00        92
           2       0.84      0.94      0.89       350
           3       0.93      0.88      0.91       735
           4       0.95      0.94      0.94       390
           5       0.93      0.94      0.94       376
           6       0.80      0.88      0.84       525

    accuracy                           0.89      2723
   macro avg       0.91      0.90      0.90      2723
weighted avg       0.90      0.89      0.89      2723



In [9]:
#applying gridsearchcv method to find the suitable parameters
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
param_grid_nb = {
 'var_smoothing': np.logspace(0,-9, num=100)
}
nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)
print(nbModel_grid.best_estimator_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
GaussianNB()


In [10]:
#root mean square error:
from sklearn.metrics import mean_squared_error
from math import sqrt
sqrt(mean_squared_error( y_test,pred))

0.9186523524671087