In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin

In [None]:
class MyGaussianNB(ClassifierMixin, BaseEstimator):
    """Gaussian Naive Bayes classifier with a scikit-learn style interface.

    Every feature is modelled, per class, as an independent normal
    distribution. After ``fit`` the learned model is available as
    ``feature_calculations`` with the structure::

        {class_label: {'prior_prob': float,
                       'calc_values': [{'mean': float, 'std': float}, ...]}}

    where ``calc_values`` holds one entry per feature, in column order, and
    ``std`` is the population standard deviation (``np.std`` default).

    Mixins are listed before ``BaseEstimator``, which is the base-class order
    recommended by the scikit-learn developer guide.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data that
        is added to every class variance at prediction time. This keeps the
        density finite when a feature is constant within a class.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, Xt, yt):
        """Estimate the class priors and per-class feature mean/std.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Numeric training data (arrays, nested lists or DataFrames).
        yt : array-like of shape (n_samples,)
            Class label of every training row.

        Returns
        -------
        self : MyGaussianNB
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``Xt`` is not a non-empty 2-D table or its row count differs
            from the number of labels.
        """
        # Converting up front makes positional access safe for pandas inputs,
        # whose [] operator is label-based rather than positional.
        features = np.asarray(Xt, dtype=float)
        labels = np.asarray(yt).ravel()
        if features.ndim != 2:
            raise ValueError("Xt must be 2-dimensional (n_samples, n_features)")
        if features.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("Xt and yt must contain the same number of samples")

        # Kept as attributes because the estimator has always exposed them.
        self.Xt = Xt
        self.yt = yt

        # Variance floor: a fraction of the largest feature variance, never
        # smaller than machine epsilon so that it cannot collapse to zero.
        largest_variance = float(features.var(axis=0).max()) if features.shape[1] else 0.0
        self.epsilon_ = max(self.var_smoothing * largest_variance, float(np.finfo(float).eps))

        # Sorted labels for scikit-learn tooling; the model dictionary keeps
        # the order in which classes first appear in yt.
        unique_labels, first_seen = np.unique(labels, return_index=True)
        self.classes_ = unique_labels

        self.feature_calculations = {}
        for label in unique_labels[np.argsort(first_seen)]:
            class_rows = features[labels == label]
            means = class_rows.mean(axis=0)
            stds = class_rows.std(axis=0)
            self.feature_calculations[label] = {
                'prior_prob': len(class_rows) / len(features),
                'calc_values': [{'mean': m, 'std': s} for m, s in zip(means, stds)],
            }
        return self

    def _smoothed_variance(self, std):
        """Return ``std ** 2`` plus the variance floor learned in ``fit`` (0 before fitting)."""
        return np.square(std) + getattr(self, 'epsilon_', 0.0)

    def GaussianNBCalc(self, mean, std, x):
        """Return the normal density of ``x`` for the given ``mean`` and ``std``.

        Once the estimator is fitted the variance floor is added to
        ``std ** 2``, so a zero ``std`` no longer divides by zero. Arguments
        may be scalars or NumPy arrays that broadcast together.
        """
        variance = self._smoothed_variance(std)
        exponent = -np.square(x - mean) / (2 * variance)
        return np.exp(exponent) / np.sqrt(2 * math.pi * variance)

    def predict(self, Xtes):
        """Predict the most probable class for every row of ``Xtes``.

        Scores are accumulated as log prior + sum of log densities: a plain
        product of many small densities underflows to zero for all classes.

        Parameters
        ----------
        Xtes : array-like of shape (n_samples, n_features)
            Numeric data with the same columns that were used in ``fit``.

        Returns
        -------
        list
            One predicted class label per row (an empty list for empty input).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called (subclass of both ``ValueError``
            and ``AttributeError``).
        ValueError
            If ``Xtes`` is not 2-D or its number of columns differs from the
            training data.
        """
        if not hasattr(self, 'feature_calculations'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("MyGaussianNB must be fitted before calling predict")

        # Kept as an attribute because the estimator has always exposed it.
        self.Xtes = Xtes

        samples = np.asarray(Xtes, dtype=float)
        if samples.ndim >= 1 and samples.shape[0] == 0:
            return []
        if samples.ndim != 2:
            raise ValueError("Xtes must be 2-dimensional (n_samples, n_features)")

        class_labels = list(self.feature_calculations)
        n_features = len(self.feature_calculations[class_labels[0]]['calc_values'])
        if samples.shape[1] != n_features:
            raise ValueError(
                "Xtes has %d features but the model was fitted with %d"
                % (samples.shape[1], n_features)
            )

        joint_log_prob = np.empty((samples.shape[0], len(class_labels)))
        for column, label in enumerate(class_labels):
            class_val = self.feature_calculations[label]
            means = np.array([c['mean'] for c in class_val['calc_values']], dtype=float)
            stds = np.array([c['std'] for c in class_val['calc_values']], dtype=float)
            variances = self._smoothed_variance(stds)
            # Sum over features of log N(x | mean, variance).
            log_likelihood = -0.5 * np.sum(
                np.log(2 * math.pi * variances) + np.square(samples - means) / variances,
                axis=1,
            )
            joint_log_prob[:, column] = np.log(class_val['prior_prob']) + log_likelihood

        # argmax keeps the first class on ties, like max() over the dictionary.
        winners = joint_log_prob.argmax(axis=1)
        return [class_labels[i] for i in winners]

    def score(self, Xtes, Ytes):
        """Return the accuracy of ``predict(Xtes)`` against the labels ``Ytes``.

        Parameters
        ----------
        Xtes : array-like of shape (n_samples, n_features)
            Data to classify.
        Ytes : array-like of shape (n_samples,)
            True class labels; compared positionally, so a pandas Series with
            any index is handled correctly.

        Returns
        -------
        float
            Fraction of rows whose predicted label equals the true label.

        Raises
        ------
        ValueError
            If there are no samples or the number of labels differs from the
            number of predictions.
        """
        # Kept as attributes because the estimator has always exposed them.
        self.Xtes = Xtes
        self.Ytes = Ytes

        predicted = np.asarray(self.predict(Xtes))
        actual = np.asarray(Ytes).ravel()
        if actual.shape[0] == 0:
            raise ValueError("cannot compute accuracy on an empty test set")
        if predicted.shape[0] != actual.shape[0]:
            raise ValueError("Xtes and Ytes must contain the same number of samples")
        return float(np.mean(predicted == actual))
            

In [None]:
# One instance of each classifier; the dataset cells below refit both of them.
mgnb, gnb = MyGaussianNB(), GaussianNB()

In [None]:
"""
Penguins Dataset is used as the test data.
The test data only includes two classes of the penguins which is present in the data
"""

penguins_af = pd.read_csv('penguins_af.csv', index_col=0)

# Keep only the numeric measurements: Gaussian Naive Bayes models each feature as a continuous value.
f_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins = penguins_af[f_names + ['species']]
penguins2C = penguins.loc[penguins['species'].isin(['Adelie', 'Chinstrap'])]

# Separate the target from the features without mutating the filtered frame in place.
y = penguins2C['species'].values
penguins2C = penguins2C.drop(columns='species')
X_raw = penguins2C.values
feature_names = penguins2C.columns

X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=3, test_size=1/2)

# The scaler is fitted on the training half only and then applied to the test half.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)

# score() runs predict() itself, so no separate predict() call is needed here.
mgnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Compare the accuracy of MyGaussianNB with scikit-learn's GaussianNB
# (both are expected to report the same accuracy).
print("Accuracy:")
print("My Gaussian Naive Bayes :", mgnb.score(X_test, y_test))
print("Gaussian Naive Bayes Scikit Learn :", gnb.score(X_test, y_test))

In [None]:
"""
Penguins Dataset is used as the test data.
The test data includes all the classes of the penguins which is present in the data
"""

penguins_af = pd.read_csv('penguins_af.csv', index_col=0)

# Keep only the numeric measurements: Gaussian Naive Bayes models each feature as a continuous value.
f_names = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
penguins = penguins_af[f_names + ['species']]

# Separate the target from the features without mutating the selected frame in place.
y = penguins['species'].values
penguins = penguins.drop(columns='species')
X_raw = penguins.values
feature_names = penguins.columns

X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=3, test_size=1/2)

# The scaler is fitted on the training half only and then applied to the test half.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)

# score() runs predict() itself, so no separate predict() call is needed here.
mgnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Compare the accuracy of MyGaussianNB with scikit-learn's GaussianNB
# (both are expected to report the same accuracy).
print("Accuracy:")
print("My Gaussian Naive Bayes :", mgnb.score(X_test, y_test))
print("Gaussian Naive Bayes Scikit Learn :", gnb.score(X_test, y_test))

In [None]:
"""
Diabetes Dataset is used as the test data.
The test data includes all the classes which is present in the data
"""

diabetes = pd.read_csv('diabetes.csv', index_col=0)

# 'neg_pos' is the target column; every remaining column is used as a feature.
y = diabetes.pop('neg_pos').values
X_raw = diabetes.values
feature_names = diabetes.columns

X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=3, test_size=1/2)

# The scaler is fitted on the training half only and then applied to the test half.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)

# score() runs predict() itself, so no separate predict() call is needed here.
mgnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Compare the accuracy of MyGaussianNB with scikit-learn's GaussianNB
# (both are expected to report the same accuracy).
print("Accuracy:")
print("My Gaussian Naive Bayes :", mgnb.score(X_test, y_test))
print("Gaussian Naive Bayes Scikit Learn :", gnb.score(X_test, y_test))

In [None]:
"""
Hotel Review Helpfulness Dataset is used as the test data.
The test data includes all the classes which is present in the data
"""

hotel = pd.read_csv('HotelRevHelpfulnessV2.csv', index_col=0)

# 'reviewHelpfulness' is the target column; every remaining column is used as a feature.
y = hotel.pop('reviewHelpfulness').values
X_raw = hotel.values
feature_names = hotel.columns

X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(X_raw, y, random_state=3, test_size=1/2)

# The scaler is fitted on the training half only and then applied to the test half.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_tr_raw)
X_test = scaler.transform(X_ts_raw)

# score() runs predict() itself, so no separate predict() call is needed here.
mgnb.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Compare the accuracy of MyGaussianNB with scikit-learn's GaussianNB
# (both are expected to report the same accuracy; the author notes it is lower on
# this dataset because all the numerical features are considered).
print("Accuracy:")
print("My Gaussian Naive Bayes :", mgnb.score(X_test, y_test))
print("Gaussian Naive Bayes Scikit Learn :", gnb.score(X_test, y_test))