# MyGaussianNB Classifier
A simple example of a classifier in the `sklearn` framework.   
https://sklearn-template.readthedocs.io/en/latest/user_guide.html   
This classifier models every feature as a class-conditional Gaussian and predicts the class with the highest posterior probability.  
Implementing the classifier entails defining the `fit` and `predict` methods. 

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter

from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

from sklearn.model_selection import cross_val_score, cross_validate

from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


## My Gaussian Naive Bayes  Classifier
An implementation of a Gaussian Naive Bayes classifier that fits into the scikit-learn framework.

In [2]:
class MyGaussianNB(BaseEstimator, ClassifierMixin):
    """Gaussian Naive Bayes classifier that plugs into the scikit-learn API.

    Every feature is modelled as an independent Gaussian per class. Prediction
    picks the class that maximises ``log P(class) + sum_j log N(x_j | mu, sigma)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data that
        is added to every class variance. It keeps the density finite when a
        feature is constant within a class (as long as at least one feature
        varies over the whole training set).

    Attributes (set by ``fit``)
    ---------------------------
    classes_ : ndarray
        Sorted class labels seen during ``fit``.
    n_features_in_ : int
        Number of columns of the training matrix.
    data_by_class : dict
        Class label -> list of the training rows of that class.
    class_prior : dict
        Class label -> relative frequency of the class in ``y``.
    feature_lkh : dict
        Class label -> ``{'mu', 'sigma', 'cls_length'}`` with the per-feature
        mean, (smoothed) standard deviation and the number of rows.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing
        self.data_by_class = dict()
        self.class_prior = dict()
        self.feature_lkh = dict()

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/standard deviations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_features_in_ = X.shape[1]
        self.X_ = X
        self.y_ = y

        # Start from a clean state so that calling fit() again on the same
        # instance does not mix the new data with the previous training set.
        self.data_by_class = dict()
        self.class_prior = dict()
        self.feature_lkh = dict()

        # Variance floor shared by all classes and features.
        epsilon = self.var_smoothing * np.var(X, axis=0).max()

        n_samples = X.shape[0]
        for cls in self.classes_:
            rows = X[y == cls]
            self.data_by_class[cls] = list(rows)
            self.class_prior[cls] = rows.shape[0] / n_samples
            self.feature_lkh[cls] = {
                'mu': rows.mean(axis=0),
                'sigma': np.sqrt(rows.var(axis=0) + epsilon),
                'cls_length': rows.shape[0],
            }

        # Return the classifier
        return self

    def _validate_for_predict(self, X):
        """Check that the model is fitted and that X is 2-D with the fitted width."""
        check_is_fitted(self)
        X = check_array(X)
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                'X has {0} features, but MyGaussianNB was fitted with {1} features.'.format(
                    X.shape[1], self.n_features_in_))
        return X

    def _joint_log_likelihood(self, X):
        """Return an (n_samples, n_classes) array of log P(class) + log P(x | class).

        Working with sums of logs instead of a product of densities avoids the
        underflow to 0 that occurs with many features or far-away samples.
        Columns follow the order of ``self.classes_``.
        """
        jll = np.empty((X.shape[0], len(self.classes_)))
        for col, cls in enumerate(self.classes_):
            stats = self.feature_lkh[cls]
            variance = stats['sigma'] ** 2
            log_pdf = -0.5 * (np.log(2.0 * np.pi * variance) + (X - stats['mu']) ** 2 / variance)
            jll[:, col] = np.log(self.class_prior[cls]) + log_pdf.sum(axis=1)
        return jll

    def predict(self, X):
        """Predict a class label for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels; a single-row input yields a one-element array.
        """
        X = self._validate_for_predict(X)
        jll = self._joint_log_likelihood(X)
        return self.classes_[np.argmax(jll, axis=1)]

    def predict_proba(self, X):
        """Return posterior class probabilities for every row of X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Rows sum to 1; columns follow the order of ``self.classes_``.
        """
        X = self._validate_for_predict(X)
        jll = self._joint_log_likelihood(X)
        # Subtract the row maximum before exponentiating (log-sum-exp trick)
        # so that the largest term is exp(0) and the normaliser is never 0.
        jll -= jll.max(axis=1, keepdims=True)
        proba = np.exp(jll)
        proba /= proba.sum(axis=1, keepdims=True)
        return proba

# Testing MyGaussianNB Classifier with 4 Different Datasets 

In [3]:
# Load the penguins dataset (the CSV file on disk is named 'penguines.csv')
# and print its (rows, columns) shape.
penguins_af = pd.read_csv('penguines.csv')
print(penguins_af.shape)

(333, 8)


In [4]:
# Load the diabetes dataset and print its (rows, columns) shape.
diabetes = pd.read_csv('diabetes.csv')
print(diabetes.shape)

(768, 9)


In [5]:
# Load the glassV2 dataset.
glassV2 = pd.read_csv('glassV2.csv')
# Every 0.0 in the frame is turned into NaN so that it is treated as a
# missing value from here on.
glassV2 = glassV2.replace({0.0:np.nan})
print(glassV2.shape)

(205, 10)


In [6]:
# Load the wine dataset and print its (rows, columns) shape.
wine = pd.read_csv('wine.csv')
print(wine.shape)


(178, 14)


In [7]:
# Clean the data and address the missing values.

# penguins dataset: keep the four numeric measurements plus the class label
f_names = ['bill_length_mm', 'bill_depth_mm','flipper_length_mm', 'body_mass_g']
penguins = penguins_af[f_names + ['species']]
print(penguins.head())
print(penguins[penguins.columns[-1]].value_counts())

# diabetes dataset
# Looks good, no need for cleaning
print(diabetes.head())
print(diabetes[diabetes.columns[-1]].value_counts())



# glassV2 dataset
# Quite a few missing values appear as 0 in this dataset, so some preprocessing is needed:
# the 0s are replaced with nulls (np.nan) and the missing values are then imputed.
glassV2['Type'] = ['type_' + str(T) for T in glassV2['Type']]
# DataFrame.replace returns a new frame; the result has to be assigned back,
# otherwise the statement has no effect.
glassV2 = glassV2.replace({0.0: np.nan})
# Every column except the last one (the class label) is a feature.
glass_features = glassV2.columns[:-1]
glassV2[glass_features] = KNNImputer(n_neighbors=2).fit_transform(glassV2[glass_features])
# NOTE(review): KNNImputer should already have filled the NaNs; this mean
# imputation looks like a fallback -- confirm whether it is still needed.
glassV2[glass_features] = SimpleImputer(strategy='mean').fit_transform(glassV2[glass_features])


print(glassV2.head())
print(glassV2[glassV2.columns[-1]].value_counts())


# wine dataset
# Looks good, no need for cleaning
print(wine.head())
print(wine[wine.columns[-1]].value_counts())

   bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g species
0            39.1           18.7              181.0       3750.0  Adelie
1            39.5           17.4              186.0       3800.0  Adelie
2            40.3           18.0              195.0       3250.0  Adelie
4            36.7           19.3              193.0       3450.0  Adelie
5            39.3           20.6              190.0       3650.0  Adelie
Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64
   preg  plas  pres  skin  insu  mass   pedi  age          neg_pos
0     6   148    72    35     0  33.6  0.627   50  tested_positive
1     1    85    66    29     0  26.6  0.351   31  tested_negative
2     8   183    64     0     0  23.3  0.672   32  tested_positive
3     1    89    66    23    94  28.1  0.167   21  tested_negative
4     0   137    40    35   168  43.1  2.288   33  tested_positive
tested_negative    500
tested_positive    268
Name: neg_pos, dtype: int64
       

In [8]:
# Compare the MyGaussianNB implementation with the scikit-learn GaussianNB classifier to see whether MyGaussianNB
# generates the same results as its scikit-learn version. Other classifier models (Support Vector Machine, KNN and
# Decision Tree) are included as a comparison across the 4 different datasets.

datasets = {'penguins': penguins, 'diabetes':diabetes,'wine':wine,'glassV2':glassV2 }

dataset_names = list(datasets.keys())

# Maps '<dataset>_<classifier>' -> [accuracy, precision, recall, f1, mean CV accuracy]
classifier_comparison_table = dict()

cross_validation_folds = 10

for dataset_name in dataset_names:

    # Work on a copy: pop() below removes the label column from the frame.
    dataset = datasets[dataset_name].copy()
    feature_names = dataset.columns
    # The last column holds the class label, everything else is a feature.
    y = dataset.pop(dataset.columns[-1]).values
    X_raw = dataset.values

    # The split uses a fixed random_state, so it is the same for every
    # classifier; compute it once per dataset instead of once per model.
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=0)

    my_gnb = MyGaussianNB()
    sk_gnb = GaussianNB()

    svm = SVC(kernel = 'linear',C=1)
    kNN = KNeighborsClassifier(n_neighbors=3)
    dtree = DecisionTreeClassifier(criterion='entropy')

    models = {'My_GNB':my_gnb, 'SKL_GNB':sk_gnb, 'KNN':kNN, 'SVM':svm, 'DecisionTree':dtree}
    classifier_names = list(models.keys())

    scalers = {'MinMax_Scaler':MinMaxScaler(), 'Standard_Scaler': StandardScaler()}

    for classifier_name in classifier_names:

        print('\n\n\n------------- Training the classifier on the "{0}" dataset with the "{1}" model ---------------'.format(dataset_name,classifier_name))

        # Scale the features, then classify.
        steps = [('scaler', scalers['MinMax_Scaler']), ('gaussian_naive_classifier', models[classifier_name])]
        pipeline = Pipeline(steps)

        # Train the model on the training set and predict the classes of the test set
        fitted_model = pipeline.fit(X_train, y_train)
        y_pred = fitted_model.predict(X_test)

        # accuracy score
        accuracy = accuracy_score(y_test, y_pred)
        print('\nDataset: | {0} |, Model: | {1} |, \n\nAccuracy: {2:.2f}'.format(dataset_name, classifier_name, accuracy))

        # precision score (macro average: every class counts equally)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        print('Precision: {0:.2f}'.format(precision))

        # recall score
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        print('Recall: {0:.2f}'.format( recall))

        # f1 score
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        print('f1: {0:.2f}'.format(f1))

        # confusion matrix
        confusion = confusion_matrix(y_test, y_pred)
        print('\nConfusion matrix:\n{0}'.format(confusion))

        # cross validation on the full dataset
        cv_scores = cross_val_score(pipeline, X_raw, y, scoring = 'accuracy', cv=cross_validation_folds, n_jobs=-1)
        mean_cv_accuracy = np.mean(cv_scores)
        print('\n{0}-fold cross validation score ("Accuracy") : {1:.2f}'.format(cross_validation_folds, mean_cv_accuracy) )

        # Each dataset/classifier pair is visited exactly once, so the column
        # can be assigned directly.
        col_label = dataset_name + '_' + classifier_name
        classifier_comparison_table[col_label] = [accuracy, precision, recall, f1, mean_cv_accuracy]
        
        




------------- Training the classifier on the "penguins" dataset with the "My_GNB" model ---------------

Dataset: | penguins |, Model: | My_GNB |, 

Accuracy: 0.99
Precision: 0.99
Recall: 0.97
f1: 0.98

nConfusion matrix:
[[39  0  0]
 [ 1  9  0]
 [ 0  0 18]]

10-fold cross validation score ("Accuracy") : 0.97



------------- Training the classifier on the "penguins" dataset with the "SKL_GNB" model ---------------

Dataset: | penguins |, Model: | SKL_GNB |, 

Accuracy: 0.99
Precision: 0.99
Recall: 0.97
f1: 0.98

nConfusion matrix:
[[39  0  0]
 [ 1  9  0]
 [ 0  0 18]]

10-fold cross validation score ("Accuracy") : 0.97



------------- Training the classifier on the "penguins" dataset with the "KNN" model ---------------

Dataset: | penguins |, Model: | KNN |, 

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
f1: 1.00

nConfusion matrix:
[[39  0  0]
 [ 0 10  0]
 [ 0  0 18]]

10-fold cross validation score ("Accuracy") : 0.99



------------- Training the classifier on the "penguins" dat

In [9]:
# Turn the collected scores into a table: one column per dataset/classifier
# pair, one row per metric (in the order the scores were stored).
classifier_comparison_table = pd.DataFrame(
    classifier_comparison_table,
    index=['accuracy', 'precision', 'recall', 'f1_score', 'cv_accuracy'],
)
classifier_comparison_table

Unnamed: 0,penguins_My_GNB,penguins_SKL_GNB,penguins_KNN,penguins_SVM,penguins_DecisionTree,diabetes_My_GNB,diabetes_SKL_GNB,diabetes_KNN,diabetes_SVM,diabetes_DecisionTree,wine_My_GNB,wine_SKL_GNB,wine_KNN,wine_SVM,wine_DecisionTree,glassV2_My_GNB,glassV2_SKL_GNB,glassV2_KNN,glassV2_SVM,glassV2_DecisionTree
accuracy,0.985075,0.985075,1.0,0.985075,1.0,0.792208,0.792208,0.772727,0.818182,0.733766,0.916667,0.916667,0.972222,1.0,0.944444,0.560976,0.560976,0.609756,0.463415,0.756098
precision,0.991667,0.991667,1.0,0.991667,1.0,0.756128,0.756128,0.732143,0.797182,0.695153,0.910714,0.910714,0.952381,1.0,0.931548,0.582857,0.582857,0.617778,0.511905,0.835128
recall,0.966667,0.966667,1.0,0.966667,1.0,0.74309,0.74309,0.717141,0.755816,0.712965,0.9375,0.9375,0.979167,1.0,0.955357,0.594505,0.594505,0.532601,0.422527,0.745604
f1_score,0.978237,0.978237,1.0,0.978237,1.0,0.748828,0.748828,0.723462,0.770833,0.700971,0.917654,0.917654,0.963606,1.0,0.94118,0.569986,0.569986,0.554437,0.405815,0.766376
cv_accuracy,0.969964,0.969964,0.988057,0.981996,0.973084,0.756494,0.756494,0.739542,0.768216,0.696565,0.977778,0.977778,0.94902,0.983007,0.898366,0.47381,0.47381,0.713095,0.551667,0.614286


# Comments on Performance Comparison

1. The performance metrics being used are: 1) Accuracy; 2) Precision; 3) Recall; 4) f1 score; 5) confusion matrix; 6) 10-fold cross-validation for average accuracy.

2. The MyGaussianNB implementation generates exactly the same confusion matrices and performance scores (accuracy, precision, recall, f1 score, and 10-fold cross-validation average accuracy) as the scikit-learn GaussianNB implementation, which means that MyGaussianNB performs identically to the scikit-learn GaussianNB version on these datasets.

3. In terms of different models, the 10-fold cross-validation average accuracy indicates that: 1) the KNN and SVM methods have the best performance for the penguins dataset, slightly better than the Gaussian Naive Bayes method; 2) the SVM has the best performance for the diabetes dataset, with the Gaussian Naive Bayes model a close second; 3) the SVM has the best performance for the wine dataset, but it is only marginally better than the Gaussian Naive Bayes method; 4) KNN outperforms the other models for the glassV2 dataset (the Decision Tree scores highest on the single held-out test split), and the Gaussian Naive Bayes model is the weakest of all the classifiers on this dataset.