In [97]:
import numpy as np
import pandas as pd
import math
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.base import BaseEstimator, ClassifierMixin
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.model_selection import cross_val_score

1. Extend the Gaussian Naive Bayes code so that it handles missing values. Gaussian Naive Bayes can handle missing values in training by calculating conditional probabilities on the values that are present. You may choose to put a limit on the
number of missing values allowed. Your code should also handle missing values on any test data. The easiest way to do
this is to leave features with missing values out of the posterior probability calculation.

In [None]:
# Mixins are listed before BaseEstimator, as the scikit-learn developer guide
# recommends, so that the classifier-specific behaviour of ClassifierMixin is
# resolved first in the MRO.
class MyGaussianNB(ClassifierMixin, BaseEstimator):
    """Gaussian Naive Bayes classifier that tolerates missing values (NaN).

    Training estimates each class-conditional mean and variance from the
    values that are present, ignoring NaNs. At prediction time a feature that
    is NaN in a sample is left out of that sample's posterior calculation.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Constant added to every estimated variance so that a feature which is
        constant within a class does not cause a division by zero.

    Attributes (set by ``fit``)
    ---------------------------
    priors : dict
        Class label -> fraction of training rows carrying that label.
    mus, sig_sqs : dict
        Class label -> array of per-feature means / smoothed variances. An
        entry is NaN when the class has no observed value for that feature.
    classes_ : ndarray
        Class labels in order of first appearance in the training labels.
    n_feat : int
        Number of features seen during ``fit``.
    """

    def __init__(self, var_smoothing=1e-9):
        # Stored unmodified so that get_params()/clone() (used by
        # cross_val_score) can reproduce the estimator.
        self.var_smoothing = var_smoothing

    def fit(self, Xt, yt):
        """Estimate priors and per-class Gaussian parameters, ignoring NaNs.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Training features; missing entries are NaN.
        yt : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``Xt`` is not 2-D, is empty, or disagrees with ``yt`` in length.
        """
        Xt = np.asarray(Xt, dtype=float)
        yt = np.asarray(yt)
        if Xt.ndim != 2:
            raise ValueError("Xt must be 2-dimensional (n_samples, n_features)")
        if Xt.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        if yt.shape[0] != Xt.shape[0]:
            raise ValueError("Xt and yt must have the same number of rows")

        # Xt / yt are kept on the instance only for backward compatibility
        # with code that inspected them; the model itself does not need them.
        self.Xt = Xt
        self.yt = yt
        self.n_feat = Xt.shape[1]
        self.mus = {}
        self.sig_sqs = {}
        self.priors = {}

        c_dict = Counter(yt)
        for c, n_c in c_dict.items():
            self.priors[c] = n_c / Xt.shape[0]

            X_c = Xt[yt == c]                 # the rows for this class label
            observed = ~np.isnan(X_c)
            n_obs = observed.sum(axis=0)      # per-feature count of present values

            # NaN-aware mean and (population) variance. A feature with no
            # observed value in this class gives 0/0 -> NaN, which is the
            # marker used later to leave that feature out.
            with np.errstate(invalid="ignore", divide="ignore"):
                mu = np.where(observed, X_c, 0.0).sum(axis=0) / n_obs
                sq_dev = np.where(observed, (X_c - mu) ** 2, 0.0)
                var = sq_dev.sum(axis=0) / n_obs

            self.mus[c] = mu
            # Smoothing is added to the variance itself; adding it to the data
            # before taking the variance would leave the variance unchanged.
            self.sig_sqs[c] = var + self.var_smoothing

        self.classes_ = np.array(list(self.priors))
        return self

    def _joint_log_likelihood(self, X):
        """Return ``(labels, scores)`` with ``scores[i, k]`` the unnormalised
        log posterior of row ``i`` of ``X`` under class ``labels[k]``."""
        labels = list(self.priors)
        mus = np.array([self.mus[c] for c in labels])
        sig_sqs = np.array([self.sig_sqs[c] for c in labels])

        # A feature is only comparable across classes if every class has
        # parameters for it; otherwise it is dropped for all classes.
        usable = ~(np.isnan(mus) | np.isnan(sig_sqs)).any(axis=0)
        present = ~np.isnan(X) & usable
        x = np.where(present, X, 0.0)         # placeholder values are masked out below

        scores = np.empty((X.shape[0], len(labels)))
        for k, c in enumerate(labels):
            mu = np.where(usable, mus[k], 0.0)
            var = np.where(usable, sig_sqs[k], 1.0)
            log_pdf = -0.5 * np.log(2.0 * np.pi * var) - (x - mu) ** 2 / (2.0 * var)
            # Summing logs instead of multiplying densities avoids underflow
            # to zero when there are many features.
            scores[:, k] = np.log(self.priors[c]) + np.where(present, log_pdf, 0.0).sum(axis=1)
        return labels, scores

    def predict(self, Xtes):
        """Predict the most probable class label for every row of ``Xtes``.

        Parameters
        ----------
        Xtes : array-like of shape (n_samples, n_features)
            Samples to classify; missing entries are NaN.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels.

        Raises
        ------
        ValueError
            If ``Xtes`` is non-empty and its shape does not match the
            training data.
        """
        Xtes = np.asarray(Xtes, dtype=float)
        self.Xtes = Xtes                      # retained for backward compatibility
        if Xtes.shape[0] == 0:
            return np.array([])
        if Xtes.ndim != 2 or Xtes.shape[1] != self.n_feat:
            raise ValueError(
                "Xtes must have shape (n_samples, %d)" % self.n_feat)

        labels, scores = self._joint_log_likelihood(Xtes)
        # argmax returns the first maximum, so ties go to the class that
        # appeared first in the training labels.
        return np.array([labels[k] for k in scores.argmax(axis=1)])

    def predict_single(self, x_single):
        """Predict the most probable class label for one sample.

        Parameters
        ----------
        x_single : array-like of shape (n_features,)
            Feature vector; missing entries are NaN.

        Returns
        -------
        The class label with the largest posterior.

        Raises
        ------
        ValueError
            If the sample does not have the number of features seen in ``fit``.
        """
        row = np.asarray(x_single, dtype=float).reshape(1, -1)
        if row.shape[1] != self.n_feat:
            raise ValueError("x_single must have %d features" % self.n_feat)
        labels, scores = self._joint_log_likelihood(row)
        return labels[int(scores[0].argmax())]
    

In [None]:
def fidelity_tests(X, y, nreps=10):
    """Compare MyGaussianNB with scikit-learn's GaussianNB on imputed data.

    For each of ``nreps`` random 50/50 train/test splits (seeded 1..nreps) the
    features are standardised, then three models are fitted:

    * ``GaussianNB`` on mean-imputed data (``SimpleImputer``),
    * ``GaussianNB`` on ``IterativeImputer``-imputed data,
    * ``MyGaussianNB`` on the scaled data with its NaNs left in place.

    For each imputation method it prints the agreement between the two models'
    test predictions and each model's test accuracy, followed by the mean
    10-fold cross-validation score of each model on the training split.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; missing entries are NaN.
    y : array-like of shape (n_samples,)
        Class labels.
    nreps : int, default=10
        Number of random splits to evaluate.
    """
    for rs in range(1, nreps + 1):
        X_tr_raw, X_ts_raw, y_train, y_test = train_test_split(
            X, y, random_state=rs, test_size=1/2)

        # Scale using statistics of the training split only.
        scale = StandardScaler()
        X_train = scale.fit_transform(X_tr_raw)
        X_test = scale.transform(X_ts_raw)

        # Univariate imputation: column means learned on the training split.
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp.fit(X_train)
        X_train_si = imp.transform(X_train)
        X_test_si = imp.transform(X_test)

        # Multivariate imputation.
        imp_it = IterativeImputer(missing_values=np.nan, max_iter = 11, random_state=0)
        imp_it.fit(X_train)
        X_train_it = imp_it.transform(X_train)
        X_test_it = imp_it.transform(X_test)

        gnb_Simple = GaussianNB()
        mgnb = MyGaussianNB()
        gnb_Iterative = GaussianNB()

        gnb_Simple.fit(X_train_si,y_train)
        mgnb.fit(X_train,y_train)
        gnb_Iterative.fit(X_train_it,y_train)

        # Predict once per model and reuse the result for every metric below.
        pred_simple = gnb_Simple.predict(X_test_si)
        pred_iterative = gnb_Iterative.predict(X_test_it)
        pred_mgnb = mgnb.predict(X_test)
        mgnb_acc = accuracy_score(y_test, pred_mgnb)

        # Agreement between the two models, then each model's accuracy.
        ascore = accuracy_score(pred_simple, pred_mgnb)
        gnb_acc_simple = accuracy_score(y_test, pred_simple)

        print ("Run after Univariate Imputing: %d Score: %.2f SK acc: %.2f My acc: %.2f" % (rs, ascore, gnb_acc_simple, mgnb_acc))

        ascore = accuracy_score(pred_iterative, pred_mgnb)
        gnb_acc_it = accuracy_score(y_test, pred_iterative)

        print ("Run after Multivariate Imputing: %d Score: %.2f SK acc: %.2f My acc: %.2f" % (rs, ascore, gnb_acc_it, mgnb_acc))

        # NOTE(review): the scaler and imputers were fitted on the whole
        # training split, so the CV folds below are not fully independent of
        # the preprocessing. Wrapping preprocessing and model in a pipeline
        # would remove that leakage.
        My_scores = cross_val_score(mgnb, X_train, y_train, cv=10)
        print("CV scores for MyGNB: %.2f" %(My_scores.mean()))

        Simple_gnb_scores = cross_val_score(gnb_Simple, X_train_si, y_train, cv=10)
        print("CV scores for Simple imputed GNB SK: %.2f" %(Simple_gnb_scores.mean()))

        Iterative_gnb_scores = cross_val_score(gnb_Iterative, X_train_it, y_train, cv=10)
        print("CV scores for Iterative imputed GNB SK:%.2f" %(Iterative_gnb_scores.mean()))
        

2. Test the performance of your implementation against the scikit-learn `GaussianNB` using missing value imputation. Test two imputation options, one univariate and one multi-variate. To help with your evaluation two versions of the penguins datasets with missing values are provided, one with 20% missing and the other with 40%.

## Penguins Dataset (20% Missing Values)

In [None]:
# Load the 20%-missing penguins data; '?' marks a missing value in the file.
penguins_20 = (
    pd.read_csv('PenguinsMV0.2.csv', index_col=0)
    .replace('?', np.nan)
)
print(penguins_20.shape)
penguins_20.head()

In [None]:
# Select rather than pop(): pop() removes 'species' from the frame in place,
# so re-running this cell would raise a KeyError.
y = penguins_20['species'].values
X_raw = penguins_20.drop(columns='species').values

In [None]:
# Run the repeated split / impute / compare experiment on the 20%-missing data.
fidelity_tests(X_raw, y)

## Findings
`MyGaussianNB` leaves missing values out of the probability calculation, whereas the scikit-learn `GaussianNB` is given data in which every missing value has been imputed (for example, replaced by the column mean under univariate imputation). On this dataset `GaussianNB` is less accurate than `MyGaussianNB` after both simple and iterative imputation. Under cross-validation the two imputation methods give almost equal results, and both are still slightly `less accurate` than `MyGaussianNB`: CV accuracy is 94% for `MyGaussianNB` versus 93% for the imputed `GaussianNB`.

## Penguins Dataset (40% Missing Values)

In [None]:
# Load the 40%-missing penguins data; '?' marks a missing value in the file.
penguins_40 = (
    pd.read_csv('PenguinsMV0.4.csv', index_col=0)
    .replace('?', np.nan)
)
print(penguins_40.shape)
penguins_40.head()

In [None]:
# Select rather than pop(): pop() removes 'species' from the frame in place,
# so re-running this cell would raise a KeyError.
y_40 = penguins_40['species'].values
X_raw_40 = penguins_40.drop(columns='species').values

In [None]:
# Run the repeated split / impute / compare experiment on the 40%-missing data.
fidelity_tests(X_raw_40, y_40)

## Findings:
On this dataset too, `GaussianNB` trained on simple-imputed or iteratively imputed data is less accurate than `MyGaussianNB`, because `MyGaussianNB` leaves missing values out of the probability calculation while `GaussianNB` sees them replaced by imputed values (the column mean in the univariate case). Cross-validation shows the same pattern, with `MyGaussianNB` more accurate than the imputed `GaussianNB` classifiers: 87% for `MyGaussianNB` versus 83% and 84% for univariate and multivariate imputation respectively.

## Conclusion:

Testing the `MyGaussianNB` classifier against the `GaussianNB` implementation in scikit-learn shows that `MyGaussianNB` achieves better accuracy than `GaussianNB` combined with either imputation method, on both Penguins datasets. This outcome is attributed to how missing values are treated: the imputers replace each missing value with an estimate (the column mean in the univariate case), whereas `MyGaussianNB` simply leaves missing values out of the computation of the probabilities.