In [278]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [185]:
df = pd.read_csv("../data/churn_data.csv", index_col=[0])
X = df.drop(["Exited"], axis=1)
y = df[["Exited"]]

In [304]:
class GaussianNaiveBayes:
    def __init__(self):
        self.__means = None
        self.__stds = None
        self.__class_priors = None
        self.__target = None
        self.__features = None
        self.__fitted = False
    
    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        self.__target = y.columns[0]
        self.__features = list(X.columns)
        data = pd.concat([X, y], axis=1)
        grouped = data.groupby(by=self.__target)
        self.__means = grouped.mean()
        self.__stds = grouped.std()
        self.__class_priors = grouped.size()/len(data)
        self.__classes = list(self.__class_priors.index)
        self.__fitted = True
        return self
            
    def predict(self, X: pd.DataFrame):
        def maximum_a_posteriori(posterior_dict):
            return max(posterior_dict, key=posterior_dict.get) # argmax for dict
        
        assert self.__fitted == True
        assert list(X.columns) == self.__features
        
        return np.array(list(map(maximum_a_posteriori, self.__posteriors(X))))
    
    def predict_proba(self, X: pd.DataFrame):
        def likelihood_to_probability(posterior_dict):
            posteriors_sum = sum(posterior_dict.values())
            for key, val in posterior_dict.items():
                posterior_dict[key] = val/posteriors_sum
            return posterior_dict
            
        assert self.__fitted == True
        assert list(X.columns) == self.__features
        
        return np.array(list(map(likelihood_to_probability, self.__posteriors(X))))
    
    def __posteriors(self, X: pd.DataFrame):
        posteriors_by_class = {}
        posteriors_by_row = []
        for c in self.__classes:
            posteriors_by_class[c] = self.__posterior(c, X)
        for i in range(len(X)):
            posteriors_by_row.append({key: array[i] for key, array in posteriors_by_class.items()})
        return posteriors_by_row
    
    def __posterior(self, c: int, X: pd.DataFrame):
        posterior = np.full((len(X),), self.__class_priors[c])
        for feature in X.columns:
            mean = self.__means[feature][c]
            std = self.__stds[feature][c]
            gaussian = lambda x: GaussianNaiveBayes.gaussian_pdf(x, mean, std)
            posterior = posterior * np.array(X[feature].apply(gaussian))
        return posterior
        
    @staticmethod
    def gaussian_pdf(x, mu, sigma):
        return (1/np.sqrt(2*np.pi*sigma**2))*np.exp((-(x-mu)**2)/(2*sigma**2))

In [305]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [310]:
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [312]:
sk = GaussianNB(var_smoothing=0)
sk.fit(X_train, y_train)
y_sk = sk.predict(X_test)

  y = column_or_1d(y, warn=True)
