In [278]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [185]:
df = pd.read_csv("../data/churn_data.csv", index_col=[0])
X = df.drop(["Exited"], axis=1)
y = df[["Exited"]]

In [257]:
class GaussianNaiveBayes:
    def __init__(self):
        self.__means = None
        self.__stds = None
        self.__class_priors = None
        self.__target = None
        self.__features = None
        self.__fitted = False
    
    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        self.__target = y.columns[0]
        self.__features = list(X.columns)
        data = pd.concat([X, y], axis=1)
        grouped = data.groupby(by=self.__target)
        self.__means = grouped.mean()
        self.__stds = grouped.std()
        self.__class_priors = grouped.size()/len(data)
        self.__classes = list(self.__class_priors.index)
        self.__fitted = True
        return self
            
    def predict(self, X: pd.DataFrame):
        def maximum_a_posteriori(posteriors):
            return max(posteriors, key=posteriors.get) # argmax for dict
        
        assert self.__fitted == True
        assert list(X.columns) == self.__features
        
        return maximum_a_posteriori(self.__posteriors(X))
    
    def predict_proba(self, X: pd.DataFrame):
        assert self.__fitted == True
        assert list(X.columns) == self.__features
        
        posteriors = self.__posteriors(X)
        posteriors_sum = sum(posteriors.values())
        for key, val in posteriors.items():
            posteriors[key] = val/posteriors_sum
    
        return posteriors
    
    def __posteriors(self, X: pd.DataFrame):
        posteriors = {}
        for c in self.__classes:
            posteriors[c] = self.__posterior(c, X)
        return posteriors
    
    def __posterior(self, c: int, X: pd.DataFrame):
        posterior = self.__class_priors[c]
        for feature in X.columns:
            mean = self.__means[feature][c]
            std = self.__stds[feature][c]
            posterior *= GaussianNaiveBayes.gaussian_pdf(X[feature].values[0], mean, std)
        return posterior
        
    @staticmethod
    def gaussian_pdf(x, mu, sigma):
        return (1/np.sqrt(2*np.pi*sigma**2))*np.exp((-(x-mu)**2)/(2*sigma**2))

In [260]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [285]:
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
y_pred = []
for i in X_test.index:
    val = X_test.loc[[i]]
    y_pred.append(gnb.predict_proba(val))

In [283]:
sk = GaussianNB(var_smoothing=0)
sk.fit(X_train, y_train)
y_sk = sk.predict_proba(X_test)

  y = column_or_1d(y, warn=True)
