In [2]:
import numpy as np

In [3]:
def _softmax(logits):
    logits = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(logits)
    return exps/np.sum(exps, axis=1, keepdims=True)

def _to_int_labels(y):
    y = np.asarray(y).astype(int)
    if y.ndim!=1:
        y.ravel()
    return y

In [8]:
class NaiveBayes:
    """Naive Bayes classifier; only the Gaussian likelihood is implemented.

    Parameters
    ----------
    model_type : str, default "gaussian"
        Likelihood model. Only "gaussian" is handled by fit/predict.
    var_smoothing : float, default 1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every class variance for numerical stability.
    alpha : float, default 1.0
        Stored on the instance; not used by the Gaussian model.
    """

    def __init__(self, model_type="gaussian", var_smoothing=1e-9, alpha = 1.0):
        self.model_type = model_type
        self.var_smoothing = var_smoothing
        self.alpha = alpha
        self.classes = None            # sorted unique labels seen in fit
        self.class_priori_log = None   # log prior per class, shape (k,)
        self.theta_ = None             # per-class feature means, shape (k, d)
        self.var_ = None               # per-class feature variances, shape (k, d)
        self.feature_log_prob = None   # never assigned by the Gaussian model

    def fit(self, train_input, train_output):
        """Estimate class priors and per-class Gaussian parameters.

        Parameters
        ----------
        train_input : array-like of shape (n_samples, n_features)
        train_output : array-like with n_samples labels (flattened to 1-D)

        Returns
        -------
        NaiveBayes
            The fitted instance (``self``).
        """
        train_input = np.asarray(train_input)
        # Flatten so column-vector labels index rows correctly below.
        train_output = np.asarray(train_output).ravel()
        self.classes, out_indices = np.unique(train_output, return_inverse=True)
        k = len(self.classes)
        n_feature = train_input.shape[1]
        class_counts = np.bincount(out_indices, minlength=k)
        prior = class_counts/class_counts.sum()
        # Small constant keeps the log finite should a prior ever be zero.
        self.class_priori_log = np.log(prior+1e-15)

        if self.model_type == "gaussian":
            self.theta_ = np.zeros((k, n_feature))
            self.var_ = np.zeros((k, n_feature))
            for k_idx in range(k):
                train_input_k = train_input[out_indices == k_idx]
                self.theta_[k_idx] = train_input_k.mean(axis=0)
                self.var_[k_idx] = train_input_k.var(axis=0)
            # A feature that is constant within a class has variance 0, which
            # would give log(0) and 0/0 in the likelihood. Add a small share
            # of the largest overall feature variance to every entry.
            epsilon = self.var_smoothing * train_input.var(axis=0).max()
            self.var_ += epsilon
        return self

    def _joint_log_likelihood(self, X):
        """Return log P(y=k) + log P(x | y=k) for every sample and class.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)

        Raises
        ------
        ValueError
            If ``model_type`` is not "gaussian".
        """
        if self.model_type != "gaussian":
            # Previously this fell through and returned None, which only
            # failed later with an unrelated-looking error.
            raise ValueError(
                f"Unsupported model_type {self.model_type!r}; only 'gaussian' is implemented."
            )
        X = np.asarray(X)
        K = len(self.classes)
        log_prob = []
        for k in range(K):
            mu, var = self.theta_[k], self.var_[k]
            # Log of the normal density, split into normaliser and exponent.
            a = -0.5 * np.log(2.0*np.pi*var)
            b = -0.5 * ((X-mu)**2/var)
            # Features are treated as independent: sum their log densities.
            log_px_given_y = np.sum(a+b, axis=1)
            log_prob.append(self.class_priori_log[k] + log_px_given_y)
        return np.column_stack(log_prob)

    def predict_prob(self, X):
        """Return class probabilities, shape (n_samples, n_classes).

        Computed as the row-wise softmax of the joint log-likelihood.
        """
        ll = self._joint_log_likelihood(X)
        return _softmax(ll)

    def predict(self, X):
        """Return the label with the highest joint log-likelihood per sample."""
        ll = self._joint_log_likelihood(X)
        return self.classes[np.argmax(ll, axis=1)]

    def accuracy(self, X, y):
        """Return the percentage (0-100) of samples in X predicted as y.

        Labels in ``y`` are cast to int before the comparison.
        """
        y = _to_int_labels(y)
        return np.mean(self.predict(X) == y)*100.0

In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load scikit-learn's breast-cancer dataset as a feature matrix and label vector.
data = load_breast_cancer()
X, y = data.data, data.target

# Split BEFORE scaling so the test rows never influence the scaler's
# mean/std (fitting the scaler on the full data leaks test statistics).
# X itself is left unscaled; only the split arrays are standardised.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(Xtr)
Xtr = scaler.transform(Xtr)
Xte = scaler.transform(Xte)

nb = NaiveBayes(model_type="gaussian")
nb.fit(Xtr, ytr)
print("Train Accuracy:", nb.accuracy(Xtr, ytr))
print("Test Accuracy:", nb.accuracy(Xte, yte))


Train Accuracy: 93.62637362637362
Test Accuracy: 96.49122807017544
