In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Load the mushroom dataset (path is relative to the working directory)
# and preview the first rows.
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
# The data is in textual (categorical) form, so every column is converted
# to integer codes with a label encoder before modelling.
en = LabelEncoder()
data = df.apply(en.fit_transform).to_numpy()
data, data.shape

(array([[1, 5, 2, ..., 2, 3, 5],
        [0, 5, 2, ..., 3, 2, 1],
        [0, 0, 2, ..., 3, 2, 3],
        ...,
        [0, 2, 2, ..., 0, 1, 2],
        [1, 3, 3, ..., 7, 4, 2],
        [0, 5, 2, ..., 4, 1, 2]]),
 (8124, 23))

In [None]:
# Column 0 of the encoded array is the target; every other column is a feature.
X, y = data[:, 1:], data[:, 0]
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Custom Naive Bayes for Categorical Features

In [14]:
class CustomNaiveBayes:
    """Naive Bayes classifier for categorical (e.g. label-encoded) features.

    For a sample ``x`` with feature values ``x_1 .. x_d`` the predicted class
    is the ``y`` that maximises the (unnormalised) posterior

        p(y | x)  is proportional to  p(y) * prod_i p(x_i | y)

    where ``p(y)`` is the relative frequency of class ``y`` in the training
    labels and ``p(x_i | y)`` is the relative frequency of value ``x_i`` in
    feature column ``i`` among the training rows of class ``y``.

    Parameters
    ----------
    alpha : float, default 0.0
        Additive (Laplace) smoothing strength. With the default of 0 the
        probabilities are plain relative frequencies, so a feature value never
        seen together with a class gives that class zero posterior. A positive
        ``alpha`` replaces ``count / n`` by
        ``(count + alpha) / (n + alpha * n_categories)``, where
        ``n_categories`` is the number of distinct values the feature takes
        in the training data.

    Attributes set by ``fit``
    -------------------------
    X_train, y_train : the training data as numpy arrays.
    classes_ : sorted array of the distinct training labels.

    Notes
    -----
    ``fit`` caches per-class statistics; call ``fit`` again if the training
    data changes.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha

    def fit(self, X, y):
        """Store the training data and pre-compute per-class statistics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If the shapes are inconsistent or the training set is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be a 1-D array with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")

        self.X_train = X
        self.y_train = y

        # Everything below is invariant across predictions, so it is computed
        # once here instead of once per (test point, class, feature).
        self.classes_ = np.unique(y)
        self._class_rows = [X[y == label] for label in self.classes_]
        self._log_priors = np.log(
            [rows.shape[0] / y.shape[0] for rows in self._class_rows]
        )
        self._n_categories = np.array(
            [np.unique(X[:, col]).size for col in range(X.shape[1])]
        )

    def _smooth(self, counts, n_rows, n_categories):
        """Turn match counts into (optionally smoothed) probabilities."""
        return (counts + self.alpha) / (n_rows + self.alpha * n_categories)

    def prior_prob(self, label):
        """Prior probability p(y): fraction of training labels equal to ``label``."""
        return np.sum(self.y_train == label) / self.y_train.shape[0]

    def conditional_prob(self, label, feature_col, feature_val):
        """Conditional probability p(x_i = feature_val | y = label).

        Computed over the training rows whose label equals ``label``; smoothed
        with ``alpha`` when it is positive.
        """
        X_mod = self.X_train[self.y_train == label]
        count = np.sum(X_mod[:, feature_col] == feature_val)
        return self._smooth(count, X_mod.shape[0], self._n_categories[feature_col])

    def predict_point(self, X_test):
        """Return the most probable class label for a single sample.

        ``X_test`` is a 1-D sequence of feature values; only the first
        ``n_features`` entries are used.
        """
        point = np.asarray(X_test)[: self.X_train.shape[1]]
        log_post = np.empty(len(self._class_rows))
        # Work in log space: a product of many probabilities underflows to 0.0
        # for every class on wide data, which would make argmax meaningless.
        # log(0) = -inf is the intended score for an impossible class, so the
        # divide-by-zero warning it triggers is silenced.
        with np.errstate(divide="ignore"):
            for i, rows in enumerate(self._class_rows):
                # Per-feature count of class rows matching the sample's value.
                counts = np.sum(rows == point, axis=0)
                probs = self._smooth(counts, rows.shape[0], self._n_categories)
                log_post[i] = self._log_priors[i] + np.sum(np.log(probs))
        # Map the winning position back to the actual class label; the bare
        # argmax index is only correct when labels happen to be 0..k-1.
        return self.classes_[np.argmax(log_post)]

    def predict(self, X):
        """Return an array with the predicted class label for every row of ``X``."""
        return np.array([self.predict_point(point) for point in X])

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of rows whose prediction equals ``y_test``."""
        ans = self.predict(X_test)
        return (np.asarray(y_test) == ans).mean()

In [15]:
# Create the classifier and fit it on the training split.
model = CustomNaiveBayes()
model.fit(X_train,y_train)

In [16]:
# Quick sanity check: predictions for the first five test rows.
model.predict(X_test[:5])

array([0, 1, 1, 0, 1], dtype=int64)

In [17]:
# Accuracy on the held-out test split (fraction of predictions equal to y_test).
model.score(X_test,y_test)

0.9973890339425587