In [2]:
import numpy as np
import pandas as pd

In [3]:
dataset = pd.read_csv('mushrooms.csv')

In [5]:
dataset

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [7]:
# Convert categorical data into numerical data

In [6]:
dataset.apply?

In [8]:
le = LabelEncoder()

In [9]:
converted_data = dataset.apply(le.fit_transform)

In [10]:
converted_data

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [11]:
array_data = converted_data.values

In [12]:
print(array_data.shape)

(8124, 23)


In [13]:
split = int(0.8*array_data.shape[0])
train_X = array_data[:split, 1:]
train_Y = array_data[:split:, 0]

test_X = array_data[split:, 1:]
test_Y = array_data[split:, 0]

In [112]:
print(train_X.shape, train_Y.shape)
print(test_X.shape, test_Y.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [15]:
type(train_Y)

numpy.ndarray

In [16]:
np.unique(train_Y)

array([0, 1])

In [17]:
uni = np.unique(train_X, axis=1)

In [18]:
uni.shape

(6499, 22)

###### Data preprocessing is done

# Building our Naive Bayes Classifier

In [113]:
def prior_prob(Y, label):
    """
    params:
    Y - types of mushroom (vector)
    label - a type of mushroom (int/numberical data)
    return:
        flot - prior probablity of 'label' type 
    """
    count = np.sum(Y==label)
    total = Y.shape[0]
    
    return count/float(total)

In [115]:
prior_prob(train_Y, 1)

0.43052777350361593

In [116]:
def conditional_prob(X, Y, feature_col, feature_value, label):
    filtered_X = X[Y==label]
    
    numerator = np.sum(filtered_X[:,feature_col]==feature_value)
    denominator = np.sum(Y==label) ## filtered_X.size
    
    return numerator/float(denominator)

In [110]:
def likelihood(X, Y, x, label):
    """
    params:
    X - feature matrix
    Y - mushroom types array
    label - type of a mushroom 
    return:
        float - likelihood of X given label
    """
    prob = 1.0
    for i in range(x.shape[0]):
        prob = prob*conditional_prob(X, Y, i, x[i], label)
    return prob

In [58]:
x = train_X[5, :]
prob0 = likelihood(train_X, train_Y, x, 0)
prob1 = likelihood(train_X, train_Y, x, 1)

In [56]:
print(x)

[5 3 9 1 0 1 0 0 5 0 2 2 2 7 7 0 2 1 4 2 2 1]


In [43]:
print(prob0, prob1)

6.704628895113898e-38 0.0


# Compute Posterior probabilities and predict a test point 

In [74]:
def predict(X, Y, x):
    """
    params:
    X, Y - training dataset (n features, m labels)
    x - test point - it has n features
    return:
        label - type of a testpoint (mushroom)
    """
    labels = np.unique(Y)
    features_size = X.shape[1]
    post_probs = []
    for label in labels:
        post_prob = prior_prob(Y, label)*likelihood(X, Y, x, label)
        post_probs.append(post_prob)
    
    pred = np.argmax(post_probs)
    return pred
        

In [103]:
correct = 0

for i in range(test_X.shape[0]):
    output = predict(train_X, train_Y, test_X[i])
    correct += (output == test_Y[i])
    print(str(i) + "/" + str(test_X.shape[0]), end="\r")
    
acc = 100*(correct/test_X.shape[0])

1624/1625

In [104]:
print(acc)

94.0923076923077


In [138]:
def score(X, Y, X_, Y_):
    
    predY_ = []
    for i in range(X_.shape[0]):
        output = predict(X, Y, X_[i])
        predY_.append(output)
        print(str(i+1) + "/" + str(X_.shape[0]), end="\r")
        
    predY_ = np.array(predY_)
    acc = 100*(np.sum(predY_ == Y_)/Y_.shape[0])
    
    print(acc, end="\r")
    return acc, predY_

In [139]:
acc, pred_test_Y = score(train_X, train_Y, test_X, test_Y)

94.0923076923077

In [97]:
print(acc)

94.0923076923077


In [109]:
pred_test_Y

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [117]:
np.unique(train_Y)

array([0, 1])

# Laplace smoothing

In [148]:
def lap_cond_prob(X, Y, i, xi, label):
    
    filtered_X = X[Y==label]
    xi_count = 1 + np.sum(filtered_X[:, i] == xi)
    
    all_count = 0
    uniq = np.unique(filtered_X)
    for i in range(uniq.size):
        all_count += 1 + np.sum(filtered_X[:, i] == uniq[i])
    
    prob = xi_count/float(all_count)
    
    return prob

In [149]:
def lap_likelihood(X, Y, x, label):
    
    prob = 1.0
    for i in range(x.shape[0]):
        prob *= lap_cond_prob(X, Y, i, x[i], label)
        
    return prob

In [150]:
def lap_predict(X, Y, x):
    """
    params:
    X, Y - training dataset (n features, m labels)
    x - test point - it has n features
    return:
        label - type of a testpoint (mushroom)
    """
    labels = np.unique(Y)
    features_size = X.shape[1]
    post_probs = []
    
    for label in labels:
        post_prob = prior_prob(Y, label)*lap_likelihood(X, Y, x, label)
        post_probs.append(post_prob)
    
    pred = np.argmax(post_probs)
    
    return pred
        

In [151]:
def lap_score(X, Y, X_, Y_):
    
    predY_ = []
    for i in range(X_.shape[0]):
        output = lap_predict(X, Y, X_[i])
        predY_.append(output)
        print(str(i+1) + "/" + str(X_.shape[0]), end="\r")
        
    predY_ = np.array(predY_)
    acc = 100*(np.sum(predY_ == Y_)/Y_.shape[0])
    
    print(acc, end="\r")
    return acc

In [152]:
lap_acc = lap_score(train_X, train_Y, test_X, test_Y)

70.46153846153847