# Naive Bayes-Mushroom dataset
 ### Goal: predict the class of a mushroom, given some features of the mushroom. We will use a Naive Bayes model for this classification.

# Load the dataset

In [1]:
# Markdown shortcut: Esc + M (switches the selected cell to Markdown)

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Read the mushroom CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('../datasets/Mushrooms/mushrooms.csv')
df.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(8124, 23)

### Encode the categorical data into numerical data

In [5]:
# DataFrame.apply works along axis=0 by default, i.e. it hands the callable
# one column at a time. fit_transform therefore re-fits the encoder on every
# column and replaces that column's values with integer codes.
le = LabelEncoder()
ds = df.apply(le.fit_transform, axis=0)

In [6]:
# Confirm what apply() returned, then preview the encoded rows.
print(type(ds))
ds.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [7]:
# Pull the encoded frame out as a plain NumPy matrix and inspect it.
data = ds.values
print(data.shape)
print(type(data))
print(data[:5])

# Column 0 is the encoded `type` target; the remaining columns are features.
data_y, data_x = data[:, 0], data[:, 1:]


(8124, 23)
<class 'numpy.ndarray'>
[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


### Break the data into train and test

In [8]:
# Hold out 20% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every prediction and accuracy figure computed from this split,
# reproducible across kernel restarts.
x_train,x_test,y_train,y_test=train_test_split(data_x,data_y,test_size=0.2,random_state=42)

In [9]:
# Shapes of the feature matrices, then of the label vectors, for each split.
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)

(6499, 22) (1625, 22)
(6499,) (1625,)


In [10]:
# Distinct class labels present in the training split.
np.unique(y_train)

array([0, 1])

# Building Naive Bayes Classifier!

In [11]:
# Comparing an array with a scalar yields a boolean mask; summing the mask
# counts the matches and works faster than a for() loop.
a = np.array([0, 0, 0, 1, 1, 0, 1, 0, 1, 0])
b = (a == 1).sum()
print(b)

4


In [12]:
def prior_prob(y_train,label):
    
    total_examples=y_train.shape[0]
    class_examples=np.sum(y_train==label)
    
    return (class_examples/total_examples)
    
    

In [13]:
# Sanity check for prior_prob (uncomment to run); 4 of the 10 labels are 1, so it should give 0.4
#y=np.array([0,5,5,1,1,1,1,0,0,0])
#prior_prob(y,1)

In [14]:
def cond_prob(x_train,y_train,feature_col,feature_val,label,alpha=0.0):
    """Estimate P(feature == feature_val | class == label) from training counts.

    Parameters
    ----------
    x_train : 2-D array, one row per training example.
    y_train : 1-D array of class labels aligned with the rows of x_train.
    feature_col : column index of the feature being examined.
    feature_val : value of that feature to look for.
    label : class whose training rows are considered.
    alpha : non-negative additive (Laplace) smoothing strength. The default
        0.0 returns the raw relative frequency. A positive value adds alpha
        to the count of every distinct value seen for this feature in
        x_train, so a value never observed with this class no longer gets
        probability exactly 0.

    Returns
    -------
    The estimated conditional probability, or 0.0 when there is nothing to
    estimate from (no training row carries this label and alpha is 0).
    """
    class_rows=x_train[y_train==label]
    numerator=np.sum(class_rows[:,feature_col]==feature_val)+alpha
    # Rows kept by the boolean mask == number of examples of this class.
    denominator=class_rows.shape[0]

    if alpha:
        # Spread the smoothing mass over every value this feature takes.
        n_values=np.unique(x_train[:,feature_col]).size
        denominator+=alpha*n_values

    if denominator==0:
        # Avoid 0/0 (nan), which would poison any later argmax.
        return 0.0

    return numerator/float(denominator)
    

### Next Step: Compute Posterior Prob for each test example and make predictions

In [15]:
# Distinct class labels in y_train; predict() loops over exactly these values.
np.unique(y_train)

array([0, 1])

In [16]:
def predict(x_train,y_train,x_test):
    """Predict the class label of a single test point with Naive Bayes.

    Parameters
    ----------
    x_train : 2-D array of training features, one row per example.
    y_train : 1-D array of training labels aligned with x_train.
    x_test : 1-D sequence holding one test point, with one value per
        feature column of x_train.

    Returns
    -------
    The label (an element of np.unique(y_train)) whose unnormalised
    posterior, prior * product of per-feature conditional probabilities,
    is largest. Ties go to the smallest label because np.argmax returns the
    first maximum.
    """
    classes=np.unique(y_train)
    n_features=x_train.shape[1]
    post_probs=[] # One unnormalised posterior per entry of `classes`

    for label in classes:
        # Naive independence assumption: multiply the per-feature conditionals.
        likelihood=1.0
        for f in range(n_features):
            likelihood*=cond_prob(x_train,y_train,f,x_test[f],label)

        prior=prior_prob(y_train,label)
        post_probs.append(likelihood*prior)

    # argmax is a position in `classes`, not a label; map it back so the
    # result is correct even when labels are not exactly 0..k-1.
    return classes[np.argmax(post_probs)]
        
    

In [17]:
# Spot-check: classify one test row and print the prediction, then the true label.
output=predict(x_train,y_train,x_test[1])
print(output)
print(y_test[1])

1
1


In [18]:
def score(x_train,y_train,x_test,y_test):
    """Return the classification accuracy of predict() on a test set.

    Parameters
    ----------
    x_train, y_train : training features and labels, passed through to
        predict() unchanged.
    x_test : 2-D array of test points, one row per example.
    y_test : 1-D array of true labels aligned with the rows of x_test.

    Returns
    -------
    Fraction of test rows whose predicted label equals the true label.
    """
    predictions=[]

    for i in range(x_test.shape[0]):
        pred_label=predict(x_train,y_train,x_test[i])
        # Store the predicted label itself (the list must not be appended to itself).
        predictions.append(pred_label)

    predictions=np.array(predictions)
    accuracy=np.sum(predictions==y_test)/y_test.shape[0]

    return accuracy
    
    
    
    
    

In [19]:
# Number of test rows (and features per row) that score() will loop over.
print(x_test.shape)

(1625, 22)


In [None]:
# Accuracy on the held-out split. score() calls predict() once per test row,
# and each call rescans the training data, so this cell may take a while.
print(score(x_train,y_train,x_test,y_test))