# Naive Bayes from the scratch

Naive Bayes algorithm is based on Bayes theory on conditional probability. Naive Bayes is one of the simplest algorithms. But it is not very easy to implement. Because to find individual probabilities, you have to separate the different classes, then for each feature in each class, you have to find the probability of each value of feature.

# Import Libraries and Data set

In [61]:
#import numpy and pandas
import numpy as np
import pandas as pd

#import dataset from csv 
mush = pd.read_csv("mushrooms.csv")
#view the dataset once
mush

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [63]:
#print column names 
print(mush.columns)

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [64]:
#there is a column in our dataset which has a special character in it.
mush["stalk-root"]

0       e
1       c
2       c
3       e
4       e
5       c
6       c
7       c
8       e
9       c
10      c
11      c
12      c
13      e
14      e
15      e
16      e
17      e
18      e
19      e
20      c
21      e
22      c
23      c
24      c
25      e
26      c
27      c
28      e
29      b
       ..
8094    ?
8095    c
8096    ?
8097    ?
8098    ?
8099    ?
8100    ?
8101    ?
8102    ?
8103    ?
8104    ?
8105    ?
8106    ?
8107    ?
8108    ?
8109    ?
8110    ?
8111    ?
8112    ?
8113    ?
8114    c
8115    ?
8116    ?
8117    ?
8118    ?
8119    ?
8120    ?
8121    ?
8122    ?
8123    ?
Name: stalk-root, Length: 8124, dtype: object

# Cleaning Data

Since the data consists of missing data with '?' as value. All the missing values are from single column. Lets remove it.

In [65]:
#replace '?' with numpy null value
mush.replace('?',np.nan,inplace=True)
#print the lenth of columns in the dataframe, print the length of columns after dropping null valued columns
print(len(mush.columns),"columns, after dropping NA,",len(mush.dropna(axis=1).columns))

23 columns, after dropping NA, 22


In [67]:
#see the current attributes that we have in our dataset
print(mush.dropna(axis=1).columns)
#the stalk-root column has been dropped

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],
      dtype='object')


Since we will be using this algorithm for classification, let us set the target class first

In [40]:
#set class as target
target = 'class'

In [41]:
#print all the attributes except the target
print(mush.columns[mush.columns != target])
#assign this index array to "features"
features = mush.columns[mush.columns != target]

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [68]:
#see the records in the target column
print(mush[target].unique())
#assign this array to "classes"
classes = mush[target].unique()

['p' 'e']


We have go the classes as 'p' or 'e'. Let's proceed.

# Splitting dataset into test and train

In [72]:
#get a random 30% sample from the mush dataframe, store in "test"
test = mush.sample(frac=0.3)

In [73]:
#drop the test samples and save the rest of the samples in "mush"
mush = mush.drop(test.index)

# Probabilities Calculation

In [75]:
#Here we calculate probabilities and store them in dictionary structure.
'''
dict:
    keys: class
    values: dict:
            keys: feature
            values: dict:
                    keys: categorical value
                    values: probability of value
'''
#Thus the probability of the class can be accessed using multiple dict access.

'\ndict:\n    keys: class\n    values: dict:\n            keys: feature\n            values: dict:\n                    keys: categorical value\n                    values: probability of value\n'

In [82]:
len(mush[mush[target]=='e'][features]),len(mush[mush[target]=='p'][features])

(2935, 2752)

In [8]:
#empty dict probs and probcl are created
probs = {}
probcl = {}
for x in classes:
    mushcl = mush[mush[target]==x][features]
    clsp = {}
    tot = len(mushcl)
    for col in mushcl.columns:
        colp = {}
        for val,cnt in mushcl[col].value_counts().iteritems():
            pr = cnt/tot
            colp[val] = pr
        clsp[col] = colp
    probs[x] = clsp
    probcl[x] = len(mushcl)/len(mush)

In [9]:
def probabs(x):
    #X - pandas Series with index as feature
    if not isinstance(x,pd.Series):
        raise IOError("Arg must of type Series")
    probab = {}
    for cl in classes:
        pr = probcl[cl]
        for col,val in x.iteritems():
            try:
                pr *= probs[cl][col][val]
            except KeyError:
                pr = 0
        probab[cl] = pr
    return probab

In [10]:
def classify(x):
    probab = probabs(x)
    mx = 0
    mxcl = ''
    for cl,pr in probab.items():
        if pr > mx:
            mx = pr
            mxcl = cl
    return mxcl

In [11]:
#Train data
b = []
for i in mush.index:
    #print(classify(mush.loc[i,features]),mush.loc[i,target])
    b.append(classify(mush.loc[i,features]) == mush.loc[i,target])
print(sum(b),"correct of",len(mush))
print("Accuracy:", sum(b)/len(mush))

5678 correct of 5687
Accuracy: 0.998417443291718


In [12]:
#Test data
b = []
for i in test.index:
    #print(classify(mush.loc[i,features]),mush.loc[i,target])
    b.append(classify(test.loc[i,features]) == test.loc[i,target])
print(sum(b),"correct of",len(test))
print("Accuracy:",sum(b)/len(test))

2436 correct of 2437
Accuracy: 0.9995896594173164
