In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Naive_Bayes_Class

In [None]:
class NaiveBayes():
    """Naive Bayes classifier for categorical features held in a pandas DataFrame.

    After ``fit`` the instance exposes:

    * ``prior``     -- ``{class: P(class)}`` estimated from the training labels.
    * ``posterior`` -- ``{feature: {value: {class: P(class | feature == value)}}}``,
      i.e. for every observed feature value, the fraction of training rows
      with that value that belong to each class.
    * ``target``    -- label of the class column that was passed to ``fit``.

    No smoothing is applied: a (value, class) pair that never occurs in the
    training data gets probability 0 and therefore vetoes that class.
    """

    def __init__(self):
        self.prior = None
        # Previously misspelled ``posterion``, which left ``posterior``
        # undefined on a fresh instance.
        self.posterior = None
        self.target = None

    # Method to train
    def fit(self, df, target):
        """Estimate the prior and per-feature probability tables.

        Parameters
        ----------
        df : pandas.DataFrame
            Training rows. Every column except ``target`` is treated as a
            categorical feature.
        target : column label
            Name of the class-label column in ``df``.

        Raises
        ------
        ValueError
            If ``target`` is not a column of ``df`` or ``df`` has no rows.

        Notes
        -----
        Missing values (NaN) are ignored both as class labels and as feature
        categories; comparing against NaN never matches, so they would only
        produce empty (0/0) tables.
        """
        features = list(df.columns)
        if target not in features:
            raise ValueError('target column {!r} not found in df'.format(target))
        n_rows = df.shape[0]
        if n_rows == 0:
            raise ValueError('cannot fit on an empty DataFrame')
        features.remove(target)

        labels = df[target]
        classes = list(labels.dropna().unique())
        prior = {c: float((labels == c).sum() / n_rows) for c in classes}

        posterior = {}
        for feature in features:
            column = df[feature]
            table = {}
            for value in column.dropna().unique():
                # Labels of the training rows that carry this feature value.
                subset = labels[column == value]
                table[value] = {
                    c: float((subset == c).sum() / subset.shape[0]) for c in classes
                }
            posterior[feature] = table

        self.target = target
        self.prior = prior
        self.posterior = posterior
        print('Traing done!!!')

    # Method to predict the class
    def predict(self, series):
        """Return the most probable class for one sample.

        Parameters
        ----------
        series : pandas.Series
            One row, indexed by feature name. Entries whose label is not a
            trained feature (for example the target column itself) and
            feature values that were never seen during ``fit`` are skipped,
            i.e. treated as uninformative, instead of raising ``KeyError``.

        Returns
        -------
        The class label with the highest score; ties go to the class that
        appears first in ``self.prior``.
        """
        classes = list(self.prior.keys())
        scores = []
        for c in classes:
            prior_c = self.prior[c]
            if prior_c == 0:
                # A class with zero prior can never win and must not be divided by.
                scores.append(0.0)
                continue
            score = prior_c
            for feature in series.index:
                table = self.posterior.get(feature)
                if table is None:
                    continue
                by_class = table.get(series[feature])
                if by_class is None:
                    continue
                # The stored table holds P(c | x_i). Naive Bayes needs
                # P(x_i | c) = P(c | x_i) * P(x_i) / P(c); P(x_i) is the same
                # for every class, so dividing by P(c) is sufficient. Without
                # this division the prior is counted once per feature.
                score *= by_class[c] / prior_c
            scores.append(score)
        return classes[int(np.argmax(scores))]

    # Method to calculate accuracy, precisions and other metrics over the test data
    def evalate(self, test_df):
        """Predict every row of ``test_df``, print metrics, return predictions.

        Parameters
        ----------
        test_df : pandas.DataFrame
            Rows to score; must contain the target column used in ``fit``.

        Returns
        -------
        numpy.ndarray
            Predicted class labels, in the row order of ``test_df``.
        """
        y = test_df[self.target].values
        features_df = test_df.drop(columns=[self.target])
        # iloc is positional, so any index works (the previous .loc[i] lookup
        # required a fresh 0..n-1 RangeIndex).
        y_pred = np.array(
            [self.predict(features_df.iloc[i]) for i in range(features_df.shape[0])]
        )
        acc = np.sum(y == y_pred) / y.shape[0]
        classes = list(self.prior.keys())
        # Pass labels explicitly: classification_report otherwise sorts the
        # labels itself, and target_names given in a different order end up
        # attached to the wrong rows.
        report = metrics.classification_report(
            y, y_pred, labels=classes, target_names=[str(c) for c in classes]
        )
        print('accracy={}\n'.format(acc), report)
        return y_pred

    # Correctly spelled alias; ``evalate`` is kept so existing callers keep working.
    evaluate = evalate

### Loading mushroom dataset
https://www.kaggle.com/uciml/mushroom-classification

In [None]:
# Relative path: mushrooms.csv (from the Kaggle link above) must sit in the
# notebook's working directory.
df=pd.read_csv('mushrooms.csv')
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
# (rows, columns) of the full dataset.
df.shape

(8124, 23)

### train_test_split

In [None]:
# Hold out 1/6 of the rows for testing. The fixed random_state makes the
# shuffle, and therefore every number reported below, reproducible across runs.
df_train,df_test=train_test_split(df,test_size=1/6,shuffle=True,random_state=42)
# Renumber both partitions from 0 so they carry a clean positional index.
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)

In [None]:
# (rows, columns) of the training and test partitions.
print(df_train.shape,df_test.shape)

(6770, 23) (1354, 23)


In [None]:
# Preview the first rows of the held-out test partition.
df_test.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,k,f,g,f,n,f,w,b,w,...,k,w,w,p,w,t,p,w,n,g
1,p,x,s,n,t,p,f,c,n,w,...,s,w,w,p,w,o,p,n,v,g
2,p,x,f,y,f,f,f,c,b,h,...,k,p,b,p,w,o,l,h,y,p
3,e,f,f,w,f,n,f,w,b,h,...,f,w,w,p,w,o,e,n,a,g
4,e,x,s,y,t,l,f,c,b,k,...,s,w,w,p,w,o,p,k,s,g


### training model and finding test accuracy

In [None]:
# Fit on the training partition; 'class' is the label column and every other
# column is used as a feature.
model=NaiveBayes()
model.fit(df_train,target='class')

Traing done!!!


In [None]:
# Class priors estimated by fit(): {class: fraction of training rows}.
model.prior

{'p': 0.48670605612998524, 'e': 0.5132939438700148}

In [None]:
# Nested tables built by fit(): posterior[feature][value][class] is the
# fraction of training rows with that feature value that belong to the class.
model.posterior

{'cap-shape': {'k': {'p': 0.7317415730337079, 'e': 0.26825842696629215},
  'f': {'p': 0.4943224829674489, 'e': 0.5056775170325511},
  'x': {'p': 0.47281167108753314, 'e': 0.5271883289124668},
  'b': {'p': 0.1078167115902965, 'e': 0.8921832884097035},
  's': {'p': 0.0, 'e': 1.0},
  'c': {'p': 1.0, 'e': 0.0}},
 'cap-surface': {'y': {'p': 0.5417432879735197, 'e': 0.45825671202648033},
  's': {'p': 0.5612052730696798, 'e': 0.4387947269303202},
  'f': {'p': 0.3255330213208528, 'e': 0.6744669786791472},
  'g': {'p': 1.0, 'e': 0.0}},
 'cap-color': {'e': {'p': 0.590625, 'e': 0.409375},
  'n': {'p': 0.4527415143603133, 'e': 0.5472584856396867},
  'y': {'p': 0.6377142857142857, 'e': 0.36228571428571427},
  'g': {'p': 0.442257217847769, 'e': 0.5577427821522309},
  'w': {'p': 0.2970760233918129, 'e': 0.7029239766081872},
  'b': {'p': 0.7446808510638298, 'e': 0.2553191489361702},
  'p': {'p': 0.6140350877192983, 'e': 0.38596491228070173},
  'c': {'p': 0.2972972972972973, 'e': 0.7027027027027027},
 

### Evaluating on the test data

In [None]:
# Prints the accuracy and a classification report, and returns the predicted
# labels for every test row.
y_pred=model.evalate(df_test)

accracy=1.0
               precision    recall  f1-score   support

           p       1.00      1.00      1.00       733
           e       1.00      1.00      1.00       621

    accuracy                           1.00      1354
   macro avg       1.00      1.00      1.00      1354
weighted avg       1.00      1.00      1.00      1354



### Printing the first 50 predictions against the real labels

In [None]:
# First 50 predicted labels.
y_pred[:50]

array(['e', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'p', 'p',
       'e', 'p', 'p', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e',
       'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'p', 'e', 'e', 'e',
       'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'p', 'p'], dtype='<U1')

In [None]:
# First 50 true labels, for a side-by-side comparison with the predictions.
df_test['class'].values[:50]

array(['e', 'p', 'p', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'p', 'p',
       'e', 'p', 'p', 'p', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e',
       'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'p', 'e', 'e', 'e',
       'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'p', 'p'],
      dtype=object)