In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

### Naive Bayes classifier

In [31]:
class NaiveBayes():
    """Naive Bayes classifier for categorical features held in a DataFrame.

    Every column other than the target is treated as a discrete feature.
    Probabilities are plain relative frequencies (no smoothing), so a feature
    value that never occurs together with a class has probability 0 for it.

    Attributes
    ----------
    prior : dict or None
        ``{class_label: P(class)}``; ``None`` until ``fit`` is called.
    posterior : dict or None
        ``{class_label: {feature: {value: P(value | class)}}}``; ``None``
        until ``fit`` is called.
    target : str or None
        Name of the target column given to ``fit``.
    """

    def __init__(self):
        self.prior=None
        # Previously misspelled as ``posterion``, which left ``posterior``
        # undefined on a freshly constructed model.
        self.posterior=None
        self.target=None

    # Method to train
    def fit(self,df,target):
        """Estimate class priors and per-class feature-value frequencies.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data holding the target column plus the feature columns.
        target : str
            Name of the column with the class labels.

        Returns
        -------
        None
            Results are stored on ``self.prior`` and ``self.posterior``; a
            confirmation message is printed.

        Raises
        ------
        ValueError
            If ``target`` is not one of ``df``'s columns.
        """
        self.target=target
        features=list(df.columns)
        features.remove(target)
        classes=list(df[target].unique())
        n_rows=df.shape[0]
        prior={c:(df[target]==c).sum()/n_rows for c in classes}
        # Distinct values per feature over the WHOLE training frame, computed
        # once, so every class gets an entry (possibly 0) for every known value.
        levels={j:df[j].unique() for j in features}
        posterior={c:{} for c in classes}
        for c in classes:
            class_rows=df.loc[df[target]==c]
            n_class=class_rows.shape[0]
            for j in features:
                posterior[c][j]={v:(class_rows[j]==v).sum()/n_class for v in levels[j]}
        self.prior=prior
        self.posterior=posterior
        print('Traing done!!!')

    # Method to predict the class
    def predict(self,series):
        """Return the most probable class label for one sample.

        Parameters
        ----------
        series : pandas.Series
            One row whose index holds feature names. An entry named like the
            target column is ignored. A feature value that was not present in
            the training data contributes probability 0 instead of raising
            ``KeyError``.

        Returns
        -------
        The class label with the highest ``prior * product(likelihoods)``;
        ties (including the all-zero case) resolve to the first class.
        """
        classes=list(self.prior.keys())
        scores=[]
        for c in classes:
            p=self.prior[c]
            for f in series.index:
                if f==self.target:
                    # Rows taken straight from a labelled frame still carry
                    # the label; it is not a feature.
                    continue
                p*=self.posterior[c][f].get(series[f],0.0)
            scores.append(p)
        return classes[np.argmax(scores)]

    # Method to calculate accuracy, precisions and other metrics over the test data
    def evalate(self,test_df):
        """Predict every row of ``test_df`` and summarise the quality.

        Parameters
        ----------
        test_df : pandas.DataFrame
            Labelled data containing the target column used in ``fit``. Any
            index is accepted; rows are visited by position.

        Returns
        -------
        dict
            ``'acc'``: fraction of correct predictions;
            ``'metrics'``: text from ``metrics.classification_report``;
            ``'prediction'``: array of predicted labels;
            ``'confusion_matrix'``: DataFrame with ``<label>-Real`` rows and
            ``<label>-Pred`` columns.
        """
        y=test_df[self.target].values
        features_df=test_df.drop(columns=[self.target])
        # ``iloc`` (positional) instead of ``loc[i]``: the latter only works
        # when the index happens to be exactly 0..n-1.
        y_pred=np.array([self.predict(features_df.iloc[i]) for i in range(features_df.shape[0])])
        results={}
        results['acc']=np.sum(y==y_pred)/y.shape[0]
        results['metrics']=metrics.classification_report(y,y_pred)
        results['prediction']=y_pred
        # Use the sorted union of true and predicted labels and pass it
        # explicitly, so matrix shape and row/column names always agree even
        # when a class is predicted but absent from ``y``.
        labels=np.union1d(y,y_pred)
        mat=metrics.confusion_matrix(y,y_pred,labels=labels)
        # f-strings keep this working for non-string class labels.
        results['confusion_matrix']=pd.DataFrame(
            mat,
            index=[f'{lab}-Real' for lab in labels],
            columns=[f'{lab}-Pred' for lab in labels],
        )
        return results

    # Correctly spelled alias; ``evalate`` is kept for existing callers.
    evaluate=evalate

### Loading the mushroom dataset
Source: https://www.kaggle.com/uciml/mushroom-classification

In [3]:
# Read the mushroom data from a CSV file given by a relative path and preview the first rows.
df=pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(8124, 23)

### Train/test split

In [7]:
# Hold out one sixth of the rows for testing, with a seeded shuffle so the
# split is reproducible. Each part gets a fresh 0..n-1 index and the original
# row labels are dropped.
df_train,df_test=(
    part.reset_index(drop=True)
    for part in train_test_split(df,test_size=1/6,shuffle=True,random_state=42)
)

In [10]:
# Shapes of the training and test frames, printed side by side.
print(df_train.shape,df_test.shape)

(6770, 23) (1354, 23)


In [13]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,f,f,n,f,n,f,w,b,h,...,f,w,w,p,w,o,e,n,s,g
1,p,f,s,e,f,y,f,c,n,b,...,s,p,p,p,w,o,e,w,v,l
2,p,x,y,n,f,f,f,c,n,b,...,s,w,p,p,w,o,e,w,v,l
3,e,f,y,g,t,n,f,c,b,n,...,s,g,p,p,w,o,p,n,y,d
4,p,f,s,e,f,s,f,c,n,b,...,s,p,p,p,w,o,e,w,v,l


### Training the model and inspecting the learned probabilities

In [32]:
# Fit the classifier on the training split; 'class' is the label column.
model=NaiveBayes()
model.fit(df_train,target='class')

Traing done!!!


In [33]:
# Class priors stored by fit(): fraction of training rows per class.
model.prior

{'e': 0.5165435745937962, 'p': 0.4834564254062038}

In [34]:
# Per-class, per-feature value frequencies stored by fit().
model.posterior

{'e': {'cap-shape': {'f': 0.37260509007720904,
   'x': 0.4692593651701458,
   'b': 0.09322276236774378,
   'k': 0.057191878753217046,
   's': 0.007720903631684301,
   'c': 0.0},
  'cap-surface': {'f': 0.36974549613954816,
   'y': 0.3611667143265656,
   's': 0.2690877895338862,
   'g': 0.0},
  'cap-color': {'n': 0.2976837289104947,
   'g': 0.24449528167000287,
   'e': 0.14812696597083214,
   'w': 0.17329139262224763,
   'y': 0.09465255933657421,
   'b': 0.012582213325707749,
   'p': 0.014012010294538175,
   'r': 0.0034315127251930227,
   'c': 0.007720903631684301,
   'u': 0.004003431512725193},
  'bruises': {'f': 0.3440091507006005, 't': 0.6559908492993994},
  'odor': {'n': 0.811838718901916,
   's': 0.0,
   'c': 0.0,
   'f': 0.0,
   'y': 0.0,
   'l': 0.09379468115527595,
   'p': 0.0,
   'a': 0.09436659994280812,
   'm': 0.0},
  'gill-attachment': {'f': 0.9548184157849585, 'a': 0.04518158421504146},
  'gill-spacing': {'w': 0.28338575922219045, 'c': 0.7166142407778096},
  'gill-size': {'

### Evaluating on the test data

In [35]:
# Score the model on the held-out split; the returned dict has the keys
# 'acc', 'metrics', 'prediction' and 'confusion_matrix'.
results=model.evalate(df_test)

In [36]:
# Overall accuracy on the first line, then the classification report text.
print(f"Accuracy={results['acc']}",results['metrics'],sep='\n')

Accuracy=0.9970457902511078
              precision    recall  f1-score   support

           e       1.00      0.99      1.00       711
           p       0.99      1.00      1.00       643

    accuracy                           1.00      1354
   macro avg       1.00      1.00      1.00      1354
weighted avg       1.00      1.00      1.00      1354



In [39]:
# Confusion matrix as a DataFrame: '-Real' rows are true classes, '-Pred' columns are predicted classes.
results['confusion_matrix']

Unnamed: 0,e-Pred,p-Pred
e-Real,707,4
p-Real,0,643
