#### Importing Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Reading the data

In [2]:
# Raise the column display limit first so the wide preview below is not truncated.
pd.set_option('display.max_columns', 500)
data = pd.read_csv('mushrooms.csv')
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g


In [3]:
# List the column labels of the loaded frame.
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

### Encoding the Categorical data

In [4]:
from sklearn.preprocessing import LabelEncoder
# Keep one fitted encoder per column. A single shared encoder is refitted on
# every column and ends up remembering only the last one, which makes it
# impossible to map the integer codes back to the original categories.
encoders = {col: LabelEncoder().fit(data[col]) for col in data.columns}
df = data.apply(lambda column: encoders[column.name].transform(column))
# `le` stays defined for any code that still refers to it; it is now the
# encoder of the target column, so le.classes_ lists the original class labels.
le = encoders['class']
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,0,3,2,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,0,2,2,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,0,2,2,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,0,3,2,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,1,3,2,2,7,7,0,2,1,0,3,0,1


#### Splitting the data into training and testing data

In [5]:
# Select features and target by column name instead of a hard-coded list of
# positions, so the split keeps working if columns are added or reordered.
X = df.drop(columns='class').values
Y = df['class'].values
from sklearn.model_selection import train_test_split as tt
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test , Y_train, Y_test=tt(X,Y,test_size=0.2,random_state=100)

In [6]:
# (rows, columns) of the training feature matrix.
X_train.shape

(6499, 22)

In [7]:
# Shape of the full label vector, i.e. before the train/test split.
Y.shape

(8124,)

## Building the Naive Bayes Model

In [8]:
def prior_prob(y_train,label):
    """Return the fraction of entries in ``y_train`` that equal ``label``.

    Parameters
    ----------
    y_train : numpy.ndarray
        One-dimensional array of training labels.
    label : scalar
        The class whose relative frequency is wanted.

    Returns
    -------
    float
        Count of matching labels divided by the number of labels.
    """
    is_label = y_train == label
    matches = np.sum(is_label)
    total = y_train.shape[0]
    return matches / float(total)

In [9]:
def livelihood(x_train,y_train,feature_col,feature_val,label,alpha=0.0):
    """Estimate the likelihood P(feature == feature_val | class == label).

    (The function name is kept for existing callers; it computes a likelihood.)

    Parameters
    ----------
    x_train : numpy.ndarray
        Two-dimensional array of training features, one row per sample.
    y_train : numpy.ndarray
        One-dimensional array of training labels aligned with ``x_train``.
    feature_col : int
        Column index of the feature being evaluated.
    feature_val : scalar
        Value of that feature in the sample being classified.
    label : scalar
        Class the likelihood is conditioned on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. The default ``0.0`` gives the
        plain relative frequency. A positive value keeps a feature value
        never seen with ``label`` from producing a likelihood of exactly
        zero, which would wipe out the whole product of likelihoods.

    Returns
    -------
    float
        ``(matches + alpha) / (class_count + alpha * n_values)`` where
        ``n_values`` is the number of distinct values in the feature column,
        or ``0.0`` when that denominator is zero (for example when ``label``
        does not occur in ``y_train`` and ``alpha`` is 0).

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Build the class mask once and reuse it for both the count and the row filter.
    in_class = y_train == label
    class_count = np.sum(in_class)
    match_count = np.sum(x_train[in_class][:, feature_col] == feature_val)

    # The distinct-value count is only needed (and only computed) when smoothing.
    if alpha:
        smoothing_mass = alpha * np.unique(x_train[:, feature_col]).shape[0]
    else:
        smoothing_mass = 0.0

    denom = class_count + smoothing_mass
    if denom == 0:
        # No evidence for this class at all: report zero instead of NaN.
        return 0.0
    return (match_count + alpha) / float(denom)

In [10]:
def posterior_prob(x_train,y_train,x_test):
    """Predict the class of a single sample with the naive Bayes rule.

    For each distinct label in ``y_train`` the unnormalised posterior
    ``prior * product of per-feature likelihoods`` is computed, and the label
    with the largest value is returned.

    Parameters
    ----------
    x_train : numpy.ndarray
        Two-dimensional array of training features, one row per sample.
    y_train : numpy.ndarray
        One-dimensional array of training labels aligned with ``x_train``.
    x_test : sequence
        Feature values of the one sample to classify, indexed by column.

    Returns
    -------
    scalar
        The predicted label, taken from ``y_train``. When several classes tie
        for the highest score, the smallest label wins because ``np.unique``
        returns sorted labels and ``np.argmax`` picks the first maximum.
    """
    labels = np.unique(y_train)
    scores = []
    for label in labels:
        live = 1.0
        for i_col in range(x_train.shape[1]):
            live *= livelihood(x_train,y_train,i_col,x_test[i_col],label)
        scores.append(prior_prob(y_train,label) * live)
    # np.argmax gives a position within `labels`, not a label. Map it back so
    # label sets other than 0..k-1 are predicted correctly.
    return labels[np.argmax(scores)]

In [11]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the hand-written classifier on a test set.

    Every row of ``x_test`` is classified with ``posterior_prob`` against the
    given training data, and the predictions are compared with ``y_test``.

    Parameters
    ----------
    x_train, y_train : numpy.ndarray
        Training features and labels passed through to ``posterior_prob``.
    x_test : numpy.ndarray
        Two-dimensional array of samples to classify, one per row.
    y_test : numpy.ndarray
        True labels aligned with the rows of ``x_test``.

    Returns
    -------
    float
        Number of correct predictions divided by ``y_test.shape[0]``.
    """
    predicted = np.array(
        [posterior_prob(x_train, y_train, sample) for sample in x_test]
    )
    hits = np.sum(predicted == y_test)
    return hits / float(y_test.shape[0])

In [12]:
# Accuracy of the hand-written classifier on the held-out split.
score(X_train,Y_train,X_test,Y_test)

0.9950769230769231

## Model using sklearn

In [13]:
from sklearn.naive_bayes import GaussianNB
# Fit scikit-learn's Gaussian naive Bayes on the label-encoded training features.
nb = GaussianNB()
nb.fit(X_train,Y_train)

import statsmodels.api as sm
# NOTE(review): this rebinds X_train to a version with an added constant
# column, so every later cell that reads X_train sees one more column than the
# original split produced (and one more than X_test has at this point).
X_train = sm.add_constant(X_train)
# Binomial-family GLM fitted on the augmented features.
# NOTE(review): the name `nbm` suggests naive Bayes, but this is a separate
# GLM fit and does not use `nb` - confirm the intended naming.
nbm = sm.GLM(Y_train, X_train , family=sm.families.Binomial()).fit()
print(nbm.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                 6499
Model:                            GLM   Df Residuals:                     6477
Model Family:                Binomial   Df Model:                           21
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -890.36
Date:                Sun, 07 Apr 2019   Deviance:                       1780.7
Time:                        17:39:22   Pearson chi2:                 3.41e+05
No. Iterations:                    25   Covariance Type:             nonrobust
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -74.4237   2.93e+04     -0.003      0.998   -5.76e+04    5.74e+04
x1             0.0273      0.039      0.696      0.4

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def VIF(input_data, dependent_col):
    """Tabulate the variance inflation factor of every predictor column.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame holding the dependent column and all predictor columns.
    dependent_col : str
        Name of the column to leave out of the calculation.

    Returns
    -------
    pandas.DataFrame
        Columns ``Var`` (predictor name) and ``Vif`` (its factor, rounded to
        two decimals), sorted by ``Vif`` in descending order. The row index
        keeps each predictor's position among the predictor columns.
    """
    predictors = input_data.drop(columns=[dependent_col])
    # NOTE(review): the matrix is passed exactly as given, with no intercept
    # column added - confirm that is what variance_inflation_factor should see.
    design = predictors.values
    factors = [
        variance_inflation_factor(design, position)
        for position in range(predictors.shape[1])
    ]
    table = pd.DataFrame({"Var": predictors.columns, "Vif": factors})
    return table.sort_values(by="Vif", ascending=False).round(2)

In [15]:
# VIF of every encoded predictor, with the target column 'class' left out.
VIF(df,'class')

  return 1 - self.ssr/self.uncentered_tss


Unnamed: 0,Var,Vif
16,veil-color,280.12
5,gill-attachment,256.12
17,ring-number,65.37
20,population,21.43
14,stalk-color-below-ring,17.22
13,stalk-color-above-ring,17.16
19,spore-print-color,16.94
18,ring-type,13.43
12,stalk-surface-below-ring,11.77
11,stalk-surface-above-ring,11.71


In [16]:
from sklearn.naive_bayes import GaussianNB
# Refit on the current X_train, which carries the constant column added for
# the GLM above.
nb = GaussianNB()
nb.fit(X_train,Y_train)
# Build the matching test matrix under a new name instead of rebinding X_test.
# With has_constant='add' every re-run of this cell would otherwise append one
# more column to X_test and make predict() fail on a feature-count mismatch.
X_test_const = sm.add_constant(X_test,has_constant= 'add')
y_pred = nb.predict(X_test_const)

## Prediction

In [17]:
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
# Count matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(Y_test,y_pred)

In [18]:
# Show the raw counts of the confusion matrix.
print(cm)

[[764  62]
 [ 57 742]]


In [19]:
# classification_report returns one multi-line string. Print it so the
# newlines render as a table instead of appearing as escaped '\n' in the
# cell's repr.
print(classification_report(Y_test,y_pred))

'              precision    recall  f1-score   support\n\n           0       0.93      0.92      0.93       826\n           1       0.92      0.93      0.93       799\n\n   micro avg       0.93      0.93      0.93      1625\n   macro avg       0.93      0.93      0.93      1625\nweighted avg       0.93      0.93      0.93      1625\n'

In [20]:
# Fraction of test labels that y_pred matches.
accuracy_score(Y_test , y_pred)

0.9267692307692308