In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [2]:
df = pd.read_csv('mushrooms.csv')

In [3]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
df.shape

(8124, 23)

In [5]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [6]:
le?

In [7]:
le.fit_transform?

In [8]:
transformed_df = df.apply(le.fit_transform)

In [9]:
transformed_df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [10]:
transformed_df = transformed_df.values

In [11]:
transformed_df.shape

(8124, 23)

In [12]:
transformed_df[:2, :]

array([[1, 5, 2, 4, 1, 6, 1, 0, 1, 4, 0, 3, 2, 2, 7, 7, 0, 2, 1, 4, 2, 3,
        5],
       [0, 5, 2, 9, 1, 0, 1, 0, 0, 4, 0, 2, 2, 2, 7, 7, 0, 2, 1, 4, 3, 2,
        1]])

In [13]:
data_X = transformed_df[:, 1:]
data_Y = transformed_df[:, 0]

In [14]:
data_X.shape, data_Y.shape

((8124, 22), (8124,))

In [15]:
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(data_X, data_Y, test_size=0.2)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(6499, 22) (1625, 22) (6499,) (1625,)


In [16]:
def prior(Y_train, label):
    p = np.sum(Y_train == label)/Y_train.shape[0]
    return p

In [17]:
Y_train[:90]

array([0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1])

In [18]:
print(np.sum(Y_train == 0)/6499)

0.5179258347438067


In [19]:
prior(Y_train, 1)

0.4820741652561933

In [20]:
0.48130481612555776 + 0.5186951838744422

1.0

In [21]:
def conditional_prob(X_train, Y_train, feature_col, feature_value, label):
    X_train = X_train[Y_train == label]
    numerator = np.sum(X_train[:, feature_col] == feature_value)
    denominator = np.sum(Y_train == label)
    
    return numerator/denominator

In [22]:
np.unique(Y_train)

array([0, 1])

In [23]:
def predict(X_train, Y_train, X_test):
    num_classes = np.unique(Y_train)
    
    num_feat = X_train.shape[1]
    posterior_prob = []
    
    for label in num_classes:
        likelihood = 1
        prior_prob = prior(Y_train, label)
        for feat in range(num_feat):
            cond_prob = conditional_prob(X_train, Y_train, feat, X_test[feat], label)
            likelihood *= cond_prob
#             likelihood = likelihood*cond_prob
        
        posterior = likelihood*prior_prob
        posterior_prob.append(posterior)
        
        prediction = np.argmax(posterior_prob)
                
    return posterior_prob[prediction], prediction

In [24]:
pred = predict(X_train, Y_train, X_test[0])
print(pred, Y_test[0])

(2.4852757578351393e-08, 1) 1


In [25]:
def accuracy(X_train, Y_train, X_test, Y_test):
    pred = []
    
    for i in range(X_test.shape[0]):
        conf, label = predict(X_train, Y_train, X_test[i])
        pred.append(label)
        print(f'Test case belongs to {label} class, with confidence {conf}, and its label is {Y_test[i]}')
    pred = np.array(pred)
    
    acc = np.sum(pred == Y_test)/Y_test.shape[0]
    
    return acc

In [26]:
acc = accuracy(X_train, Y_train, X_test, Y_test)

Test case belongs to 1 class, with confidence 2.4852757578351393e-08, and its label is 1
Test case belongs to 1 class, with confidence 1.9330007576887266e-07, and its label is 1
Test case belongs to 0 class, with confidence 8.289059991209094e-07, and its label is 0
Test case belongs to 0 class, with confidence 3.4784440106256183e-09, and its label is 0
Test case belongs to 0 class, with confidence 4.508090404486711e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.4635253466520805e-09, and its label is 0
Test case belongs to 1 class, with confidence 1.9779826169159922e-09, and its label is 1
Test case belongs to 1 class, with confidence 1.0535694899634046e-09, and its label is 1
Test case belongs to 0 class, with confidence 5.331473735852919e-10, and its label is 0
Test case belongs to 1 class, with confidence 2.547886492589582e-08, and its label is 1
Test case belongs to 0 class, with confidence 1.1195051543333227e-07, and its label is 0
Test case belongs to 0 cl

Test case belongs to 0 class, with confidence 4.941268105723498e-07, and its label is 0
Test case belongs to 1 class, with confidence 4.643481386828849e-08, and its label is 1
Test case belongs to 0 class, with confidence 8.112713526731494e-07, and its label is 0
Test case belongs to 0 class, with confidence 8.532909787880468e-09, and its label is 0
Test case belongs to 1 class, with confidence 4.040553162064361e-09, and its label is 1
Test case belongs to 0 class, with confidence 1.2648540547214678e-07, and its label is 0
Test case belongs to 1 class, with confidence 3.504495038125625e-08, and its label is 1
Test case belongs to 0 class, with confidence 2.0396521289787516e-13, and its label is 0
Test case belongs to 1 class, with confidence 1.9482845449844126e-12, and its label is 1
Test case belongs to 0 class, with confidence 8.374758551416281e-07, and its label is 0
Test case belongs to 0 class, with confidence 4.125952552600392e-07, and its label is 0
Test case belongs to 0 class,

Test case belongs to 1 class, with confidence 2.668295260666158e-07, and its label is 1
Test case belongs to 0 class, with confidence 7.826218226844796e-08, and its label is 0
Test case belongs to 0 class, with confidence 1.928022999277933e-07, and its label is 0
Test case belongs to 1 class, with confidence 6.163917736540131e-08, and its label is 1
Test case belongs to 0 class, with confidence 1.09720593589707e-06, and its label is 0
Test case belongs to 1 class, with confidence 2.013769419747454e-09, and its label is 1
Test case belongs to 1 class, with confidence 3.278086672231279e-08, and its label is 1
Test case belongs to 0 class, with confidence 5.082689492272552e-07, and its label is 0
Test case belongs to 1 class, with confidence 1.463028018464951e-08, and its label is 1
Test case belongs to 1 class, with confidence 4.2866043973883005e-08, and its label is 1
Test case belongs to 1 class, with confidence 6.836671696523038e-08, and its label is 1
Test case belongs to 1 class, wi

Test case belongs to 1 class, with confidence 4.912564148049309e-09, and its label is 1
Test case belongs to 1 class, with confidence 1.2977530082338232e-07, and its label is 1
Test case belongs to 1 class, with confidence 7.965893161840344e-08, and its label is 1
Test case belongs to 0 class, with confidence 7.850495600179319e-15, and its label is 0
Test case belongs to 0 class, with confidence 1.32517473743095e-11, and its label is 0
Test case belongs to 1 class, with confidence 4.732192564278749e-08, and its label is 1
Test case belongs to 0 class, with confidence 3.9768170596162596e-10, and its label is 0
Test case belongs to 1 class, with confidence 6.919188482610212e-08, and its label is 1
Test case belongs to 1 class, with confidence 1.4927203072318996e-09, and its label is 1
Test case belongs to 1 class, with confidence 7.52975164977033e-12, and its label is 1
Test case belongs to 0 class, with confidence 4.135659243345705e-08, and its label is 0
Test case belongs to 1 class, w

Test case belongs to 1 class, with confidence 1.4896274990157498e-07, and its label is 1
Test case belongs to 0 class, with confidence 8.730122392460522e-07, and its label is 0
Test case belongs to 1 class, with confidence 8.987094973640244e-09, and its label is 1
Test case belongs to 1 class, with confidence 1.0405412160394666e-15, and its label is 0
Test case belongs to 0 class, with confidence 2.03696497667618e-11, and its label is 0
Test case belongs to 0 class, with confidence 9.586818838299492e-07, and its label is 0
Test case belongs to 0 class, with confidence 8.520290471353991e-11, and its label is 0
Test case belongs to 0 class, with confidence 9.00115586738379e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.2168132402418613e-07, and its label is 0
Test case belongs to 1 class, with confidence 4.266673508115081e-08, and its label is 1
Test case belongs to 0 class, with confidence 2.047820197115691e-07, and its label is 0
Test case belongs to 0 class, w

Test case belongs to 0 class, with confidence 1.5780963309655466e-07, and its label is 0
Test case belongs to 0 class, with confidence 6.318348292545438e-10, and its label is 0
Test case belongs to 1 class, with confidence 2.1194632193768306e-07, and its label is 1
Test case belongs to 1 class, with confidence 1.901547668647235e-11, and its label is 1
Test case belongs to 1 class, with confidence 1.4244695024078587e-08, and its label is 1
Test case belongs to 0 class, with confidence 1.7824195367428537e-07, and its label is 0
Test case belongs to 0 class, with confidence 3.274826073823255e-08, and its label is 0
Test case belongs to 0 class, with confidence 3.908082130028668e-09, and its label is 0
Test case belongs to 0 class, with confidence 2.2506927445156057e-10, and its label is 0
Test case belongs to 1 class, with confidence 2.2429751093511997e-08, and its label is 1
Test case belongs to 1 class, with confidence 3.040020622299621e-07, and its label is 1
Test case belongs to 1 cla

Test case belongs to 1 class, with confidence 2.292032930917375e-08, and its label is 1
Test case belongs to 1 class, with confidence 9.122741709659248e-08, and its label is 1
Test case belongs to 1 class, with confidence 1.8185400220236006e-08, and its label is 1
Test case belongs to 1 class, with confidence 5.23387629225025e-08, and its label is 1
Test case belongs to 0 class, with confidence 5.070154107680766e-08, and its label is 0
Test case belongs to 0 class, with confidence 5.198923988290181e-07, and its label is 0
Test case belongs to 0 class, with confidence 6.054249302552796e-09, and its label is 0
Test case belongs to 1 class, with confidence 3.876595258699121e-07, and its label is 1
Test case belongs to 1 class, with confidence 1.3681238245394641e-10, and its label is 1
Test case belongs to 1 class, with confidence 9.499961228297612e-13, and its label is 1
Test case belongs to 0 class, with confidence 7.257721047021247e-14, and its label is 0
Test case belongs to 1 class, w

Test case belongs to 0 class, with confidence 1.2594052787742699e-08, and its label is 0
Test case belongs to 0 class, with confidence 9.170244304433506e-08, and its label is 0
Test case belongs to 0 class, with confidence 3.456106054962741e-11, and its label is 0
Test case belongs to 1 class, with confidence 4.026441107633698e-08, and its label is 1
Test case belongs to 0 class, with confidence 1.1046564105124005e-12, and its label is 0
Test case belongs to 0 class, with confidence 8.560228078973637e-16, and its label is 0
Test case belongs to 1 class, with confidence 2.953296800777462e-08, and its label is 1
Test case belongs to 1 class, with confidence 3.456504944325291e-09, and its label is 1
Test case belongs to 1 class, with confidence 3.54257515610068e-07, and its label is 1
Test case belongs to 1 class, with confidence 1.2095798763480937e-07, and its label is 1
Test case belongs to 1 class, with confidence 1.3822947321375666e-07, and its label is 1
Test case belongs to 0 class,

Test case belongs to 0 class, with confidence 1.3492963840065292e-12, and its label is 0
Test case belongs to 1 class, with confidence 1.4102209636162058e-07, and its label is 1
Test case belongs to 0 class, with confidence 3.661570735381303e-07, and its label is 0
Test case belongs to 1 class, with confidence 8.497999944131716e-12, and its label is 1
Test case belongs to 0 class, with confidence 3.0537596193434857e-15, and its label is 0
Test case belongs to 0 class, with confidence 3.6145428441330154e-11, and its label is 0
Test case belongs to 1 class, with confidence 8.684625386116414e-12, and its label is 1
Test case belongs to 0 class, with confidence 1.0338382475897104e-07, and its label is 0
Test case belongs to 1 class, with confidence 8.777278794998969e-08, and its label is 1
Test case belongs to 0 class, with confidence 2.1039512490624296e-07, and its label is 0
Test case belongs to 1 class, with confidence 1.1001137424394396e-08, and its label is 1
Test case belongs to 0 cl

Test case belongs to 0 class, with confidence 8.692565948114359e-12, and its label is 0
Test case belongs to 1 class, with confidence 7.733561116521487e-13, and its label is 1
Test case belongs to 1 class, with confidence 8.554888582994303e-10, and its label is 1
Test case belongs to 1 class, with confidence 1.7249567758546754e-07, and its label is 1
Test case belongs to 1 class, with confidence 1.1566563002309281e-08, and its label is 1
Test case belongs to 0 class, with confidence 2.1574910467967365e-09, and its label is 0
Test case belongs to 1 class, with confidence 4.867224474604648e-07, and its label is 1
Test case belongs to 1 class, with confidence 3.654293213907113e-21, and its label is 1
Test case belongs to 0 class, with confidence 1.746922972679763e-08, and its label is 0
Test case belongs to 1 class, with confidence 2.2465017164948917e-07, and its label is 1
Test case belongs to 0 class, with confidence 1.887669849451317e-10, and its label is 0
Test case belongs to 1 class

Test case belongs to 0 class, with confidence 2.389393124344992e-15, and its label is 0
Test case belongs to 0 class, with confidence 5.511717195019143e-07, and its label is 0
Test case belongs to 0 class, with confidence 3.686870734605047e-10, and its label is 0
Test case belongs to 0 class, with confidence 3.280255681618318e-08, and its label is 0
Test case belongs to 0 class, with confidence 5.027919131364444e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.3855962663516138e-08, and its label is 0
Test case belongs to 1 class, with confidence 1.857627149802994e-07, and its label is 1
Test case belongs to 0 class, with confidence 6.723349913317288e-10, and its label is 0
Test case belongs to 1 class, with confidence 2.6544169505437113e-07, and its label is 1
Test case belongs to 1 class, with confidence 8.616630941640646e-09, and its label is 1
Test case belongs to 0 class, with confidence 8.687155348935002e-07, and its label is 0
Test case belongs to 1 class, 

Test case belongs to 1 class, with confidence 1.8896739173124835e-07, and its label is 1
Test case belongs to 0 class, with confidence 1.729939873485295e-07, and its label is 0
Test case belongs to 0 class, with confidence 2.3552132264911892e-07, and its label is 0
Test case belongs to 1 class, with confidence 3.8569574034408667e-07, and its label is 1
Test case belongs to 1 class, with confidence 2.973876276164008e-13, and its label is 1
Test case belongs to 1 class, with confidence 1.3716244416821446e-07, and its label is 1
Test case belongs to 0 class, with confidence 1.1260631729607493e-09, and its label is 0
Test case belongs to 1 class, with confidence 6.991028611655871e-13, and its label is 1
Test case belongs to 1 class, with confidence 4.2270369187213335e-08, and its label is 1
Test case belongs to 1 class, with confidence 6.277426744227825e-08, and its label is 1
Test case belongs to 0 class, with confidence 3.2317594405566446e-09, and its label is 0
Test case belongs to 0 cl

Test case belongs to 1 class, with confidence 1.6294940042698793e-12, and its label is 1
Test case belongs to 0 class, with confidence 1.0113019329298256e-07, and its label is 0
Test case belongs to 1 class, with confidence 4.1634602359340454e-08, and its label is 1
Test case belongs to 0 class, with confidence 5.16456513339061e-10, and its label is 0
Test case belongs to 1 class, with confidence 1.1726629946384162e-07, and its label is 1
Test case belongs to 0 class, with confidence 2.093775530876775e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.4905545999558323e-10, and its label is 0
Test case belongs to 0 class, with confidence 6.089567097185919e-07, and its label is 0
Test case belongs to 0 class, with confidence 9.516615680046048e-13, and its label is 0
Test case belongs to 1 class, with confidence 2.3103258643621202e-07, and its label is 1
Test case belongs to 0 class, with confidence 2.0360772344814058e-16, and its label is 0
Test case belongs to 0 cla

Test case belongs to 0 class, with confidence 7.251918835182564e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.6632595100430051e-09, and its label is 0
Test case belongs to 0 class, with confidence 4.362895746549886e-07, and its label is 0
Test case belongs to 1 class, with confidence 8.828092489716857e-08, and its label is 1
Test case belongs to 1 class, with confidence 3.161138650320941e-09, and its label is 1
Test case belongs to 0 class, with confidence 1.1619904592344142e-12, and its label is 0
Test case belongs to 1 class, with confidence 4.9611415081517156e-17, and its label is 1
Test case belongs to 0 class, with confidence 8.425322669832173e-07, and its label is 0
Test case belongs to 1 class, with confidence 1.62925401855526e-08, and its label is 1
Test case belongs to 1 class, with confidence 1.5862621763443268e-07, and its label is 1
Test case belongs to 0 class, with confidence 1.5234690548444462e-10, and its label is 0
Test case belongs to 1 class

Test case belongs to 0 class, with confidence 1.1522900297890813e-08, and its label is 0
Test case belongs to 0 class, with confidence 1.0217800336165078e-07, and its label is 0
Test case belongs to 0 class, with confidence 2.301151050907046e-10, and its label is 0
Test case belongs to 0 class, with confidence 4.5865402998929884e-10, and its label is 0
Test case belongs to 0 class, with confidence 5.139780837925104e-07, and its label is 0
Test case belongs to 0 class, with confidence 1.3505637050534485e-11, and its label is 0
Test case belongs to 0 class, with confidence 4.594771367842229e-09, and its label is 0
Test case belongs to 0 class, with confidence 4.548048167288139e-10, and its label is 0
Test case belongs to 0 class, with confidence 9.649951182245885e-08, and its label is 0
Test case belongs to 0 class, with confidence 3.308460811230889e-11, and its label is 0
Test case belongs to 0 class, with confidence 1.5517095552044465e-07, and its label is 0
Test case belongs to 1 clas

Test case belongs to 1 class, with confidence 1.6972971630312453e-07, and its label is 1
Test case belongs to 0 class, with confidence 3.169679537709522e-11, and its label is 0
Test case belongs to 0 class, with confidence 1.3078851522446282e-11, and its label is 0
Test case belongs to 0 class, with confidence 3.7355738253396274e-07, and its label is 0
Test case belongs to 1 class, with confidence 3.149206160953199e-08, and its label is 1
Test case belongs to 0 class, with confidence 6.59432812729587e-13, and its label is 0
Test case belongs to 0 class, with confidence 2.8352053029036948e-11, and its label is 0
Test case belongs to 1 class, with confidence 7.088095678311617e-10, and its label is 1
Test case belongs to 0 class, with confidence 9.286412985905672e-07, and its label is 0
Test case belongs to 0 class, with confidence 4.278080592590939e-08, and its label is 0
Test case belongs to 0 class, with confidence 3.1091491175440476e-10, and its label is 0
Test case belongs to 0 class

Test case belongs to 1 class, with confidence 4.0765053222925857e-07, and its label is 1
Test case belongs to 0 class, with confidence 2.5629967612323734e-09, and its label is 0
Test case belongs to 1 class, with confidence 4.534647751508236e-09, and its label is 1


In [27]:
print(acc*100, '%', sep='')

99.6923076923077%
