In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder #Could have been done using mapping via dictionary
from sklearn.model_selection import train_test_split #Could have done ourselves too but it saves the work

In [4]:
#Load the mushroom dataset from the CSV file in the working directory
df = pd.read_csv("Mushrooms.csv")
print(df.head())
#There are 23 columns: the `type` label followed by 22 feature columns

  type cap_shape cap_surface cap_color bruises odor gill_attachment  \
0    p         x           s         n       t    p               f   
1    e         x           s         y       t    a               f   
2    e         b           s         w       t    l               f   
3    p         x           y         w       t    p               f   
4    e         x           s         g       f    n               f   

  gill_spacing gill_size gill_color   ...   stalk_surface_below_ring  \
0            c         n          k   ...                          s   
1            c         b          k   ...                          s   
2            c         b          n   ...                          s   
3            c         n          n   ...                          s   
4            w         b          k   ...                          s   

  stalk_color_above_ring stalk_color_below_ring veil_type veil_color  \
0                      w                      w         p          w

In [5]:
df.shape
#(rows, columns): the dataset has 8124 rows and 23 columns

(8124, 23)

In [6]:
#The data is categorical, hence we need to convert it into numerical data
#We could create a dictionary for every column and do the mapping ourselves,
#but LabelEncoder from sklearn.preprocessing does that work for us

le = LabelEncoder()
#Applies the transformation to each column separately
#NOTE: the same encoder object is re-fitted on every column, so afterwards `le`
#only remembers the classes of the last column it saw
ds = df.apply(le.fit_transform)

In [7]:
print(ds.head())
#The mapping is done column-wise: every column gets its own letter -> integer encoding

   type  cap_shape  cap_surface  cap_color  bruises  odor  gill_attachment  \
0     1          5            2          4        1     6                1   
1     0          5            2          9        1     0                1   
2     0          0            2          8        1     3                1   
3     1          5            3          8        1     6                1   
4     0          5            2          3        0     5                1   

   gill_spacing  gill_size  gill_color   ...     stalk_surface_below_ring  \
0             0          1           4   ...                            2   
1             0          0           4   ...                            2   
2             0          0           5   ...                            2   
3             0          1           5   ...                            2   
4             1          0           4   ...                            2   

   stalk_color_above_ring  stalk_color_below_ring  veil_type  veil_c

In [9]:
#Convert the encoded DataFrame ds into a NumPy array and store it in data
data = ds.values
print(data.shape)
print(type(data))

(8124, 23)
<class 'numpy.ndarray'>


In [11]:
#The first column contains the label, so it goes into Y; the remaining columns are the features
X = data[:,1:]
Y = data[:,0]
#Split into training and test sets (20% held out for testing)
#A fixed random_state makes the split, and every number computed from it, reproducible on re-run
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [12]:
#Check the sizes of the split: feature matrices first, label vectors second
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(6499, 22) (6499,)
(1625, 22) (1625,)


In [15]:
def prior_prob(y_train,label):
    """Return the prior probability of `label`.

    This is the fraction of entries in the array `y_train` that are equal
    to `label`, i.e. (count of matching entries) / (total number of entries).
    """
    is_label = (y_train == label) #Boolean mask, True where the entry has the requested label
    n_samples = y_train.shape[0]
    return np.sum(is_label) / n_samples
#Quick sanity check: 4 of the 12 entries are equal to 1, so the prior should be 4/12 = 1/3
a = np.array([0,0,0,1,1,2,1,3,4,1,2,6])
print(prior_prob(a,1))

0.3333333333333333


In [21]:
def cond_prob(x_train,y_train,feature_col,feature_val,label):
    """Return the conditional probability P(feature == feature_val | class == label).

    x_train     : 2-D array of features, one row per sample
    y_train     : 1-D array of labels aligned with the rows of x_train
    feature_col : index of the feature column to look at
    feature_val : value of that feature whose probability we want
    label       : class we condition on

    Example: the probability of a mushroom having a certain colour when we
    already know its class.
    """
    in_class = (y_train == label) #Only consider the rows that carry the requested label
    feature_in_class = x_train[in_class, feature_col] #The chosen feature, restricted to those rows
    matches = np.sum(feature_in_class == feature_val) #How many of those rows have the requested value
    class_count = np.sum(in_class) #How many rows carry the requested label at all
    return matches / float(class_count)

In [22]:
#Naive Bayes prediction for a single test point:
#  for every class label we multiply the conditional probabilities of all the
#  feature values of the test point (cond_prob) to get the likelihood,
#  multiply that by the prior of the label (prior_prob) to get the posterior score,
#  and finally pick the label with the largest score
def predict(x_train,y_train,x_test):
    """Predict the class label of a single test point.

    x_train : 2-D array of training features, one row per sample
    y_train : 1-D array of training labels aligned with the rows of x_train
    x_test  : a single testing point, i.e. a 1-D sequence of n feature values

    Returns the class label (an entry of np.unique(y_train)) with the
    highest unnormalised posterior, likelihood * prior.
    """

    classes = np.unique(y_train) #Sorted array of the distinct class labels
    n_features = x_train.shape[1] #Number of features
    posteriors = [] #One score per entry of `classes`, in the same order

    for label in classes:
        #Likelihood = product over all features of P(feature value | label)
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= cond_prob(x_train,y_train,f,x_test[f],label)

        #Posterior (up to a constant) = likelihood * prior
        posteriors.append(likelihood*prior_prob(y_train,label))

    #np.argmax returns a POSITION in `classes`, not a label; map it back so the
    #result is also correct when the labels are not exactly 0..k-1
    return classes[np.argmax(posteriors)]

In [23]:
#Try the classifier on one test point (index 7) and compare it with the true label
output = predict(x_train,y_train,x_test[7])
print(output) #The output given by our algo
print(y_test[7]) #The actual label

0
0


In [24]:
def score(x_train,y_train,x_test,y_test):
    """Return the accuracy of the naive Bayes classifier on a test set.

    x_train, y_train : training features / labels handed to `predict`
    x_test           : 2-D array of test features, one row per test point
    y_test           : 1-D array with the true labels of the test points

    Returns (number of correct predictions) / (total number of test points).
    """
    #Predict a label for every test point
    pred = np.array([predict(x_train,y_train,x_test[i]) for i in range(x_test.shape[0])])

    #All correct predictions / total number of tests
    accuracy = np.sum(pred==y_test)/y_test.shape[0]
    return accuracy

#The function used to be defined under the misspelled name `accuarca` while the
#call below asked for `score` (a NameError on a fresh kernel); keep the old name as an alias
accuarca = score

print(score(x_train,y_train,x_test,y_test))

0.9950769230769231
