# In this notebook we will make a Machine Learning model which helps us to predict that  Mushroom is safe to eat or is it poisons?


In [1]:
# Importing all important libraries which will be in use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Reading data using pandas
data = pd.read_csv('mushrooms.csv')

In [3]:
# Getting top 5 rows of our dataset
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Total rows and columns in our dataset
data.shape

(8124, 23)

In [5]:
# All column names in our dataset
data.columns.values

array(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'], dtype=object)

In [6]:
# Checking null values in the dataset
data.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [7]:
# 
data['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

## Converting all categorical data into Numerical data

In [8]:
# Using pandas method, we will convert the catigorical to numeric variables.
dummy = pd.get_dummies(data,drop_first=True)

In [9]:
# Merging the variables in the dataset
data1 = pd.concat([data, dummy],axis=1)

In [10]:
# Getting top 5 rows 
data1.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,p,x,s,n,t,p,f,c,n,k,...,0,1,0,0,0,0,0,0,1,0
1,e,x,s,y,t,a,f,c,b,k,...,1,0,0,0,1,0,0,0,0,0
2,e,b,s,w,t,l,f,c,b,n,...,1,0,0,0,0,0,1,0,0,0
3,p,x,y,w,t,p,f,c,n,n,...,0,1,0,0,0,0,0,0,1,0
4,e,x,s,g,f,n,f,w,b,k,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# Again checking the shape of our dataset
data1.shape

(8124, 119)

In [12]:
# All columns in our dataset
data1.columns.values

array(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat', 'class_p', 'cap-shape_c', 'cap-shape_f',
       'cap-shape_k', 'cap-shape_s', 'cap-shape_x', 'cap-surface_g',
       'cap-surface_s', 'cap-surface_y', 'cap-color_c', 'cap-color_e',
       'cap-color_g', 'cap-color_n', 'cap-color_p', 'cap-color_r',
       'cap-color_u', 'cap-color_w', 'cap-color_y', 'bruises_t', 'odor_c',
       'odor_f', 'odor_l', 'odor_m', 'odor_n', 'odor_p', 'odor_s',
       'odor_y', 'gill-attachment_f', 'gill-spacing_w', 'gill-size_n',
       'gill-color_e', 'gill-color_g', 'gill-color_h', 'gill-color_k',
       'gill-color_n', 'gill-color_o', 'gill-color

In [21]:
# Dropping columns which will be in no iuse
data1 = data1.drop(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],axis=1)

In [23]:
# Checking the shape of our dataset
data1.shape

(8124, 96)

As we can see that the number of features are in large quantity, which will affect our machine learning model. So, the solution is to do the Dimensionality redution.

In [24]:
# Top 5 rows
data1.head()

Unnamed: 0,class_p,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [29]:
# Classifying the data in to X variable
X  = data1.drop(['class_p'],axis=1).values

In [37]:
X.shape

(8124, 95)

In [30]:
X

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [33]:
#  Classifying the data in to y variable
y = data1['class_p'].values

In [34]:
y

array([1, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [25]:
# Splitting dataset into train and test
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [39]:
# Doing standard scaling of our data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [43]:
X_train

array([[-0.02348341,  1.2607294 , -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162, -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162, -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       ...,
       [-0.02348341,  1.2607294 , -0.34635076, ...,  2.49184099,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162,  2.88724646, ...,  2.49184099,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162,  2.88724646, ..., -0.40130972,
        -0.21100049, -0.16188464]])

In [44]:
X_test

array([[-0.02348341,  1.2607294 , -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341,  1.2607294 , -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162, -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       ...,
       [-0.02348341, -0.79319162, -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341, -0.79319162, -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464],
       [-0.02348341,  1.2607294 , -0.34635076, ..., -0.40130972,
        -0.21100049, -0.16188464]])

## Linear Discriminant Analysis

In [47]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components = 1 )
X_train = lda.fit_transform(X_train,y_train)
X_test = lda.transform(X_test)

## Fitting our Machine Learning model 

In [48]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[1378    0]
 [   0 1303]]


## K-Fold cross validation

In [55]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 99.96 %
Standard Deviation: 0.11 %


## As we can see that with the help of K-fold cross validation our accuracy of model is 99.96, which is preety good.