## Mushroom Edibility Classification

### Importing the Library

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Importing the data

In [6]:
df_raw=pd.read_csv('secondary_data.csv',sep=';')
df=df_raw.copy()
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,17.09,s,y,w,u,w,t,g,,d,w
1,p,16.6,x,g,o,f,e,,w,17.99,18.19,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.8,17.74,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,15.98,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,17.2,s,y,w,u,w,t,p,,d,w


### Data Processing

In [7]:
df.isna().sum()

class                       0
cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64

In [8]:
df.shape

(61069, 21)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-root             9531 non-null   object 
 12  stem-surface          22945 non-null  object 
 13  stem-color            61069 non-null  object 
 14  veil-type             3177 non-null   object 
 15  veil-color         

In [10]:
misssing_per=((df.isna().sum())/df.shape[0])*100
misssing_per

class                    0.000000
cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
dtype: float64

In [11]:
df.describe()

Unnamed: 0,cap-diameter,stem-height,stem-width
count,61069.0,61069.0,61069.0
mean,6.733854,6.581538,12.14941
std,5.264845,3.370017,10.035955
min,0.38,0.0,0.0
25%,3.48,4.64,5.21
50%,5.86,5.95,10.19
75%,8.54,7.74,16.57
max,62.34,33.92,103.91


In [12]:
df.drop(['stem-root','stem-surface','veil-type','veil-color','spore-print-color'],axis=1,inplace=True)

In [13]:
misssing_per=((df.isna().sum())/df.shape[0])*100
misssing_per

class                    0.000000
cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-color               0.000000
has-ring                 0.000000
ring-type                4.046243
habitat                  0.000000
season                   0.000000
dtype: float64

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 61069 non-null  object 
 1   cap-diameter          61069 non-null  float64
 2   cap-shape             61069 non-null  object 
 3   cap-surface           46949 non-null  object 
 4   cap-color             61069 non-null  object 
 5   does-bruise-or-bleed  61069 non-null  object 
 6   gill-attachment       51185 non-null  object 
 7   gill-spacing          36006 non-null  object 
 8   gill-color            61069 non-null  object 
 9   stem-height           61069 non-null  float64
 10  stem-width            61069 non-null  float64
 11  stem-color            61069 non-null  object 
 12  has-ring              61069 non-null  object 
 13  ring-type             58598 non-null  object 
 14  habitat               61069 non-null  object 
 15  season             

In [16]:
impu=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[['cap-surface','gill-attachment','gill-attachment','gill-spacing','ring-type']]=impu.fit_transform(df[['cap-surface','gill-attachment','gill-attachment','gill-spacing','ring-type']])

In [17]:
df.isna().sum()

class                   0
cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
season                  0
dtype: int64

In [18]:
import sweetviz as sv
analyze_report = sv.analyze(df)
analyze_report.show_html('report.html', open_browser=True)

                                             |                                             | [  0%]   00:00 ->…

Report report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [19]:
# from dataprep.eda import create_report
# create_report(df)

In [20]:
df[['stem-height','stem-width']].describe()

Unnamed: 0,stem-height,stem-width
count,61069.0,61069.0
mean,6.581538,12.14941
std,3.370017,10.035955
min,0.0,0.0
25%,4.64,5.21
50%,5.95,10.19
75%,7.74,16.57
max,33.92,103.91


In [21]:
df= df[df['stem-height'] != 0]

In [22]:
df[['stem-height','stem-width']].describe()

Unnamed: 0,stem-height,stem-width
count,60010.0,60010.0
mean,6.697683,12.363811
std,3.283218,9.992346
min,0.97,0.52
25%,4.71,5.46
50%,6.0,10.39
75%,7.79,16.69
max,33.92,103.91


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60010 entries, 0 to 61068
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   class                 60010 non-null  object 
 1   cap-diameter          60010 non-null  float64
 2   cap-shape             60010 non-null  object 
 3   cap-surface           60010 non-null  object 
 4   cap-color             60010 non-null  object 
 5   does-bruise-or-bleed  60010 non-null  object 
 6   gill-attachment       60010 non-null  object 
 7   gill-spacing          60010 non-null  object 
 8   gill-color            60010 non-null  object 
 9   stem-height           60010 non-null  float64
 10  stem-width            60010 non-null  float64
 11  stem-color            60010 non-null  object 
 12  has-ring              60010 non-null  object 
 13  ring-type             60010 non-null  object 
 14  habitat               60010 non-null  object 
 15  season             

In [24]:
not_numeric=[]
for label,content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        not_numeric.append(label)
print(not_numeric)

['class', 'cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring', 'ring-type', 'habitat', 'season']


In [25]:
for i in not_numeric:
    print(i)
    print(df[i].unique())

class
['p' 'e']
cap-shape
['x' 'f' 'p' 'b' 'c' 's' 'o']
cap-surface
['g' 'h' 't' 'y' 'e' 's' 'l' 'd' 'w' 'i' 'k']
cap-color
['o' 'e' 'n' 'g' 'r' 'w' 'y' 'p' 'u' 'b' 'l' 'k']
does-bruise-or-bleed
['f' 't']
gill-attachment
['e' 'a' 'd' 's' 'x' 'p' 'f']
gill-spacing
['c' 'd' 'f']
gill-color
['w' 'n' 'p' 'u' 'b' 'g' 'y' 'r' 'e' 'o' 'k' 'f']
stem-color
['w' 'y' 'n' 'u' 'b' 'l' 'r' 'p' 'e' 'k' 'g' 'o']
has-ring
['t' 'f']
ring-type
['g' 'p' 'e' 'l' 'f' 'm' 'r' 'z']
habitat
['d' 'm' 'g' 'h' 'l' 'p' 'w' 'u']
season
['w' 'u' 'a' 's']


In [26]:
df['class'].unique()

array(['p', 'e'], dtype=object)

In [27]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()

In [28]:
df['class']=label_encoder.fit_transform(df['class'])
df['class'].unique()

array([1, 0])

In [29]:
cap_shape=pd.get_dummies(df['cap-shape'],drop_first=True)
df=pd.concat([df,cap_shape],axis=1)
df.drop('cap-shape',axis=1,inplace=True)

In [30]:
cap_surface=pd.get_dummies(df['cap-surface'],drop_first=True)
df=pd.concat([df,cap_surface],axis=1)
df.drop('cap-surface',axis=1,inplace=True)

In [31]:
cap_color=pd.get_dummies(df['cap-color'],drop_first=True)
df=pd.concat([df,cap_color],axis=1)
df.drop('cap-color',axis=1,inplace=True)

In [32]:
df['does-bruise-or-bleed']=label_encoder.fit_transform(df['does-bruise-or-bleed'])
df['does-bruise-or-bleed'].unique()

array([0, 1])

In [33]:
ga=pd.get_dummies(df['gill-attachment'],drop_first=True)
df=pd.concat([df,ga],axis=1)
df.drop('gill-attachment',axis=1,inplace=True)

In [34]:
gs=pd.get_dummies(df['gill-spacing'],drop_first=True)
df=pd.concat([df,gs],axis=1)
df.drop('gill-spacing',axis=1,inplace=True)

In [35]:
gc=pd.get_dummies(df['gill-color'],drop_first=True)
df=pd.concat([df,gc],axis=1)
df.drop('gill-color',axis=1,inplace=True)

In [36]:
sc=pd.get_dummies(df['stem-color'],drop_first=True)
df=pd.concat([df,sc],axis=1)
df.drop('stem-color',axis=1,inplace=True)

In [37]:
rt=pd.get_dummies(df['ring-type'],drop_first=True)
df=pd.concat([df,rt],axis=1)
df.drop('ring-type',axis=1,inplace=True)

In [38]:
hab=pd.get_dummies(df['habitat'],drop_first=True)
df=pd.concat([df,hab],axis=1)
df.drop('habitat',axis=1,inplace=True)

In [39]:
sn=pd.get_dummies(df['season'],drop_first=True)
df=pd.concat([df,sn],axis=1)
df.drop('season',axis=1,inplace=True)

In [40]:
df['has-ring']=label_encoder.fit_transform(df['has-ring'])
df['has-ring'].unique()

array([1, 0])

In [41]:
df.head()

Unnamed: 0,class,cap-diameter,does-bruise-or-bleed,stem-height,stem-width,has-ring,c,f,o,p,s,x,e,g,h,i,k,l,s.1,t,w,y,e.1,g.1,k.1,l.1,n,o.1,p.1,r,u,w.1,y.1,d,e.2,f.1,p.2,s.2,x.1,d.1,f.2,e.3,f.3,g.2,k.2,n.1,o.2,p.3,r.1,u.1,w.2,y.2,e.4,g.3,k.3,l.2,n.2,o.3,p.4,r.2,u.2,w.3,y.3,f.4,g.4,l.3,m,p.5,r.3,z,g.5,h.1,l.4,m.1,p.6,u.3,w.4,s.3,u.4,w.5
0,1,15.26,0,16.95,17.09,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,16.6,0,17.99,18.19,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,14.07,0,17.8,17.74,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,14.17,0,15.77,15.98,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
4,1,14.64,0,16.53,17.2,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1


## Splitting of dataset

In [42]:
X=df.drop('class',axis=1)
y=df['class']

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Standardization

In [45]:
from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()

In [46]:
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

## Model Preparation

In [47]:
from lazypredict.Supervised import LazyClassifier

In [48]:
clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)