The Audubon Society Field Guide to North American Mushrooms provides descriptions of
hypothetical samples representing 23 species of gilled mushrooms from the Agaricus and
Lepiota families (1981). Each species is classified as either definitively edible, definitively
poisonous, or possibly edible but not recommended. For simplicity, the latter category was
combined with the toxic group. The guide emphasizes that there is no straightforward rule for
determining a mushroom's edibility, unlike the familiar "leaflets three, let it be" rule for Poison
Oak and Poison Ivy.
The main goal is to predict which mushroom is poisonous & which is edible.

Approach:
1. Data Exploration,
2. Data Cleaning,
3. Feature Engineering,
4. Model Building
5. Model Testing.
6. Try out different machine learning algorithms that’s best fit for the above case.

In [124]:
import pandas as pd
import numpy as np

In [125]:
df = pd.read_csv(r"mushrooms.csv")

In [126]:
df.head(100)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,e,x,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,n,s,g
96,e,f,y,n,t,l,f,c,b,p,...,y,w,w,p,w,o,p,n,s,g
97,e,x,s,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
98,e,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,n,s,g


In [127]:
df.shape

(8124, 23)

In [128]:
print(list(df.columns))

['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']


In [129]:
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [130]:
df['class'].value_counts()      ## This shows data is imbalanced

class
e    4208
p    3916
Name: count, dtype: int64

In [131]:
df.info()          ## to check the data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [132]:
df.isnull().sum()      ## There are no null values

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [133]:
df = df.dropna()          ## to drop row with null values, but the shape remain same proving there exist no null values.
df.shape

(8124, 23)

## BASIC DATA EXPLORATION

In [134]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [135]:
df['class'] = df['class'].apply(lambda x: 1 if x == 'e' else 0)


In [136]:
df['class'].value_counts()/len(df)

class
1    0.517971
0    0.482029
Name: count, dtype: float64

In [137]:
df.head(100)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,1,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,1,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,0,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,1,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,x,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,n,s,g
96,1,f,y,n,t,l,f,c,b,p,...,y,w,w,p,w,o,p,n,s,g
97,1,x,s,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
98,1,b,s,w,t,a,f,c,b,g,...,s,w,w,p,w,o,p,n,s,g


In [138]:
X = df.drop('class', axis=1)

In [139]:
y = df['class']

## Converting categorical value to Neumerical value through laencoder

In [140]:
from sklearn.preprocessing import LabelEncoder

In [141]:
le = LabelEncoder()

In [142]:
X = X.apply(le.fit_transform)

In [143]:
print(X) ## We can see that the caetgorical features are converted to neumerical values.

      cap-shape  cap-surface  cap-color  bruises  odor  gill-attachment  \
0             5            2          4        1     6                1   
1             5            2          9        1     0                1   
2             0            2          8        1     3                1   
3             5            3          8        1     6                1   
4             5            2          3        0     5                1   
...         ...          ...        ...      ...   ...              ...   
8119          3            2          4        0     5                0   
8120          5            2          4        0     5                0   
8121          2            2          4        0     5                0   
8122          3            3          4        0     8                1   
8123          5            2          4        0     5                0   

      gill-spacing  gill-size  gill-color  stalk-shape  ...  \
0                0          1       

In [144]:
from sklearn.preprocessing import MinMaxScaler

In [145]:
scaler = MinMaxScaler()

In [146]:
X_1 = scaler.fit_transform(X)

In [147]:
df_1 = pd.DataFrame(X_1, columns = X.columns)

In [148]:
df_1.head(10)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1.0,0.666667,0.444444,1.0,0.75,1.0,0.0,1.0,0.363636,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.6,0.833333
1,1.0,0.666667,1.0,1.0,0.0,1.0,0.0,0.0,0.363636,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.375,0.4,0.166667
2,0.0,0.666667,0.888889,1.0,0.375,1.0,0.0,0.0,0.454545,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.375,0.4,0.5
3,1.0,1.0,0.888889,1.0,0.75,1.0,0.0,1.0,0.454545,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.6,0.833333
4,1.0,0.666667,0.333333,0.0,0.625,1.0,1.0,0.0,0.363636,1.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,0.0,0.375,0.0,0.166667
5,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.454545,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.4,0.166667
6,0.0,0.666667,0.888889,1.0,0.0,1.0,0.0,0.0,0.181818,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.4,0.5
7,0.0,1.0,0.888889,1.0,0.375,1.0,0.0,0.0,0.454545,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.375,0.6,0.5
8,1.0,1.0,0.888889,1.0,0.75,1.0,0.0,1.0,0.636364,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.8,0.166667
9,0.0,0.666667,1.0,1.0,0.0,1.0,0.0,0.0,0.181818,0.0,...,0.666667,0.875,0.875,0.0,0.666667,0.5,1.0,0.25,0.6,0.5


In [149]:
df_1.shape

(8124, 22)

## Creating Training & testing Data Sets

In [150]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [151]:
df_1_train, y_train, df_1_test, y_pred = train_test_split(df_1, y, train_size=.70, test_size=.30, random_state=42)

In [152]:
lr = LinearRegression()
lr.fit(df_1_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [5686, 2438]

In [None]:
y_pred = lr.predict(df_1_test)

In [161]:
y_test = df_1_test

In [159]:
from sklearn.metrics import accuracy_score

In [162]:
accuracy_score(y_test, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [5686, 2438]

In [163]:
print("Training Features:\n", df_1_train)
print("Testing Features:\n", df_1_test)
print("Training Labels:\n", y_train)
print("Testing Labels:\n", y_test)

Training Features:
       cap-shape  cap-surface  cap-color  bruises   odor  gill-attachment  \
5921        1.0     0.666667   0.000000      1.0  0.250              1.0   
1073        1.0     0.000000   0.333333      1.0  0.625              1.0   
3710        1.0     0.000000   0.333333      0.0  0.250              1.0   
144         1.0     1.000000   1.000000      1.0  0.000              1.0   
5469        1.0     1.000000   0.444444      0.0  1.000              1.0   
...         ...          ...        ...      ...    ...              ...   
5226        1.0     1.000000   0.444444      0.0  0.250              1.0   
5390        0.6     1.000000   0.222222      1.0  0.625              1.0   
860         0.4     1.000000   0.444444      1.0  0.375              1.0   
7603        0.6     0.666667   0.222222      0.0  0.250              1.0   
7270        0.6     0.000000   0.333333      0.0  0.625              1.0   

      gill-spacing  gill-size  gill-color  stalk-shape  ...  \
5921

In [164]:
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score,roc_auc_score,roc_curve,classification_report