In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the read-only input directory and echo the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Context
Although this dataset was originally contributed to the UCI Machine Learning repository nearly 30 years ago, mushroom hunting (otherwise known as "shrooming") is enjoying new peaks in popularity. Learn which features spell certain death and which are most palatable in this dataset of mushroom characteristics. And how certain can your model be?

Content
This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family Mushroom drawn from The Audubon Society Field Guide to North American Mushrooms (1981). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.

Time period: Donated to UCI ML 27 April 1987
Inspiration
What types of machine learning models perform best on this dataset?

Which features are most indicative of a poisonous mushroom?

Acknowledgements
This dataset was originally donated to the UCI Machine Learning repository. You can learn more about past research using the data here.


# Objective:

Our objective is to predict whether a mushroom is edible or poisonous.

# About this file
 Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

# Data Inspection

In [None]:
## import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
## read the mushroom CSV into a DataFrame (the path is relative to the notebook's working directory)
## NOTE(review): the attribute list encodes a missing stalk-root as '?'; no na_values is passed here,
## so those entries presumably stay plain strings rather than NaN -- confirm
mushroom_df = pd.read_csv('../input/mushroom-classification/mushrooms.csv')

In [None]:
mushroom_df.head() ## preview the first rows of the dataset

In [None]:
mushroom_df.shape ## (number of rows, number of columns)

In [None]:
mushroom_df.info() ## column names, non-null counts and dtypes

**Check for null values**

In [None]:
## number of null (NaN) entries in each column
mushroom_df.isnull().sum()

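No null values in the dataset. Note, however, that the attribute list above encodes a missing `stalk-root` as `?`, which `isnull()` does not count as null.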

In [None]:
mushroom_df.describe(include='all') ## summary statistics for every column, categorical ones included

In [None]:
## column labels of the DataFrame
mushroom_df.columns

## all variables of the dataset

In [None]:
## number of variables (columns) in the dataset -- 23 in total
mushroom_df.shape[1]

# EDA

Using pie plot to visualise our classes.

In [None]:
## class frequencies (most frequent class first) -- these become the pie slices
class_counts = mushroom_df["class"].value_counts()

## one figure holding a single axes
fig, ax = plt.subplots(figsize = (18, 6))

## draw the pie: one slice per class, percentage printed inside each slice
ax.pie(
    class_counts.tolist(),
    labels = class_counts.index.tolist(),
    autopct = '%1.1f%%',
    textprops = {'fontsize': 10, 'color' : "white"},
)

## title and a legend placed to the right of the pie
ax.set_title("Pie chart")
ax.legend(loc = "upper left", bbox_to_anchor = (1, 0, 0.5, 1), fontsize = 10, title = "mushroom Class")
plt.show()

Approximately 52% of the total mushrooms are edible

Check every variable with respect to the class.

In [None]:
## one count plot per column, with the bars split by mushroom class
cols = list(mushroom_df.columns)
grid_cols = 5
grid_rows = -(-len(cols) // grid_cols)  # ceiling division, so the grid always has room for every column
plt.figure(figsize=(40,20))

for position, col in enumerate(cols, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    ax = sns.countplot(x=col, hue='class', data=mushroom_df)
    ax.set_xlabel(col, fontsize=20)
plt.tight_layout()
plt.show()


**Comments:**
* There are no grooves type cap-surface mushrooms at all.
* Mushrooms with bruises are mainly edible and mushrooms with no bruises are mainly poisonous.
* Creosote, fishy, foul, musty and spicy odor mushrooms are poisonous.
* Most mushrooms have free gill-attachment, close gill-spacing (mostly poisonous) and narrow gill-size (mostly edible).
* Buff and green gill colored mushrooms are edible.
* Bulbous is the most common stalk root.
* Silky stalk-surface-above-ring mushrooms are mainly poisonous and smooth stalk-surface-above-ring mushrooms are mainly edible.
* Silky stalk-surface-below-ring mushrooms are mainly poisonous and smooth stalk-surface-below-ring mushrooms are mainly edible.
* Orange and red stalk-color-above-ring mushrooms are poisonous.
* Brown and orange veil color mushrooms are poisonous.
* Buff, green, yellow, brown and orange spore print color mushrooms are mainly poisonous.
* Numerous and abundant mushrooms are mainly poisonous.

In [None]:
## check a variable with imbalanced labels: share of rows taken by each 'veil-type' value
veil_type_counts = mushroom_df['veil-type'].value_counts()
veil_type_counts / len(mushroom_df)


 The variable 'veil-type' has only one value (all mushrooms are of the partial type), so it carries no information and is removed from the dataframe.

In [None]:
## hence remove this variable
## Re-binding instead of inplace=True, together with errors='ignore', keeps this cell
## re-runnable: a second execution finds the column already gone and is a no-op
## instead of raising KeyError.
mushroom_df = mushroom_df.drop(columns='veil-type', errors='ignore')

In [None]:
mushroom_df.shape ## shape after dropping 'veil-type'

# Data Preprocessing

In [None]:
## import libraries for data preproccessing
import sklearn 
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
import sklearn
from sklearn.model_selection import train_test_split


In [None]:
df_train,df_test = train_test_split(mushroom_df,train_size=0.7,random_state=5) ## 70% train / 30% test split, fixed seed for reproducibility

In [None]:
## x and y split of train data
## Selecting / dropping the 'class' column (rather than pop) leaves df_train unchanged,
## so this cell can be re-run without a KeyError on the already-removed column.
y_train = df_train['class']
X_train = df_train.drop(columns='class')

In [None]:
## x and y split of test data
## Selecting / dropping the 'class' column (rather than pop) leaves df_test unchanged,
## so this cell can be re-run without a KeyError on the already-removed column.
y_test = df_test['class']
X_test = df_test.drop(columns='class')

We use one-hot encoding: in this strategy, each category value is converted into a new column that is assigned a 1 or 0 (true/false) value.

In [None]:
## one-hot encode every predictor column; the encoder is fitted on the training data
predictor_columns = X_train.columns.tolist()
encoder = ce.OneHotEncoder(cols=predictor_columns)
X_train = encoder.fit_transform(X_train)


In [None]:
X_train.head() ## preview the encoded training predictors

In [None]:
X_test = encoder.transform(X_test) ## apply the encoder fitted on the training data to the test predictors

In [None]:
X_test.head() ## preview the encoded test predictors

In [None]:
## convert the target into 0 (edible, 'e') and 1 (anything else, i.e. poisonous).
## 0 is treated as "edible" too, so re-running this cell on the already encoded
## series leaves it unchanged instead of turning every label into 1.
y_train = (~y_train.isin(['e', 0])).astype('int64')

In [None]:
## convert the target into 0 (edible, 'e') and 1 (anything else, i.e. poisonous).
## 0 is treated as "edible" too, so re-running this cell on the already encoded
## series leaves it unchanged instead of turning every label into 1.
y_test = (~y_test.isin(['e', 0])).astype('int64')

**Remove Constant Features**
* Constant features are those that show a single value for all the observations of the dataset, i.e. the same value in every row. These features provide no information that allows a machine learning model to discriminate or predict a target.
* Variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features whose variance doesn't meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.


In [None]:
from sklearn.feature_selection import VarianceThreshold

## fit a selector that flags zero-variance (constant) features of the training data
sel = VarianceThreshold(threshold=0).fit(X_train)
sel


In [None]:
# the support mask is True for every feature that is kept, so its sum is the number of non-constant features
sel.get_support().sum()

There is no constant variable, so we raise the variance threshold a bit to also remove near-constant features.

In [None]:
## keep only features whose training-set variance exceeds 0.1 (an absolute variance, not a percentage).
## For a 0/1 column with mean p the variance is p*(1-p), so this drops indicators that are 1 in
## fewer than ~11% or more than ~89% of the rows (assumes the encoded columns are 0/1 -- confirm).
sel1 = VarianceThreshold(threshold=0.1)
sel1.fit(X_train)  # fit computes the variance of each feature on the training data


In [None]:
# summing the boolean support mask gives the number of features kept by sel1 (variance above 0.1)
sum(sel1.get_support())

In [None]:
## keep only the training columns that passed the 0.1 variance threshold
kept_train_columns = X_train.columns[sel1.get_support()]
X_train = X_train[kept_train_columns]

In [None]:
## apply the same mask to the test matrix (relies on X_test having the same column order as X_train)
kept_test_columns = X_test.columns[sel1.get_support()]
X_test = X_test[kept_test_columns]

# Model Building

In [None]:
from sklearn.ensemble import RandomForestClassifier ## import libraries for randomforest

Created a helper function to evaluate models.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
def evaluate_model(rf):
    """Print the confusion matrix and accuracy of a fitted classifier on both splits.

    Parameters
    ----------
    rf : fitted estimator exposing ``predict``
        It is scored against the notebook-level ``X_train``/``y_train`` and
        ``X_test``/``y_test`` variables, which are looked up at call time.

    Returns
    -------
    None
        The report is only printed.
    """
    # predict each split once and reuse the result for both metrics
    train_pred = rf.predict(X_train)
    test_pred = rf.predict(X_test)

    print("confusion matrix for training set: ",confusion_matrix(y_train,train_pred))
    print("accuracy score of training set: ",accuracy_score(y_train,train_pred))
    print("--"*50)
    print("confusion matrix for test set: ",confusion_matrix(y_test,test_pred))
    print("accuracy score of test set: ",accuracy_score(y_test,test_pred))
    print("**"*50)


In [None]:
## first model: random forest with default hyper-parameters and a fixed seed
rfc = RandomForestClassifier(random_state=50).fit(X_train, y_train)
rfc

In [None]:
evaluate_model(rfc) ## print confusion matrices and accuracy for the first model

# Check Feature Importance

Create one dataframe with feature score to identify least important feature.

In [None]:
## pair every training column with the importance the first forest assigned to it
feature_score = pd.DataFrame(
    {
        'features': X_train.columns,
        'feature score': rfc.feature_importances_,
    }
)

In [None]:
feature_score.sort_values(by='feature score',ascending=False).head(10) ## the 10 most important features

In [None]:
feature_score.sort_values(by='feature score',ascending=False).tail(10) ## the 10 least important features

Least important feature is 'stalk-color-below-ring_3' . Hence removing it. 

In [None]:
## remove the least important feature from train.
## Re-binding (instead of inplace=True) avoids mutating a frame that was itself selected
## from another frame, and errors='ignore' keeps the cell re-runnable once the column is gone.
X_train = X_train.drop(columns='stalk-color-below-ring_3', errors='ignore')

In [None]:
## remove the same feature from test.
## Re-binding (instead of inplace=True) avoids mutating a frame that was itself selected
## from another frame, and errors='ignore' keeps the cell re-runnable once the column is gone.
X_test = X_test.drop(columns='stalk-color-below-ring_3', errors='ignore')

In [None]:
## second model, trained on the reduced feature set
## NOTE(review): the seed (10) differs from the first model's (50), so any change in the
## scores may come from the seed as well as from the removed feature
rfc1 = RandomForestClassifier(random_state=10).fit(X_train, y_train)
rfc1

In [None]:
evaluate_model(rfc1) ## print confusion matrices and accuracy for the second model

Accuracy did not change after removing this one variable.

Again check feature importance.

In [None]:
## pair every remaining training column with the importance the second forest assigned to it
feature_score = pd.DataFrame(
    {
        'features': X_train.columns,
        'feature score': rfc1.feature_importances_,
    }
)

In [None]:
feature_score.sort_values(by='feature score',ascending=False).tail(10) ## the 10 least important features

In [None]:
## checking by removing cap-color_6 from train.
## Re-binding (instead of inplace=True) avoids mutating a frame that was itself selected
## from another frame, and errors='ignore' keeps the cell re-runnable once the column is gone.
X_train = X_train.drop(columns='cap-color_6', errors='ignore')

In [None]:
## remove cap-color_6 from test as well.
## Re-binding (instead of inplace=True) avoids mutating a frame that was itself selected
## from another frame, and errors='ignore' keeps the cell re-runnable once the column is gone.
X_test = X_test.drop(columns='cap-color_6', errors='ignore')

In [None]:
## third model, trained after removing one more feature
## NOTE(review): the seed (20) differs from the earlier models' seeds, so any change in the
## scores may come from the seed as well as from the removed feature
rfc2 = RandomForestClassifier(random_state=20).fit(X_train, y_train)
rfc2

In [None]:
evaluate_model(rfc2) ## print confusion matrices and accuracy for the third model (accuracy score still not changed)

In [None]:
## number of features left in the final training matrix
X_train.shape[1]

In [None]:
from sklearn import tree

## draw the first estimator of the final forest on a large canvas
first_tree = rfc2.estimators_[0]
fig, ax = plt.subplots(figsize=(30,15))
tree.plot_tree(first_tree, filled=True, ax=ax)
plt.show()

## plot one decision tree of the random forest

**Conclusion**

We achieved 100% accuracy on both the training and test sets, so there is no need for hyperparameter tuning; we could, however, check whether more features can be removed from the dataset.