# EDA

In [1]:
# import modules
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus
import numpy as np


plt.style.use('ggplot')
plt.rc('font', size=14)
plt.rc('figure', titlesize=18)
plt.rc('axes', labelsize=15)
plt.rc('axes', titlesize=18)
%matplotlib inline



In [2]:
file = 'data/agaricus-lepiota.data'
df = pd.read_csv(file)

In [3]:
profile = ProfileReport(df, title='Mushroom Profiling Report', html={'style':{'full_width':True}})

HBox(children=(FloatProgress(value=0.0, description='variables', max=23.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='interactions [continuous]', max=1.0, st…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [4]:
profile.to_widgets()

Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…

# Data

In [5]:
# rename columns
df.rename(columns={'p':'classes', 'x':'cap_shape', 's':'cap_surface', 
                  'n':'cap_color', 't':'bruises_?', 'p.1':'odor', 'f':'gill_attachment', 
                  'c':'gill_spacing', 'n.1':'gill_size', 'k':'gill_color', 'e':'stalk_shape', 
                  'e.1':'stalk_root', 's.1':'stalk_surface_above_ring', 's.2':'stalk_surface_below_ring', 
                  'w':'stalk_color_above_ring', 'w.1':'stalk_color_below_ring', 'p.2':'veil_type', 
                  'w.2':'veil_color', 'o':'ring_number', 'p.3':'ring_type', 'k.1':'spore_print_color', 
                  's.3':'population', 'u':'habitat'}, inplace=True)

In [6]:
# Replace the '?'s with NaN
df = df.replace('?', np.nan)

In [7]:
df.isnull().sum()

classes                        0
cap_shape                      0
cap_surface                    0
cap_color                      0
bruises_?                      0
odor                           0
gill_attachment                0
gill_spacing                   0
gill_size                      0
gill_color                     0
stalk_shape                    0
stalk_root                  2480
stalk_surface_above_ring       0
stalk_surface_below_ring       0
stalk_color_above_ring         0
stalk_color_below_ring         0
veil_type                      0
veil_color                     0
ring_number                    0
ring_type                      0
spore_print_color              0
population                     0
habitat                        0
dtype: int64

In [8]:
for col in df:
    if df[col].dtype == 'object':
        df = df.fillna(df[col].value_counts().index[0])
        
print(df.isnull().sum())

classes                     0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises_?                   0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64


In [9]:
df['classes'].value_counts()

e    4208
p    3915
Name: classes, dtype: int64

> Mushrooms are either classified as edible (e) or poisonous (p). The above output outlines the prevalence of mushrooms in the dataset

In [10]:
#split dataset in features and target variable
X = df[df.columns[1:]]
y = df.classes

In [11]:
# encode categorical features and replace target values to binary values
X_dummy = pd.get_dummies(X)
y_dummy = df['classes'].replace(['e', 'p'], [1, 0])

In [12]:
# slit data into train set and test set
# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y_dummy, test_size=0.20, stratify=y_dummy, random_state=42)

print(y_train.shape, X_train.shape)
print(y_test.shape, X_test.shape)

(6498,) (6498, 116)
(1625,) (1625, 116)


# Preprocessing Data

In [13]:
# instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)


# Building Decision Tree Model

In [27]:
clf = DecisionTreeClassifier()
clf = clf.fit(rescaledX_train, y_train)
y_pred = clf.predict(rescaledX_test)
y_pred_train = clf.predict(rescaledX_train)

# Evaluating Model

In [15]:
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  1.0


In [28]:
print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_train))

Accuracy:  1.0


> A classification rate of 100%, unbelievable. I guess there is no need to optimize the model now.

In [32]:
    print("Precision: ", metrics.precision_score(y_test, y_pred))
    print("Recall: ", metrics.recall_score(y_test, y_pred))

Precision:  1.0
Recall:  1.0


Wow.

# Feature Importance

In [30]:
dict = {'feature': X_train.columns, 'feature_importance':clf.feature_importances_}
new_df = pd.DataFrame(dict)
new_df.sort_values('feature_importance',axis=0,ascending=False,inplace=True)

In [31]:
new_df = new_df[new_df["feature_importance"] > 0.05]
new_df

Unnamed: 0,feature,feature_importance
27,odor_n,0.613309
52,stalk_root_c,0.18187
54,stalk_root_r,0.081185


> Factors that predict the status of mushrooms.