In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_data = pd.read_csv("mushrooms.csv")

In [3]:
raw_data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


So the target is to classify each mushroom as edible (e) or not (p), given a lot of data.

In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

This will be a simple data set, as all data is present and all of it is categorical, which means a lot of One-Hot-Encoding.

Because of this, the pipeline is trivial and consists of just one transformer.

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
transformer = OneHotEncoder()

In [9]:
from sklearn.model_selection import train_test_split

In [12]:
data_X, data_y = raw_data.drop(["class"], axis=1), raw_data["class"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.3)

In [14]:
transformer.fit(X_train)

OneHotEncoder()

A nice classifier to use is a Random Forest Classifier, since it can provide feature importance.

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [18]:
X_train_ready = transformer.transform(X_train)

In [21]:
rfr_clf = RandomForestClassifier()

score = cross_val_score(rfr_clf, X_train_ready, y_train, cv=3, scoring="accuracy")

In [22]:
score

array([1., 1., 1.])

As can be seen, with a simple random forest regressor, 100% accuracy is acquired, without any further preprocessing.

In [23]:
rfr_clf.fit(X_train_ready, y_train)

RandomForestClassifier()

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
X_test_ready = transformer.transform(X_test)

In [26]:
pred = rfr_clf.predict(X_test_ready)

In [27]:
accuracy_score(pred, y_test)

1.0

And the test score is 100% as well. For such a small dataset it might be nice to explore the importance.

In [33]:
basic_names = transformer.get_feature_names(input_features=X_train.columns)
importances = rfr_clf.feature_importances_

In [35]:
basic_names[:10]

array(['cap-shape_b', 'cap-shape_c', 'cap-shape_f', 'cap-shape_k',
       'cap-shape_s', 'cap-shape_x', 'cap-surface_f', 'cap-surface_g',
       'cap-surface_s', 'cap-surface_y'], dtype=object)

The next step is to strip the last "_" and all that follows to get importances.

In [36]:
combined_names = [name[:name.rfind("_")] for name in basic_names]

In [37]:
combined_names[:10]

['cap-shape',
 'cap-shape',
 'cap-shape',
 'cap-shape',
 'cap-shape',
 'cap-shape',
 'cap-surface',
 'cap-surface',
 'cap-surface',
 'cap-surface']

Now feature importances can be easily averaged.

In [46]:
features_list = {}

for feature, importance in zip(combined_names, importances):
    features_list[feature] = features_list.get(feature, []) + [abs(importance)]

In [47]:
features_importance = [(np.mean(l), feature) for feature, l in features_list.items()]
features_importance.sort(reverse=True)

In [48]:
features_importance

[(0.06115885890384734, 'gill-size'),
 (0.03130478317409953, 'bruises'),
 (0.028772944459848453, 'odor'),
 (0.015770550233577676, 'gill-spacing'),
 (0.014372973799174182, 'ring-type'),
 (0.014092078804447947, 'stalk-surface-below-ring'),
 (0.010724533948026832, 'stalk-shape'),
 (0.010106797751496258, 'spore-print-color'),
 (0.009378297139290967, 'stalk-surface-above-ring'),
 (0.008765260420285267, 'stalk-root'),
 (0.007003116109508271, 'population'),
 (0.004649273065927314, 'habitat'),
 (0.004568711588998625, 'gill-color'),
 (0.004238599929478735, 'ring-number'),
 (0.0024479356992087907, 'cap-surface'),
 (0.0017241277737486487, 'stalk-color-above-ring'),
 (0.0015438994825858494, 'cap-color'),
 (0.0013472172967175988, 'stalk-color-below-ring'),
 (0.0011298815025537353, 'gill-attachment'),
 (0.0007548847130523193, 'cap-shape'),
 (0.00020658279781753358, 'veil-color'),
 (0.0, 'veil-type')]

It is nice to know now, that one of the best predictors of "edibility" is grill size, odor and if the mushroom has bruises. On the other hand, colors seem to have not much effect on edibility.