In [1]:
# Classification
# Inspired by https://www.kaggle.com/coolman/different-classification-techniques-python
# And adapted to categorical classification on https://www.kaggle.com/uciml/mushroom-classification

# The objective is to classify mushrooms as edible or poisonous
# with the following attributes:
#    cap-shape, cap-surface, cap-color, bruises, odor, gill-attachment, gill-spacing, gill-size, 
#    gill-color, stalk-shape, stalk-root, stalk-surface-above-ring, stalk-surface-below-ring, stalk-color-above-ring, 
#    stalk-color-below-ring, veil-type, veil-color, ring-number, ring-type, spore-print-color, population, habitat.

In [2]:
import numpy as np 
import pandas as pd

# Read the mushroom table from the CSV file in the working directory
mush = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [3]:
# Split the table into target and predictors, then one-hot encode the predictors
y = mush.iloc[:, :1]           # first column only (edible / poisonous), kept as a one-column DataFrame
x = mush.iloc[:, 1:23]         # the remaining columns: the categorical attributes
dummies = pd.get_dummies(x)    # one indicator column per (attribute, category) pair

In [4]:
# Feature Extraction with RFE - Recursive Feature Elimination
# see the reference below for other feature selection options
# -- https://machinelearningmastery.com/feature-selection-machine-learning-python/
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Inputs for feature selection: the one-hot encoded attributes and the flat label vector
attX = dummies
attY = mush.iloc[:,0:1].values.ravel()

# Base estimator whose fitted coefficients RFE uses to rank the features
model = LogisticRegression(solver='lbfgs')
# Keep the 10 best-ranked indicator columns. n_features_to_select is passed by keyword:
# recent scikit-learn releases no longer accept it as a positional argument.
rfe = RFE(model, n_features_to_select=10)
# fit RFE to our data
fit = rfe.fit(attX, attY)
# ranking_ holds 1 for every selected column; higher numbers were eliminated earlier
rank = fit.ranking_

# print number of features selected and the rankings
print("Num Features: %d" % fit.n_features_)
print("Feature Ranking: %s" % rank)

Num Features: 10
Feature Ranking: [ 57  44  97 105  48  88  24  27  80  99  31  33 103 106  58  32  61  60
  34  77  78  89   1   3   1   1  59   1   2  17  18  85  86   7  12   6
   1   8  46 104  96  63  52  90  64  29  73  65  84  39  16  35  10  37
 100  25  30   1  23  94  20  55  38  13  79  56  45  54  81  47  72 102
  28  87  49  43  53  15  42  68  69   1 108  83  82  76  40  67 101  75
  36   5  41  66  98  91  19   4   1  93   1   1  74  92 107   9  26  70
  71  14  50  22  95  21  51  62  11]


In [5]:
# search all values '1' in the ranking array, return their index and display them
def get_indexes(x, xs):
    """Return the list of positions in ``xs`` whose element equals ``x``.

    ``xs`` may be any iterable (list, tuple, 1-D array, generator); positions are
    counted from 0 in iteration order. An empty list is returned when nothing matches.
    Adapted from https://pythonspot.com/array-find/
    """
    return [position for position, candidate in enumerate(xs) if x == candidate]
# Preview the first rows of the indicator columns that RFE ranked 1
dummies.iloc[:, get_indexes(1, rank)].head(n=5)

# The selected categories are:
# odor, gill-size, stalk-surface-above-ring, stalk-color-below-ring, spore-print-color

Unnamed: 0,odor_a,odor_f,odor_l,odor_n,gill-size_n,stalk-surface-above-ring_k,stalk-color-below-ring_y,spore-print-color_n,spore-print-color_r,spore-print-color_u
0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0


In [6]:
# build training and testing(10%) datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Attributes behind the indicator columns that RFE kept
selected_attributes = ['odor', 'gill-size', 'stalk-surface-above-ring',
                       'stalk-color-below-ring', 'spore-print-color']

# define oneHotEncoder object
onehotencoder = OneHotEncoder(handle_unknown='ignore')

# Select the columns from `mush` instead of re-slicing `x`: `x` is rebound to an ndarray
# on the next line, so slicing it by column name would fail if this cell were run twice.
x = onehotencoder.fit_transform(mush[selected_attributes]).toarray()

# Hold out 10% of the rows for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 100)

In [7]:
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def run_model(model, alg_name):
    """Fit ``model`` on the training split, then print its test accuracy and confusion matrix.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place by this call.
    alg_name : str
        Label printed in front of the accuracy figure.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Nothing is returned; results are printed.
    """
    # Train on the flattened label column
    train_labels = y_train.values.ravel()
    model.fit(X_train, train_labels)

    # Predict the held-out rows and report the share of correct predictions in percent
    predictions = model.predict(X_test)
    accuracy = 100 * accuracy_score(y_test, predictions)
    print(alg_name + ". Accuracy: " + str(accuracy))

    # Confusion matrix with the classes fixed in the order edible ('e'), poisonous ('p')
    class_names = ["edible", "poison"]
    counts = confusion_matrix(y_test.iloc[:,0], predictions, labels=['e','p'])
    print(pd.DataFrame(counts, index=class_names, columns=class_names))

### Models

In [8]:
# Decision Tree
from sklearn import tree

# random_state pins the tree's internal randomness so reruns build the same tree
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=100)
run_model(model, "Decision Tree")

Decision Tree. Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


In [9]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state seeds the forest's random sampling so the result is reproducible across runs
model = RandomForestClassifier(n_estimators=10, random_state=100)
run_model(model, "Random Forest")

Random Forest. Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


In [10]:
# xgboost
# 'pip install xgboost'

from xgboost import XGBClassifier

# Gradient-boosted trees with the library defaults. `model` stays a notebook-level name
# because classify() reads whichever estimator was fitted last.
# NOTE(review): some xgboost releases may reject string class labels ('e'/'p') - confirm with the installed version.
model = XGBClassifier()
run_model(model=model, alg_name="XGBoost")

XGBoost. Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


In [11]:
# SVM Classifier
from sklearn.svm import SVC
# Support vector classifier with gamma='scale'; all other settings are the defaults
model = SVC(gamma='scale')
run_model(model=model, alg_name="SVM Classifier")

SVM Classifier. Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


In [12]:
# Nearest Neighbors
from sklearn import neighbors
# k-nearest-neighbours classifier with the default settings
model = neighbors.KNeighborsClassifier()
run_model(model=model, alg_name="Nearest Neighbors Classifier")

Nearest Neighbors Classifier. Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


In [13]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

# Linear classifier trained by stochastic gradient descent, wrapped one-vs-rest.
# random_state fixes the training randomness so the reported accuracy is repeatable.
# NOTE(review): tol=100 looks far larger than any plausible per-epoch loss change, so the
# tolerance-based stopping rule is probably not doing meaningful work - confirm this is intended.
model = OneVsRestClassifier(SGDClassifier(max_iter=10, tol=100, random_state=100))
run_model(model, "SGD Classifier")

SGD Classifier. Accuracy: 99.0159901599016
        edible  poison
edible     419       8
poison       0     386


In [14]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with the default settings, fitted on the one-hot encoded training split
model = GaussianNB()
run_model(model=model, alg_name="Gaussian Naive Bayes")

Gaussian Naive Bayes. Accuracy: 98.4009840098401
        edible  poison
edible     415      12
poison       1     385


In [15]:
# Neural network - Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier

# random_state fixes the network's random initialisation so reruns give the same result
model = MLPClassifier(random_state=100)
run_model(model, " MLP Neural network ")

 MLP Neural network . Accuracy: 100.0
        edible  poison
edible     427       0
poison       0     386


### Using the Models

This section defines functions that take user input and produce a classification output.
They use the most recently trained model; if all the cells were executed sequentially, that is the MLP neural network.
To use another model, re-run its cell above and then execute the next two cells.

In [16]:
# One-hot encode a single choice within one attribute's list of categories.
# Example for 'Surface' with categories Fibrous, Grooves, Scaly, Smooth:
#    selecting Scaly -> array([0., 0., 1., 0.])
def transform(features, selection):
    """Return a float array of zeros with a single 1 at the position of ``selection``.

    ``features`` is the ordered list of categories for one attribute and ``selection``
    must be one of its entries; otherwise ``features.index`` raises ``ValueError``.
    """
    hot_position = features.index(selection)
    return (np.arange(len(features)) == hot_position).astype(float)

# Receives the selected category codes, one-hot encodes them and prints the
# classification result together with its probability when the model provides one.
def classify(search):
    """Print the edible/poisonous verdict for one mushroom description.

    Parameters
    ----------
    search : sequence of str
        One category code per selected attribute, in this order: odor, gill-size,
        stalk-surface-above-ring, stalk-color-below-ring, spore-print-color.

    Raises
    ------
    ValueError
        If ``search`` does not hold exactly one code per attribute, or if a code is
        not among the categories present in the dataset for its attribute.

    Notes
    -----
    Reads the notebook-level ``mush`` DataFrame and whichever estimator is currently
    bound to ``model``. Output is printed; nothing is returned.
    """
    selected = ['odor', 'gill-size', 'stalk-surface-above-ring',
                'stalk-color-below-ring', 'spore-print-color']
    if len(search) != len(selected):
        raise ValueError("expected %d feature values, got %d" % (len(selected), len(search)))

    # Rebuild the indicator columns to recover, per attribute, its category codes in
    # the order pandas emits them.
    # NOTE(review): presumably this matches the column order OneHotEncoder produced for
    # the training data - confirm.
    dummies = pd.get_dummies(mush[selected])

    # Column names look like '<attribute>_<code>'; split on the last underscore only so
    # an underscore inside an attribute name could not break the parsing.
    categories = {name: [] for name in selected}
    for column in dummies.columns:
        name, code = column.rsplit('_', 1)
        categories[name].append(code)

    # Pair each attribute with its chosen code using the explicit `selected` order
    # instead of relying on dictionary iteration order.
    attributes = []
    for name, choice in zip(selected, search):
        attributes.extend(transform(categories[name], choice))

    # The probability is best effort: not every estimator offers predict_proba.
    # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        class_probabilities = model.predict_proba([attributes])
        print("Probability:","{:0.10%}".format(max(class_probabilities[0])))
    except Exception:
        print("this model don't support the probability method")

    # predict and display classification
    z = model.predict([attributes])
    if(z[0] == 'p'): print('This mushroom is poisonous')
    else: print('Go for it!')

# (display name, dataset code) pairs for every category of the five selected attributes.
# The category names were not available in any file, so they are written out by hand.
odor = [
    ('Almond', 'a'),
    ('Anise', 'l'),
    ('Creosote', 'c'),
    ('Fishy', 'y'),
    ('Foul', 'f'),
    ('Musty', 'm'),
    ('None', 'n'),
    ('Pungent', 'p'),
    ('Spicy', 's'),
]
gill_size = [
    ('Broad', 'b'),
    ('Narrow', 'n'),
]
stalk_surface_above_ring = [
    ('Fibrous', 'f'),
    ('Scaly', 'y'),
    ('Silky', 'k'),
    ('Smooth', 's'),
]
stalk_color_below_ring = [
    ('Brown', 'n'),
    ('Buff', 'b'),
    ('Cinnamon', 'c'),
    ('Gray', 'g'),
    ('Orange', 'o'),
    ('Pink', 'p'),
    ('Red', 'e'),
    ('White', 'w'),
    ('Yellow', 'y'),
]
spore_print_color = [
    ('black', 'k'),
    ('Brown', 'n'),
    ('Buff', 'b'),
    ('Chocolate', 'h'),
    ('Green', 'r'),
    ('Orange', 'o'),
    ('Purple', 'u'),
    ('White', 'w'),
    ('Yellow', 'y'),
]

In [17]:
# Simple interface for the model 
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

print('Mushroom Properties:')

# One input widget per selected attribute; the chosen codes are handed to classify()
@interact(
    Odor=odor,
    Gill=gill_size,
    Stalk_Above_Ring=stalk_surface_above_ring,
    Stalk_Bellow_Ring=stalk_color_below_ring,
    Spore_Print_Color=spore_print_color,
)
def f(Odor, Gill, Stalk_Above_Ring, Stalk_Bellow_Ring, Spore_Print_Color):
    """Classify the mushroom described by the current widget selections."""
    classify([Odor, Gill, Stalk_Above_Ring, Stalk_Bellow_Ring, Spore_Print_Color])

Mushroom Properties:


interactive(children=(Dropdown(description='Odor', options=(('Almond', 'a'), ('Anise', 'l'), ('Creosote', 'c')…