In [1]:
# Classification
# Inspired by https://www.kaggle.com/coolman/different-classification-techniques-python
# and adapted to categorical classification on https://www.kaggle.com/uciml/mushroom-classification

# The objective is to classify mushrooms between edible and poisonous
# with the following attributes:
#    cap-shape, cap-surface, cap-color, bruises, odor, gill-attachment, gill-spacing, gill-size, 
#    gill-color, stalk-shape, stalk-root, stalk-surface-above-ring, stalk-surface-below-ring, stalk-color-above-ring, 
#    stalk-color-below-ring, veil-type, veil-color, ring-number, ring-type, spore-print-color, population, habitat.

In [2]:
import numpy as np 
import pandas as pd

# Load dataset
# Reads "mushrooms.csv" from the notebook's working directory into a DataFrame
# (presumably the Kaggle mushroom-classification file linked in the first cell).
mush = pd.read_csv("mushrooms.csv")

In [3]:
# prepare data
# Features: only the six attribute columns right after the label (positions 1-6);
# the remaining attribute columns are not used by this notebook.
x = mush.iloc[:, 1:7]
# Target: the first column, kept as a one-column DataFrame ('e' edible / 'p' poisonous).
y = mush.iloc[:, [0]]
# Dummy-encoded view of the features; classify() parses its column names
# to recover the category codes of each feature.
dummies = pd.get_dummies(x)

In [4]:
# build training and testing(10%) datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical features. The encoded matrix gets its own name
# so that `x` keeps holding the raw feature frame: re-running this cell then
# encodes the same input again instead of encoding already-encoded data.
onehotencoder = OneHotEncoder(handle_unknown='ignore')
x_encoded = onehotencoder.fit_transform(x).toarray()
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x_encoded, y, test_size = 0.1, random_state = 100)

In [5]:
import pandas as pd
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def run_model(model, alg_name):
    """Fit `model` on the training split and print its test accuracy and confusion matrix.

    Reads the module-level X_train, X_test, y_train and y_test. The estimator
    is fitted in place and nothing is returned.

    model    -- estimator exposing fit(X, y) and predict(X)
    alg_name -- label printed in front of the accuracy
    """
    # Train on the flattened label column, then predict the held-out rows
    train_labels = y_train.values.ravel()
    model.fit(X_train, train_labels)
    y_pred = model.predict(X_test)

    # Accuracy expressed as a percentage
    accuracy = 100 * accuracy_score(y_test, y_pred)
    print(alg_name + ". Accuracy: " + str(accuracy))

    # Confusion matrix with readable labels: rows are the actual classes and
    # columns the predicted ones (scikit-learn's convention)
    class_names = ["edible", "poison"]
    counts = confusion_matrix(y_test.iloc[:, 0], y_pred, labels=['e', 'p'])
    cm = pd.DataFrame(counts, index=class_names, columns=class_names)
    print(cm)

### Models

In [6]:
# Decision Tree
# Entropy-based splits with the tree depth capped at 7.
# NOTE(review): no random_state is passed, so the printed result may not
# reproduce exactly between runs - confirm and consider seeding.
from sklearn import tree

model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=7)
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "Decision Tree")

Decision Tree. Accuracy: 99.3849938499385
        edible  poison
edible     427       0
poison       5     381


In [7]:
# Random Forest
# Ensemble of 10 trees.
# NOTE(review): no random_state is passed, so the printed result may not
# reproduce exactly between runs - confirm and consider seeding.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10)
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "Random Forest")

Random Forest. Accuracy: 99.50799507995079
        edible  poison
edible     427       0
poison       4     382


In [8]:
# xgboost
# Requires the third-party xgboost package: 'pip install xgboost'

from xgboost import XGBClassifier

# Library defaults are used for every hyperparameter.
model = XGBClassifier()
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "XGBoost")

XGBoost. Accuracy: 99.3849938499385
        edible  poison
edible     427       0
poison       5     381


In [9]:
# SVM Classifier
# NOTE(review): the SVC is created without probability=True, so classify()
# will presumably fall back to its "no probability" message for this model - confirm.
from sklearn.svm import SVC
model = SVC(gamma = 'scale')
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "SVM Classifier")

SVM Classifier. Accuracy: 99.50799507995079
        edible  poison
edible     427       0
poison       4     382


In [10]:
# Nearest Neighbors
# Library defaults are used for every hyperparameter (including the number of neighbours).
from sklearn import neighbors
model = neighbors.KNeighborsClassifier()
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "Nearest Neighbors Classifier")

Nearest Neighbors Classifier. Accuracy: 99.3849938499385
        edible  poison
edible     427       0
poison       5     381


In [11]:
# SGD Classifier
# NOTE(review): tol=100 is an unusually loose stopping tolerance and max_iter=10
# caps training at 10 passes - presumably chosen to keep the run short/quiet; confirm.
# NOTE(review): the target only has two classes ('e'/'p'), so the OneVsRest
# wrapper looks unnecessary here - confirm before removing.
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(SGDClassifier(max_iter=10, tol=100))
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "SGD Classifier")

SGD Classifier. Accuracy: 99.50799507995079
        edible  poison
edible     427       0
poison       4     382


In [12]:
# Gaussian Naive Bayes
# NOTE(review): the inputs are one-hot (0/1) columns, while this variant models
# features as Gaussian - presumably kept only as a baseline; confirm.
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
# Fits the model and prints its accuracy and confusion matrix; `model` stays
# bound at module level, where classify() looks it up.
run_model(model, "Gaussian Naive Bayes")

Gaussian Naive Bayes. Accuracy: 98.2779827798278
        edible  poison
edible     427       0
poison      14     372


In [13]:
# Neural network - Multi-layer Perceptron
# Library defaults are used for every hyperparameter.
# NOTE(review): no random_state is passed, so the printed result may not
# reproduce exactly between runs - confirm and consider seeding.
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
# Last model cell: when the notebook is run top to bottom, this is the `model`
# that classify() ends up using.
run_model(model, " MLP Neural network ")

 MLP Neural network . Accuracy: 99.50799507995079
        edible  poison
edible     427       0
poison       4     382


### Using the Models

The next two cells define functions that take a set of mushroom properties as input and print a classification.
They use whichever model was trained last; if all the cells were executed sequentially, that is the MLP neural network.
To use another model, re-run its cell above and then execute the next two cells.

In [14]:
# Function to transform a feature into its respective binary (one-hot) array.
# The arguments are the list of known features for that category and the selected feature.
# For 'Surface' it will be:
#    possible selection: Fibrous, Grooves, Scaly, Smooth -> if selected Scaly -> array[0, 0, 1, 0]
def transform(features, selection):
    """One-hot encode `selection` against the ordered list `features`.

    Parameters
    ----------
    features : list
        Known values of one category, in encoding order.
    selection : object
        The chosen value.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(features)`` holding 1 at the position of
        `selection` and 0 elsewhere. A selection that is not in `features`
        yields an all-zero array, which is how
        ``OneHotEncoder(handle_unknown='ignore')`` encodes unseen categories.
    """
    transformed = np.zeros(len(features))
    # Guard the lookup: list.index would raise ValueError for an unknown value.
    if selection in features:
        transformed[features.index(selection)] = 1
    return transformed

# Receives a list with the selected feature codes, encodes it and prints the classification
# result together with its probability (when the model can provide one).
def classify(search, estimator=None):
    """Print the predicted class for one mushroom described by `search`.

    Parameters
    ----------
    search : sequence
        One selected category code per feature, in the same order as the
        source columns appear in `dummies`.
    estimator : fitted classifier, optional
        Classifier to query. Defaults to the module-level `model`, i.e. the
        one bound by the most recently executed model cell.

    Prints the highest class probability when the classifier exposes
    `predict_proba`, then prints the verdict. Returns nothing.
    """
    clf = model if estimator is None else estimator

    # Collect the category codes of every source column, keeping the column
    # order of `dummies`. Names look like "<column>_<code>"; splitting on the
    # LAST underscore keeps column names that themselves contain '_' intact.
    features_by_column = {}
    for dummy_name in dummies.columns:
        column, code = dummy_name.rsplit('_', 1)
        features_by_column.setdefault(column, []).append(code)

    # One-hot encode each selection and concatenate everything into one feature row.
    attributes = []
    for idx, codes in enumerate(features_by_column.values()):
        attributes.extend(transform(codes, search[idx]))

    # Show the predicted category probability when the classifier offers it.
    # Only a missing predict_proba is treated as "unsupported"; any other
    # failure is a real error and is allowed to propagate.
    predict_proba = getattr(clf, 'predict_proba', None)
    if predict_proba is None:
        print("this model don't support the probability method")
    else:
        class_probabilities = predict_proba([attributes])
        print("Probability:","{:0.10%}".format(max(class_probabilities[0])))

    # predict and display classification
    z = clf.predict([attributes])
    if z[0] == 'p':
        print('This mushroom is poisonous')
    else:
        print('Go for it!')

# Couldn't find the names of the categories in any file so I had to write them.
# Each list holds (display label, dataset code) pairs, in the order shown in the dropdowns.
shape = list({
    'Bell': 'b', 'Conical': 'c', 'Convex': 'x',
    'Flat': 'f', 'Knobbed': 'k', 'Sunken': 's',
}.items())
surface = list({
    'Fibrous': 'f', 'Grooves': 'g', 'Scaly': 'y', 'Smooth': 's',
}.items())
color = list({
    'Brown': 'n', 'Buff': 'b', 'Cinnamon': 'c', 'Gray': 'g', 'Green': 'r',
    'Pink': 'p', 'Purple': 'u', 'Red': 'e', 'White': 'w', 'Yellow': 'y',
}.items())
bruises = list({'Bruises': 't', 'No Bruises': 'f'}.items())
odor = list({
    'Almond': 'a', 'Anise': 'l', 'Creosote': 'c', 'Fishy': 'y', 'Foul': 'f',
    'Musty': 'm', 'None': 'n', 'Pungent': 'p', 'Spicy': 's',
}.items())
gill_attachment = list({'Attached': 'a', 'Free': 'f'}.items())

In [15]:
# Simple interface for the model 
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

print('Mushroom Properties:')

@interact(Shape=shape, Surface=surface, Color=color, Bruises=bruises, Odor=odor, Gill=gill_attachment)
def f(Shape, Surface, Color, Bruises, Odor, Gill):
    """Classify the mushroom described by the current dropdown selections."""
    # The selections are passed in the order classify() indexes them.
    classify([Shape, Surface, Color, Bruises, Odor, Gill])

Mushroom Properties:


interactive(children=(Dropdown(description='Shape', options=(('Bell', 'b'), ('Conical', 'c'), ('Convex', 'x'),…