# IS 362 – Spencer Gallardo Mushroom Project 4

I'm going to use scikit-learn to determine which of the two predictor columns that I selected (odor and cap color) most accurately predicts whether or not a mushroom is poisonous.


In [47]:
# Display the plots inside this notebook
%matplotlib inline
# import matplotlib for graphics
import matplotlib.pyplot as plt
# import seaborn for graphics
import seaborn as sns
# as you will see, this project is a classification problem. So, 
# we import the KNeighborsClassifier library from scikit-learn.
from sklearn.neighbors import KNeighborsClassifier
# import the metrics module from scikit-learn for testing our models
from sklearn import metrics

import numpy as np
import pandas as pd
# pandas display options controlling how DataFrames are shown in this notebook
pd.options.display.notebook_repr_html = True  # render DataFrames as HTML tables
pd.options.display.max_rows = None            # None: no limit on the number of rows displayed
pd.options.display.max_columns = None         # None: no limit on the number of columns displayed

# seaborn output settings
# 'whitegrid' draws sightlines on a white background; font_scale sets the font size.
sns.set(style='whitegrid', font_scale=4)
# switch the single-letter color codes over to the pastel palette
sns.set_color_codes('pastel')

In [48]:
# Read only the three columns used in this analysis. The file is read with
# header=None, so the columns are picked by position (0, 3, 5) and named here:
# poisonous, cap color and odor.
dfFungi = pd.read_csv(
    "data_src/agaricus-data.csv",
    header=None,
    usecols=[0, 3, 5],
    names=['poisonous', 'color', 'odor'],
)

In [49]:
# Dictionary translating the letter codes of the poisonous column to integers:
# 0 : if it IS NOT poisonous (edible, 'e'), and
# 1 : if it IS poisonous ('p').
dictNewValues = {
    'e': 0,
    'p': 1,
}

# Translate the column explicitly instead of calling replace(..., inplace=True):
# newer pandas versions deprecate the silent object -> int downcast that
# replace() performed here. Values that are not keys of the dictionary pass
# through unchanged, so re-running this cell is harmless, and astype() raises
# if any letter code was left unmapped instead of letting it reach the model.
dfFungi['poisonous'] = (
    dfFungi['poisonous']
    .map(lambda code: dictNewValues.get(code, code))
    .astype('int64')
)

# review the dataframe
dfFungi.head(5)

Unnamed: 0,poisonous,color,odor
0,1,n,p
1,0,y,a
2,0,w,l
3,1,w,p
4,0,g,n


In [50]:
# Similarly to the poisonous column, the cap-color letter codes are translated
# to integers. The same dictionary variable is reused for the mapping:
# brown=n = 0
# buff=b = 1
# cinnamon=c = 2
# gray=g = 3
# green=r = 4
# pink=p = 5
# purple=u = 6
# red=e = 7
# white=w = 8
# yellow=y = 9

dictNewValues = {
    'n': 0,
    'b': 1,
    'c': 2,
    'g': 3,
    'r': 4,
    'p': 5,
    'u': 6,
    'e': 7,
    'w': 8,
    'y': 9,
}

# Translate the column explicitly instead of calling replace(..., inplace=True):
# newer pandas versions deprecate the silent object -> int downcast that
# replace() performed here. Values that are not keys of the dictionary pass
# through unchanged, so re-running this cell is harmless, and astype() raises
# if any letter code was left unmapped instead of letting it reach the model.
dfFungi['color'] = (
    dfFungi['color']
    .map(lambda code: dictNewValues.get(code, code))
    .astype('int64')
)

# review the dataframe
dfFungi.head(5)

Unnamed: 0,poisonous,color,odor
0,1,0,p
1,0,9,a
2,0,8,l
3,1,8,p
4,0,3,n


In [54]:
# Similarly to the other columns, the odor letter codes are translated to
# integers. The same dictionary variable is reused for the mapping:
# almond=a = 0
# anise=l = 1
# creosote=c = 2
# fishy=y = 3
# foul=f = 4
# musty=m = 5
# none=n = 6
# pungent=p = 7
# spicy=s = 8

dictNewValues = {
    'a': 0,
    'l': 1,
    'c': 2,
    'y': 3,
    'f': 4,
    'm': 5,
    'n': 6,
    'p': 7,
    's': 8,
}

# Translate the column explicitly instead of calling replace(..., inplace=True):
# newer pandas versions deprecate the silent object -> int downcast that
# replace() performed here. Values that are not keys of the dictionary pass
# through unchanged, so re-running this cell is harmless, and astype() raises
# if any letter code was left unmapped instead of letting it reach the model.
dfFungi['odor'] = (
    dfFungi['odor']
    .map(lambda code: dictNewValues.get(code, code))
    .astype('int64')
)

# review the dataframe
dfFungi.head(5)

Unnamed: 0,poisonous,color,odor
0,1,0,7
1,0,9,0
2,0,8,1
3,1,8,7
4,0,3,6


In [55]:
# Get the shape of this dataset as (rows, columns); the train/test split that
# follows is expressed in absolute row counts, so the total matters.
dfFungi.shape

(8124, 3)

In [56]:
# Store the first 8000 observations for training and the last 124 for testing.
# Positional slices are end-exclusive, so .iloc[:8000] selects rows 0..7999.
# (The earlier [:7999] slice kept only 7999 rows, leaving row 7999 in neither
# the training nor the testing set.)
# NOTE(review): this is a sequential hold-out, not a shuffled or stratified
# split - confirm the file's row order does not bias the last 124 rows.
TRAINING_ROWS = 8000
splitColumns = ['color', 'odor', 'poisonous']

# TRAINING data: explicit copy so it is independent of dfFungi
dfTraining = dfFungi[splitColumns].iloc[:TRAINING_ROWS].copy()

# TESTING data: explicit copy because prediction columns are added to it later
dfTests = dfFungi[splitColumns].iloc[TRAINING_ROWS:].copy()

# every observation must land in exactly one of the two sets
assert len(dfTraining) + len(dfTests) == len(dfFungi)

# display the data types of the columns
print(dfTraining.dtypes)
print(dfTests.dtypes)

color        int64
odor         int64
poisonous    int64
dtype: object
color        int64
odor         int64
poisonous    int64
dtype: object


In [57]:
# Three estimators are compared in this assignment, each with 5 neighbors:
# knnColor : trained on the COLOR column only
# knnOdor  : trained on the ODOR column only
# knnAll   : trained on both the color and odor columns
knnColor = KNeighborsClassifier(n_neighbors=5)
knnOdor = KNeighborsClassifier(n_neighbors=5)
knnAll = KNeighborsClassifier(n_neighbors=5)

# fit(features, responses) trains a model. The features are passed as a
# two-dimensional array (one column per predictor); the responses are the
# flat array of values from the poisonous column.
# Selecting the single column by name already yields a flat array, so it is
# extracted once and shared by all three fits.
trainingResponses = dfTraining['poisonous'].values

# color-only estimator
knnColor.fit(dfTraining[['color']].values, trainingResponses)

# odor-only estimator
knnOdor.fit(dfTraining[['odor']].values, trainingResponses)

# estimator using both predictors (last expression, so its repr is displayed)
knnAll.fit(dfTraining[['color', 'odor']].values, trainingResponses)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [58]:
# Run the color estimator on the testing dataset and keep its predictions in
# the testing dataframe, aligned on the testing rows' index.
dfTests['preColor'] = pd.Series(
    knnColor.predict(dfTests[['color']].values),
    index=dfTests.index,
)

# print the accuracy score of the color estimator: the fraction of testing
# rows whose prediction matches the poisonous column
print(metrics.accuracy_score(
    y_true=dfTests['poisonous'].values,
    y_pred=dfTests['preColor'].values,
))

0.5403225806451613


In [59]:
# Run the odor estimator on the testing dataset and keep its predictions in
# the testing dataframe, aligned on the testing rows' index.
dfTests['preOdor'] = pd.Series(
    knnOdor.predict(dfTests[['odor']].values),
    index=dfTests.index,
)

# print the accuracy score of the odor estimator: the fraction of testing
# rows whose prediction matches the poisonous column
print(metrics.accuracy_score(
    y_true=dfTests['poisonous'].values,
    y_pred=dfTests['preOdor'].values,
))

1.0


In [60]:
# Run the combined (color + odor) estimator on the testing dataset and keep
# its predictions in the testing dataframe, aligned on the testing rows' index.
dfTests['preAll'] = pd.Series(
    knnAll.predict(dfTests[['color', 'odor']].values),
    index=dfTests.index,
)

# print the accuracy score of the combined estimator: the fraction of testing
# rows whose prediction matches the poisonous column
print(metrics.accuracy_score(
    y_true=dfTests['poisonous'].values,
    y_pred=dfTests['preAll'].values,
))

1.0


# Conclusions using scikit-learn results:
The accuracy scores show that mushroom cap color is not a very effective predictor of edibility, with an accuracy score of only 0.54. On the other hand, mushroom odor appears to be a much better predictor of edibility: it has an accuracy of 1.0. The model that uses both columns also scores 1.0, so adding cap color does not improve on odor alone for this test set.