In [None]:
# Quick and easy overview testing a couple of ML supervised learning models
# as classifiers for the Mushroom dataset.

# The dataset itself consists of multiple categorical features so guess what,
# probably a tree will work wonders (don't need fancy deeplearning here)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bokeh.plotting import output_notebook, figure, show
from bokeh.models import HoverTool, ColumnDataSource
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Configure Bokeh to display its plots inside the notebook as well.
output_notebook()

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

In [None]:
# Load the mushroom CSV from the Kaggle input directory into a DataFrame.
mushrooms_csv = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(mushrooms_csv)

In [None]:
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
# (number of rows, number of columns) of the raw data
df.shape

In [None]:
# Peek at the first five rows to see how the raw categories are coded
df.head()

In [None]:
# Per-column summary; for categorical (object) columns pandas reports
# count / unique / top / freq rather than numeric statistics
df.describe()

Let's find out how many mushrooms are edible versus poisonous.

In [None]:
# Number of rows per value of the target column `class`
df.groupby('class').size()

In [None]:
# Count of missing (NaN) values per column.
# NOTE(review): this only catches real NaN values; a placeholder category code
# used for "unknown" would not be counted here — confirm against the data dictionary.
df.isnull().sum()

In [None]:
# One-hot encode every categorical column, the target `class` included.
# LabelEncoder is deliberately not used: integer codes would impose an
# artificial ordering, whereas all categories should be treated equally.
# drop_first=True drops one dummy per original column, so `class` collapses
# into the single indicator column `class_p`.
#
# A single get_dummies call encodes all columns in their original order, so
# the result matches encoding them one by one (`class_p` ends up as column 0).
# Unlike the per-column loop it is also safe to re-run: already-encoded
# columns are no longer object dtype and are left untouched.
df = pd.get_dummies(df, drop_first=True)

df.head()

We know that `class` is the target y (the value to predict). After one-hot encoding it is the single column `class_p` in position 0, and all other columns are features.

In [None]:
# Separate features from the label by column name rather than by hard-coded
# position, so no feature is silently dropped if the encoded width changes.
X = df.drop(columns='class_p')  # all rows, every encoded feature column
y = df['class_p']               # all rows, label column only

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

Let's see if there are highly correlated features to reduce dimensionality

In [None]:
# Absolute pairwise correlations between all encoded columns (target included).
corr_matrix = df.corr().abs()
# Rank every column by its correlation with the target; the top 11 entries
# are shown because the first one is `class_p` correlated with itself.
target_corr = corr_matrix['class_p'].sort_values(ascending=False)
print(target_corr.head(11))

See https://www.slideshare.net/rayborg/mushroom-tutorial for mushroom explanation.

The following 10 features explain much of whether a mushroom is edible or not:

* odor_n                        0.785557
* odor_f                        0.623842
* stalk-surface-above-ring_k    0.587658
* stalk-surface-below-ring_k    0.573524
* ring-type_p                   0.540469
* gill-size_n                   0.540024
* bruises_t                     0.501530
* stalk-surface-above-ring_s    0.491314
* spore-print-color_h           0.490229
* ring-type_l                   0.451619

In particular, whether a mushroom has no smell or a foul smell goes a long way towards explaining whether it is edible or poisonous.

With these 10 features we will test how good a model we can create.

Let's check the opposite - which features explain the least:

In [None]:
# The 11 columns with the smallest absolute correlation to the target.
weakest_corr = corr_matrix['class_p'].sort_values()  # ascending is the default
print(weakest_corr.head(11))

Interesting - if a mushroom has a green or pink cap color it says very little about edibility.

In [None]:
# First run every candidate model on the full training data to find the one
# that works best.
models = [
    # liblinear is inherently one-vs-rest and the target is binary, so the
    # deprecated multi_class='ovr' argument is not needed (and warns or fails
    # on recent scikit-learn releases).
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# The fold definition is identical for every model, so build it once.
# With an integer random_state each split() call reproduces the same folds,
# so all models are scored on exactly the same partitions.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# Evaluate each model in turn with 10-fold cross-validated accuracy.
results = []        # per-fold accuracy arrays, one per model
resultsmean = []    # mean accuracy per model
resultsstddev = []  # standard deviation of the accuracy per model
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
    resultsmean.append(cv_results.mean())
    resultsstddev.append(cv_results.std())


In [None]:
# Tabulate the cross-validation scores, best mean accuracy first.
resultsDf = (
    pd.DataFrame({'name': names, 'mean': resultsmean, 'std dev': resultsstddev})
    .sort_values(by='mean', ascending=False)
)
print(resultsDf)

In [None]:
# Fit the CART model on the training split and predict the held-out
# validation rows.
# random_state is fixed (same seed as the split and the CV folds) because the
# tree learner permutes features when searching for splits; without a seed the
# fitted tree and its feature importances can differ between runs.
model1 = DecisionTreeClassifier(random_state=1)
model1.fit(X_train, Y_train)
predictions = model1.predict(X_validation)

In [None]:
# Report accuracy, the confusion matrix and the per-class report, in that
# order, for the validation predictions.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

In [None]:
# Horizontal bar chart of the ten largest feature importances of the fitted tree.
feat_importances = pd.Series(model1.feature_importances_, index=X.columns)
top_importances = feat_importances.nlargest(10)
top_importances.plot.barh()
plt.show()

These are the features that are the most important in this model. Edible mushrooms tend to be those that have no odour, in combination with a club-shaped root.

* One could store this model using joblib for later use... https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/ and perhaps transfer to CoreML https://stackoverflow.com/questions/45291093/scikit-learn-convert-multi-output-decision-tree-to-coreml-model#45519253 in order to create a nifty little iOS app?

See also https://developer.apple.com/documentation/coreml/converting_trained_models_to_core_ml

But... 23 questions to an end user with 96 total answers (on avg. 4 options per question) is way too much. Let's see if we can reduce the problem...

In [None]:
# a desire to understand if we can compress feature dimensionality
# using PCA
from sklearn.decomposition import PCA

In [None]:
# PCA: fit on the training features and project them onto the first two
# principal components. `pc` holds one row per training row, in X_train order.
model3 = PCA(n_components=2)
pc = model3.fit_transform(X_train)

In [None]:
# Number of principal components kept by the fitted PCA. The fitted object
# is `model3`; no variable called `pca` exists in this notebook.
len(model3.components_)

29 of the 96 answers are important enough that they explain most of the edibility.

In [None]:
# Wrap the projected training rows in a DataFrame with readable column names.
pc_columns = ['principal component 1', 'principal component 2']
principalDf = pd.DataFrame(pc, columns=pc_columns)

In [None]:
# Attach the class label to each projected row.
# The components were computed from X_train, so the matching labels are
# Y_train (not the full `df`). concat(axis=1) aligns on the index, and
# principalDf has a fresh 0..n-1 index, so the label index is reset to pair
# the rows positionally; otherwise labels land on the wrong points and extra
# all-NaN rows appear.
finalDf = pd.concat(
    [principalDf, Y_train.reset_index(drop=True).rename('class_p')],
    axis=1,
)

In [None]:
# Heat map of the PCA loadings: one row per principal component, one column
# per input feature. The fitted PCA object is `model3`, and the tick labels
# come from the columns it was fitted on (X_train); labelling with
# `df.columns` would include the target and shift every label by one.
plt.matshow(model3.components_, cmap='viridis')
plt.colorbar()
plt.xticks(range(len(X_train.columns)), X_train.columns, rotation=65, ha='left')
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of the two principal components, one colour per class value.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

targets = [1, 0]
colors = ['r', 'g']
for target, color in zip(targets, colors):
    # rows of finalDf whose class_p equals the current target value
    class_rows = finalDf[finalDf['class_p'] == target]
    ax.scatter(
        class_rows['principal component 1'],
        class_rows['principal component 2'],
        c=color,
        s=50,
    )
ax.legend(targets)
ax.grid()

In [None]:
# Share of the total variance captured by each retained principal component.
variance_ratios = model3.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(variance_ratios))


Univariate feature selection works by selecting the best features based on univariate statistical tests.
We can use sklearn’s SelectKBest to select a number of features to keep. This method uses statistical tests to select features having the highest correlation to the target. Here we will keep the top 10 features.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
# feature extraction
# Selector that scores each feature with f_classif and keeps the 10 best.
k_best = SelectKBest(f_classif, k=10)
# Fit on the training split only; fit() returns the selector itself, so `fit`
# is the fitted selector.
fit = k_best.fit(X_train, Y_train)
# Reduce the training features to the selected columns.
univariate_features = k_best.transform(X_train)

In [None]:
# Pair every feature name with its univariate score in one frame. It is built
# directly with named columns instead of concatenating two one-column frames
# and renaming afterwards; the mid-cell `.head()` call displayed nothing and
# is dropped.
featureScores = pd.DataFrame({'Column': X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(10, 'Score'))  # print 10 best features