# Author: Ruslan Brilenkov

# This tutorial is part of my big project 

## The main idea is to teach everybody Python, Data Science, and Machine Learning (ML) despite their educational background.

## Follow [this](https://medium.datadriveninvestor.com/python-tutorial-for-complete-beginners-from-hello-world-to-functions-47ceb8b96555) and [this](https://medium.com/mlearning-ai/machine-learning-for-complete-beginners-introduction-61b3a961b5ae) links to learn more

----

# Let us start with importing libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import neighbors, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Opening the dataset

In [2]:
# Loading dataset
data = pd.read_csv('agaricus-lepiota.data')#, header=None)

# print(data.head()) # first 5 raws

print(data.shape)

# Let us manually add the features/columns names
data.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', \
                'bruises?', 'odor', 'gill-attachment', 
                'gill-spacing' ,'gill-size', 'gill-color', 
                'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 
                'stalk-surface-below-ring', 'stalk-color-above-ring', 
                'stalk-color-below-ring', 'veil-type', 'veil-color', 
                'ring-number', 'ring-type', 'spore-print-color', 
                'population', 'habitat']
data.head()


(8123, 23)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


## Selecting the attrabutes as our labels for this model
### (to reduce the size of the dataset)

## Tip: chose which parameters are the most important ones to determine the label
## For example, let us chose mostly the colors of various mushroom parts (cap, gill, veil, spore-print)

In [3]:
X = data[['cap-shape', 'cap-surface', 'cap-color', 
          'odor', 'gill-color', 'veil-color', 'spore-print-color']].values
print(X)

[['x' 's' 'y' ... 'k' 'w' 'n']
 ['b' 's' 'w' ... 'n' 'w' 'n']
 ['x' 'y' 'w' ... 'n' 'w' 'k']
 ...
 ['f' 's' 'n' ... 'n' 'o' 'b']
 ['k' 'y' 'n' ... 'b' 'w' 'w']
 ['x' 's' 'n' ... 'y' 'o' 'o']]


### Here is an issue:
### our dataset contains names (strings) instead of the numbers (integers/floats) => we cannot fetch strings into our ML algorithm

## Solution: convert strings into numbers


## Features (X) need one type of transformation, called Label Encoder

In [4]:
# defining our encoder
Lbl_Enc_X = LabelEncoder()

for col in range(len(X[0])):
    X[:, col] = Lbl_Enc_X.fit_transform(X[:, col])
    
print(X)



[[5 2 9 ... 4 2 3]
 [0 2 8 ... 5 2 3]
 [5 3 8 ... 5 2 2]
 ...
 [2 2 4 ... 5 1 0]
 [3 3 4 ... 0 2 7]
 [5 2 4 ... 11 1 4]]


## Labels (y) need another type of transformation, called Mapping

In [5]:
y = data[['class']]

# dictionary
Lbl_Map = {
    'e': 0,
    'p':1
          }

# actual convertion
y['class_2'] = y['class'].map(Lbl_Map)

print(y)
print(y['class_2'])


     class  class_2
0        e        0
1        e        0
2        p        1
3        e        0
4        e        0
...    ...      ...
8118     e        0
8119     e        0
8120     e        0
8121     p        1
8122     e        0

[8123 rows x 2 columns]
0       0
1       0
2       1
3       0
4       0
       ..
8118    0
8119    0
8120    0
8121    1
8122    0
Name: class_2, Length: 8123, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [6]:
# print(y[0:5])
# y = np.array(y)
# print(y[0:5])

to learn more about dictionaries, lists, and tuples follow [this link](https://medium.datadriveninvestor.com/python-tutorial-for-complete-beginners-from-hello-world-to-functions-47ceb8b96555)

## Data pre-processing: Train-test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y['class_2'], test_size = 0.2)

# Let us create the model!

In [8]:
# importing our model

from sklearn import svm

In [9]:
# A model is represented as an object
model_svm = svm.SVC()

In [10]:
# But our model is not ready for predictions yet. 
# It needs training!

In [11]:
model_svm.fit(X_train, y_train)

SVC()

In [12]:
print(model_svm)

SVC()


As we see, our model ran, i.e., made a fit based on the training subset

## Let us see an accuracy of our model

First of all, we need to make predictions of our model

In [13]:
predictions = model_svm.predict(X_test)

In [14]:
# accuracy calculation
accuracy = metrics.accuracy_score(y_test, predictions)

print("Predictions = {}".format(predictions[10:15]))
print("Real labels = {}".format(y_test[10:15]))
print("Accuracy is {}".format(accuracy))

Predictions = [1 0 1 1 1]
Real labels = 4474    1
7133    0
5997    1
4819    1
3200    1
Name: class_2, dtype: int64
Accuracy is 0.9643076923076923


# We got ~96% accuracy!
## Our bet on the colors as the most important elements seems to be right! 

To finalize this tutorial, let me add that in our case, we had only 2 classes, namely, edible and poisonous mushrooms. But SVM can work with multiple classes as well.

____

# Let us try different kernels

In [15]:
help(svm)

Help on package sklearn.svm in sklearn:

NAME
    sklearn.svm - The :mod:`sklearn.svm` module includes Support Vector Machine algorithms.

PACKAGE CONTENTS
    _base
    _bounds
    _classes
    _liblinear
    _libsvm
    _libsvm_sparse
    _newrand
    setup
    tests (package)

CLASSES
    sklearn.base.BaseEstimator(builtins.object)
        sklearn.svm._classes.LinearSVC(sklearn.linear_model._base.LinearClassifierMixin, sklearn.linear_model._base.SparseCoefMixin, sklearn.base.BaseEstimator)
    sklearn.base.OutlierMixin(builtins.object)
        sklearn.svm._classes.OneClassSVM(sklearn.base.OutlierMixin, sklearn.svm._base.BaseLibSVM)
    sklearn.base.RegressorMixin(builtins.object)
        sklearn.svm._classes.LinearSVR(sklearn.base.RegressorMixin, sklearn.linear_model._base.LinearModel)
        sklearn.svm._classes.NuSVR(sklearn.base.RegressorMixin, sklearn.svm._base.BaseLibSVM)
        sklearn.svm._classes.SVR(sklearn.base.RegressorMixin, sklearn.svm._base.BaseLibSVM)
    sklearn.li

In [17]:
# Let us iterate over every kernel and compute the accuracy of prediction for comparison

kernels = ('linear', 'poly', 'rbf', 'sigmoid')#, 'precomputed')

for kernel in kernels:
    
    # create the model
    model_svm_kern = svm.SVC(kernel=kernel)
    # fit the data
    model_svm_kern.fit(X_train, y_train)
    # make predictions
    predictions_kern = model_svm_kern.predict(X_test)
    # compute accuracy 
    accuracy_kern = metrics.accuracy_score(y_test, predictions_kern)
    
    # printing out accuracy score for each kernel:
    print("\nKernel {}".format(kernel))
    print("Accuracy = {}\n".format(accuracy_kern))


Kernel linear
Accuracy = 0.7581538461538462


Kernel poly
Accuracy = 0.9433846153846154


Kernel rbf
Accuracy = 0.9643076923076923


Kernel sigmoid
Accuracy = 0.38276923076923075



# As we see, the default 'rbf' kernel performs better on this dataset.