In [1]:
# This is not related to the tutorial

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Scikit-Learn Tutorial
--

Adapted from https://www.dataschool.io

**Introducing the iris dataset**

Introducing the iris dataset
--

![Iris](images/03_iris.png)

- 50 samples of 3 different species of iris (150 samples total)
- Measurements: sepal length, sepal width, petal length, petal width

In [2]:
import pandas as pd
import sklearn as sk
import numpy as np

In [3]:
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 
                   header=None, 
                   names=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'class'])
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Requirements for working with data in scikit-learn

1. Features and target are **separate objects**
2. Features and target should be **numeric**
3. Features and target should be **NumPy arrays**
4. Features and target should have **specific shapes**

Extracting numpy arrays from the online dataset
--
(http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data)

In [4]:
# extract underlying numpy arrays
X = data.iloc[:,0:-1].values   
class_vals = data.iloc[:,-1:].values

X.shape
class_vals.shape

# flatten class_vals to a one dimensional (150,) array
class_vals = np.ravel(class_vals)
class_vals.shape

(150, 4)

(150, 1)

(150,)

In [5]:
from sklearn import preprocessing

# transform class values to numeric values
le = preprocessing.LabelEncoder()
le.fit(class_vals)

list(le.classes_)

y = le.transform(class_vals) 
y

LabelEncoder()

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

## K-nearest neighbors (KNN) classification

1. Pick a value for K.
2. Search for the K observations in the training data that are "nearest" to the measurements of the unknown iris.
3. Use the most popular response value from the K nearest neighbors as the predicted response value for the unknown iris.

### Example training data (in 2D not related to the Iris dataset; just for illustration)

![Training data](images/04_knn_dataset.png)

### KNN classification map (K=1)

![1NN classification map](images/04_1nn_map.png)

### KNN classification map (K=5)

![5NN classification map](images/04_5nn_map.png)

*Image Credits: [Data3classes](http://commons.wikimedia.org/wiki/File:Data3classes.png#/media/File:Data3classes.png), [Map1NN](http://commons.wikimedia.org/wiki/File:Map1NN.png#/media/File:Map1NN.png), [Map5NN](http://commons.wikimedia.org/wiki/File:Map5NN.png#/media/File:Map5NN.png) by Agor153. Licensed under CC BY-SA 3.0*

## scikit-learn 4-step modeling pattern

**Step 1:** Import the class you plan to use

In [6]:
from sklearn.neighbors import KNeighborsClassifier

**Step 2:** "Instantiate" the "estimator"

- "Estimator" is scikit-learn's term for model
- "Instantiate" means "make an instance of"

In [7]:
knn = KNeighborsClassifier(n_neighbors=1)

- Name of the object does not matter
- Can specify tuning parameters (aka "hyperparameters") during this step
- All parameters not specified are set to their defaults

In [8]:
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


**Step 3:** Fit the model with data (aka "model training")

- Model is learning the relationship between X and y
- Occurs in-place

In [9]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

**Step 4:** Predict the response for a new observation

In [10]:
knn.predict([[3, 5, 4, 2]])

array([2], dtype=int64)

- Returns a NumPy array
- Can predict for multiple observations at once

In [11]:
X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
knn.predict(X_new)

array([2, 1], dtype=int64)

## Using a different value for K

In [12]:
# instantiate the model (using the value K=5)
knn = KNeighborsClassifier(n_neighbors=5)

# fit the model with data
knn.fit(X, y)

# predict the response for new observations
knn.predict(X_new)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

array([1, 1], dtype=int64)

## Using a different classification model - Logistic Regression

In [13]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X, y)

# predict the response for new observations
logreg.predict(X_new)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

array([2, 0], dtype=int64)

## Perceptron

In [14]:
# import the class
from sklearn.linear_model import Perceptron

# instantiate the model (using the default parameters)
percep = Perceptron()

# fit the model with data
percep.fit(X, y)

# predict the response for new observations
percep.predict(X_new)



Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=5, n_iter=None, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)

array([2, 0], dtype=int64)

## Naive Bayes

In [15]:
# import the class
from sklearn.naive_bayes import GaussianNB

# instantiate the model (using the default parameters)
nb = GaussianNB()

# fit the model with data
nb.fit(X, y)

# predict the response for new observations
nb.predict(X_new)

GaussianNB(priors=None)

array([2, 1], dtype=int64)

## Decision Tree

In [16]:
# import the class
from sklearn.tree import DecisionTreeClassifier

# instantiate the model (using the default parameters)
tree = DecisionTreeClassifier()

# fit the model with data
tree.fit(X, y)

# predict the response for new observations
tree.predict(X_new)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

array([1, 1], dtype=int64)

Accuracy
--

Here we will evaluate the classifiers by computing their accuracy on the train dataset. This is, however, too optimistic. The real test is to use a test dataset not used for training. We will see that later.

In [17]:
# compute classification accuracy for the logistic regression model
from sklearn import metrics

# predict the response on the train dataset
y_pred_knn    = knn.predict(X)
y_pred_logreg = logreg.predict(X)
y_pred_percep = percep.predict(X)
y_pred_nb     = nb.predict(X)
y_pred_tree   = tree.predict(X)

print("KNN", metrics.accuracy_score(y, y_pred_knn))
print("Logistic Regression", metrics.accuracy_score(y, y_pred_logreg))
print("Perceptron", metrics.accuracy_score(y, y_pred_percep))
print("Naive Bayes", metrics.accuracy_score(y, y_pred_nb))
print("Decision Tree", metrics.accuracy_score(y, y_pred_tree))

KNN 0.966666666667
Logistic Regression 0.96
Perceptron 0.666666666667
Naive Bayes 0.96
Decision Tree 1.0
