# Table of Contents
 <p><div class="lev2"><a href="#Basic-CrossValidation-scoring"><span class="toc-item-num">0.1&nbsp;&nbsp;</span>Basic CrossValidation scoring</a></div><div class="lev2"><a href="#Grid-testing"><span class="toc-item-num">0.2&nbsp;&nbsp;</span>Grid testing</a></div><div class="lev3"><a href="#Iris"><span class="toc-item-num">0.2.1&nbsp;&nbsp;</span>Iris</a></div><div class="lev4"><a href="#Bit-prettier-display"><span class="toc-item-num">0.2.1.1&nbsp;&nbsp;</span>Bit prettier display</a></div><div class="lev3"><a href="#Digits"><span class="toc-item-num">0.2.2&nbsp;&nbsp;</span>Digits</a></div><div class="lev2"><a href="#Multiple-classifiers-testing"><span class="toc-item-num">0.3&nbsp;&nbsp;</span>Multiple classifiers testing</a></div><div class="lev3"><a href="#Iris"><span class="toc-item-num">0.3.1&nbsp;&nbsp;</span>Iris</a></div><div class="lev4"><a href="#Parameters-of-the-best-classifier"><span class="toc-item-num">0.3.1.1&nbsp;&nbsp;</span>Parameters of the best classifier</a></div><div class="lev3"><a href="#Digits"><span class="toc-item-num">0.3.2&nbsp;&nbsp;</span>Digits</a></div><div class="lev4"><a href="#Parameters-of-the-best-classifier"><span class="toc-item-num">0.3.2.1&nbsp;&nbsp;</span>Parameters of the best classifier</a></div><div class="lev3"><a href="#Results-comparison-for-both-datasets"><span class="toc-item-num">0.3.3&nbsp;&nbsp;</span>Results comparison for both datasets</a></div><div class="lev3"><a href="#Classification-report"><span class="toc-item-num">0.3.4&nbsp;&nbsp;</span>Classification report</a></div>

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import randint
from operator import itemgetter
import sklearn.datasets as datasets

In [2]:
# Bundled toy datasets used throughout the notebook.
iris_dataset = datasets.load_iris()
# Restrict digits to 4 classes. n_class is passed by keyword because
# newer scikit-learn releases only accept it as a keyword argument.
digits_dataset = datasets.load_digits(n_class=4)

In [3]:
# Print the description text bundled with the iris dataset.
print(iris_dataset['DESCR'])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [4]:
from sklearn import svm, grid_search, datasets, cross_validation, metrics

from sklearn.ensemble import ExtraTreesClassifier

## Basic CrossValidation scoring

In [5]:
# Two untuned baseline classifiers, both built with constructor defaults.
svc = svm.SVC()
linear_svc = svm.LinearSVC()

In [6]:
# Average of the cross-validation scores for the default SVC on the iris data
# (default CV and scoring settings are used).
cross_validation.cross_val_score(svc, iris_dataset['data'], iris_dataset['target']).mean()

0.97344771241830064

In [7]:
# Same measurement for the default LinearSVC, for comparison with the SVC score above.
cross_validation.cross_val_score(linear_svc, iris_dataset['data'], iris_dataset['target']).mean()

0.96691176470588225

## Grid testing

In [8]:
# Fresh default SVC used as the base estimator for the grid searches below.
svc = svm.SVC()

In [9]:
# Regularisation strengths tried for every kernel.
# NOTE(review): 1e-1 is absent from this grid -- confirm the gap is intentional.
c_grid = [1e-4, 1e-3, 1e-2, 1e0, 1e1, 1e2, 1e3]

# One sub-grid per kernel; only the polynomial kernel also varies `degree`.
# Each sub-grid gets its own copy of the C list so the dicts share no state.
parameters = [
    dict(kernel=['rbf'], C=list(c_grid)),
    dict(kernel=['poly'], C=list(c_grid), degree=[2, 3]),
    dict(kernel=['linear'], C=list(c_grid)),
]

### Iris

In [10]:
# Grid search over `parameters` with SVC as the base estimator, using 4 parallel jobs.
gs = grid_search.GridSearchCV(svc, parameters, n_jobs=4)
# Run the search on the iris data; the value returned by fit() is the cell output.
gs.fit(iris_dataset['data'], iris_dataset['target'])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'kernel': ['rbf']}, {'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'degree': [2, 3], 'kernel': ['poly']}, {'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [11]:
# Parameter combination selected by the iris grid search.
gs.best_params_

{'C': 10.0, 'kernel': 'rbf'}

In [12]:
# Raw score entries (mean, std, params) for every parameter combination tried.
gs.grid_scores_

[mean: 0.93333, std: 0.03277, params: {'C': 0.0001, 'kernel': 'rbf'},
 mean: 0.93333, std: 0.03277, params: {'C': 0.001, 'kernel': 'rbf'},
 mean: 0.93333, std: 0.03277, params: {'C': 0.01, 'kernel': 'rbf'},
 mean: 0.97333, std: 0.00897, params: {'C': 1.0, 'kernel': 'rbf'},
 mean: 0.98000, std: 0.01601, params: {'C': 10.0, 'kernel': 'rbf'},
 mean: 0.96000, std: 0.04217, params: {'C': 100.0, 'kernel': 'rbf'},
 mean: 0.95333, std: 0.03669, params: {'C': 1000.0, 'kernel': 'rbf'},
 mean: 0.86000, std: 0.00404, params: {'C': 0.0001, 'degree': 2, 'kernel': 'poly'},
 mean: 0.87333, std: 0.00809, params: {'C': 0.0001, 'degree': 3, 'kernel': 'poly'},
 mean: 0.88667, std: 0.02253, params: {'C': 0.001, 'degree': 2, 'kernel': 'poly'},
 mean: 0.96667, std: 0.01820, params: {'C': 0.001, 'degree': 3, 'kernel': 'poly'},
 mean: 0.95333, std: 0.02402, params: {'C': 0.01, 'degree': 2, 'kernel': 'poly'},
 mean: 0.97333, std: 0.02446, params: {'C': 0.01, 'degree': 3, 'kernel': 'poly'},
 mean: 0.96667, std: 

#### Bit prettier display

In [13]:
# Utility function to report best scores
def report(grid_scores, n_top=3):
    """Print a ranked summary of the best-scoring grid-search entries.

    Parameters
    ----------
    grid_scores : iterable
        Score entries. Each entry is ranked by its element at index 1 and must
        expose ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters`` attributes (the notebook passes ``gs.grid_scores_``).
    n_top : int, optional
        Number of entries to print, best first. Defaults to 3.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Highest score first; sorted() is stable, so ties keep their input order.
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        print("Model with rank: {0}".format(rank))
        # The std is recomputed from the stored per-fold scores.
        fold_std = np.std(entry.cv_validation_scores)
        print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
            entry.mean_validation_score, fold_std))
        print("Parameters: {0}".format(entry.parameters))
        print("")

In [14]:
# Show the five best-scoring parameter combinations from the iris grid search.
report(gs.grid_scores_, 5)

Model with rank: 1
Mean validation score: 0.98000 (std: 0.01601)
Parameters: {'C': 10.0, 'kernel': 'rbf'}

Model with rank: 2
Mean validation score: 0.98000 (std: 0.01602)
Parameters: {'C': 1.0, 'kernel': 'linear'}

Model with rank: 3
Mean validation score: 0.97333 (std: 0.00897)
Parameters: {'C': 1.0, 'kernel': 'rbf'}

Model with rank: 4
Mean validation score: 0.97333 (std: 0.02446)
Parameters: {'C': 0.01, 'degree': 3, 'kernel': 'poly'}

Model with rank: 5
Mean validation score: 0.97333 (std: 0.03697)
Parameters: {'C': 10.0, 'kernel': 'linear'}



### Digits

In [15]:
# Same SVC grid search as for iris, this time fitted on the digits data.
# NOTE: this rebinds `gs`, so the iris search object is no longer reachable by that name.
gs = grid_search.GridSearchCV(svc, parameters, n_jobs=4)
gs.fit(digits_dataset['data'], digits_dataset['target'])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'kernel': ['rbf']}, {'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'degree': [2, 3], 'kernel': ['poly']}, {'C': [0.0001, 0.001, 0.01, 1.0, 10.0, 100.0, 1000.0], 'kernel': ['linear']}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [16]:
# Show the five best-scoring parameter combinations from the digits grid search.
report(gs.grid_scores_, 5)

Model with rank: 1
Mean validation score: 0.98611 (std: 0.00982)
Parameters: {'C': 0.0001, 'degree': 3, 'kernel': 'poly'}

Model with rank: 2
Mean validation score: 0.98611 (std: 0.00982)
Parameters: {'C': 0.001, 'degree': 2, 'kernel': 'poly'}

Model with rank: 3
Mean validation score: 0.98611 (std: 0.00982)
Parameters: {'C': 0.001, 'degree': 3, 'kernel': 'poly'}

Model with rank: 4
Mean validation score: 0.98611 (std: 0.00982)
Parameters: {'C': 0.01, 'degree': 3, 'kernel': 'poly'}

Model with rank: 5
Mean validation score: 0.98611 (std: 0.00982)
Parameters: {'C': 1.0, 'degree': 3, 'kernel': 'poly'}



## Multiple classifiers testing

In [17]:
from sklearn import ensemble, neighbors, linear_model
from datetime import datetime

In [18]:
# Candidate models for the comparison below. Each entry holds:
#   'name'       - label used in the printed results and summary tables
#   'classifier' - unfitted estimator handed to GridSearchCV
#   'parameters' - parameter grid(s) searched for that estimator
# NOTE(review): the C grids skip 1e-1 -- confirm the gap is intentional.
classifiers_to_test = [
    {
        'name': 'SVC',
        'classifier': svm.SVC(),
        'parameters': [{'kernel':['rbf'], 'C':[1e-4, 1e-3, 1e-2, 1e0, 1e1, 1e2, 1e3]},
                       {'kernel':['linear'], 'C':[1e-4, 1e-3, 1e-2, 1e0, 1e1, 1e2, 1e3]}]
    },
    {
        'name': 'LinearSVC',
        'classifier': svm.LinearSVC(),
        # max_iter is an iteration count: use ints, not float literals such as 1e3.
        'parameters': [{'max_iter':[1000, 10000, 100000], 'C':[1e-4, 1e-3, 1e-2, 1e0, 1e1, 1e2, 1e3]}]
    },
    {
        'name': 'ExtraTreesClassifier',
        # Fixed seed so the randomised ensemble gives the same scores on every run.
        'classifier': ensemble.ExtraTreesClassifier(random_state=0),
        'parameters': [{'n_estimators':[5,10,20,50], 'min_samples_leaf':[1,2,4,8]}]
    },
    {
        'name': 'RandomForestClassifier',
        # Fixed seed, same reason as for ExtraTreesClassifier.
        'classifier': ensemble.RandomForestClassifier(random_state=0),
        'parameters': [{'n_estimators':[5,10,20,50], 'min_samples_leaf':[1,2,4,8]}]
    },
    {
        'name': 'KNeighborsClassifier',
        'classifier': neighbors.KNeighborsClassifier(),
        'parameters': [{'n_neighbors':[5,10,20], 'p':[1,2]}]
    },
    {
        'name': 'LogisticRegression',
        'classifier': linear_model.LogisticRegression(),
        # Integer iteration counts here too.
        'parameters': [{'max_iter':[1000, 10000, 100000], 'C':[1e-4, 1e-3, 1e-2, 1e0, 1e1, 1e2, 1e3]}]
    }
]

### Iris

In [19]:
# Grid-search every candidate classifier on the iris data, recording the best
# score and the fitted search object, and printing a timed summary per model.
X, y = iris_dataset['data'], iris_dataset['target']
classifiers_results_iris = []
summary_template = "{0}\nscore: {1}\ntime: {2}\n*=======================*"

for candidate in classifiers_to_test:
    started_at = datetime.now()

    gs = grid_search.GridSearchCV(candidate['classifier'], candidate['parameters'], n_jobs=4)
    gs.fit(X, y)

    # Keep the whole search object so later cells can inspect params and scores.
    result = {'name': candidate['name'], 'best_score': gs.best_score_, 'grid_search': gs}
    classifiers_results_iris.append(result)

    elapsed = datetime.now() - started_at
    print(summary_template.format(candidate['name'], gs.best_score_, elapsed))

SVC
score: 0.98
time: 0:00:00.157256
LinearSVC
score: 0.9733333333333334
time: 0:00:03.750305
ExtraTreesClassifier
score: 0.9666666666666667
time: 0:00:00.901170
RandomForestClassifier
score: 0.9666666666666667
time: 0:00:00.891911
KNeighborsClassifier
score: 0.9866666666666667
time: 0:00:00.167355
LogisticRegression
score: 0.9733333333333334
time: 0:00:00.173360


#### Parameters of the best classifier

In [20]:
# Rank the iris results in place, highest best_score first; later cells read
# index 0 as the winning classifier.
classifiers_results_iris.sort(key=lambda result: result['best_score'], reverse=True)
best_iris_result = classifiers_results_iris[0]
print(best_iris_result['name'])
print(best_iris_result['grid_search'].best_params_)

KNeighborsClassifier
{'p': 2, 'n_neighbors': 5}


In [21]:
# Top configurations (report's default n_top of 3) from the best iris classifier's grid search.
report(classifiers_results_iris[0]['grid_search'].grid_scores_)

Model with rank: 1
Mean validation score: 0.98667 (std: 0.00924)
Parameters: {'p': 2, 'n_neighbors': 5}

Model with rank: 2
Mean validation score: 0.97333 (std: 0.00897)
Parameters: {'p': 1, 'n_neighbors': 5}

Model with rank: 3
Mean validation score: 0.97333 (std: 0.00897)
Parameters: {'p': 1, 'n_neighbors': 10}



### Digits

In [22]:
# Grid-search every candidate classifier on the digits data, recording the best
# score and the fitted search object, and printing a timed summary per model.
X, y = digits_dataset['data'], digits_dataset['target']
classifiers_results_digits = []
summary_template = "{0}\nscore: {1}\ntime: {2}\n*=======================*"

for candidate in classifiers_to_test:
    started_at = datetime.now()

    gs = grid_search.GridSearchCV(candidate['classifier'], candidate['parameters'], n_jobs=4)
    gs.fit(X, y)

    # Keep the whole search object so later cells can inspect params and scores.
    result = {'name': candidate['name'], 'best_score': gs.best_score_, 'grid_search': gs}
    classifiers_results_digits.append(result)

    elapsed = datetime.now() - started_at
    print(summary_template.format(candidate['name'], gs.best_score_, elapsed))

SVC
score: 0.9805555555555555
time: 0:00:00.666699
LinearSVC
score: 0.9736111111111111
time: 0:00:00.774582
ExtraTreesClassifier
score: 0.9791666666666666
time: 0:00:01.235780
RandomForestClassifier
score: 0.9722222222222222
time: 0:00:01.178545
KNeighborsClassifier
score: 0.9888888888888889
time: 0:00:00.366008
LogisticRegression
score: 0.9763888888888889
time: 0:00:00.595879


#### Parameters of the best classifier

In [23]:
# Rank the digits results in place, highest best_score first, then show the
# winning classifier and the parameters its grid search selected.
classifiers_results_digits.sort(key=lambda result: result['best_score'], reverse=True)
best_digits_result = classifiers_results_digits[0]
print(best_digits_result['name'])
print(best_digits_result['grid_search'].best_params_)

KNeighborsClassifier
{'p': 2, 'n_neighbors': 5}


### Results comparison for both datasets

In [24]:
def _best_score_table(results, dataset_label):
    """Build a two-column frame: classifier names under `dataset_label`, plus 'best_score'."""
    rows = [[entry['name'], entry['best_score']] for entry in results]
    return pd.DataFrame(rows, columns=[dataset_label, 'best_score'])


iris_results_df = _best_score_table(classifiers_results_iris, 'Iris')
digits_results_df = _best_score_table(classifiers_results_digits, 'Digits')

# Both result lists were sorted by best_score in earlier cells, so the rows of the
# combined table line up by rank, not by classifier.
pd.concat([iris_results_df, digits_results_df], axis=1)

Unnamed: 0,Iris,best_score,Digits,best_score.1
0,KNeighborsClassifier,0.986667,KNeighborsClassifier,0.988889
1,SVC,0.98,SVC,0.980556
2,LinearSVC,0.973333,ExtraTreesClassifier,0.979167
3,LogisticRegression,0.973333,LogisticRegression,0.976389
4,ExtraTreesClassifier,0.966667,LinearSVC,0.973611
5,RandomForestClassifier,0.966667,RandomForestClassifier,0.972222


### Classification report

In [25]:
# Switch X, y back to the iris data (they were rebound to digits above).
X, y = iris_dataset['data'], iris_dataset['target']
# One stratified train/test split. The seed is fixed so the same samples are
# held out on every run and the classification report is reproducible.
sss = cross_validation.StratifiedShuffleSplit(y, n_iter=1, random_state=0)

In [26]:
# The splitter yields (train indices, test indices) pairs; take the first pair
# and slice features and labels with it.
train_idx, test_idx = next(iter(sss))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

In [27]:
# The stored grid search was fitted on the full iris data, which includes X_test,
# so predicting with it directly would score the model on samples it was trained on.
# Instead, fit an unfitted copy of the winning configuration on the training split only.
# NOTE: the hyper-parameters themselves were still chosen using the full dataset.
from sklearn.base import clone

best_iris_model = clone(classifiers_results_iris[0]['grid_search'].best_estimator_)
best_iris_model.fit(X_train, y_train)
y_pred = best_iris_model.predict(X_test)

In [28]:
# Per-class precision / recall / F1 of the predictions against the test-split labels.
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         5
          1       1.00      1.00      1.00         5
          2       1.00      1.00      1.00         5

avg / total       1.00      1.00      1.00        15

