In [1]:

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.datasets import load_digits

# Fetch the handwritten-digits dataset bundled with scikit-learn.
digits = load_digits()

# Features (image pixels) and labels (digits 0 to 9).
X, y = digits.data, digits.target

# Report the dimensions of both arrays.
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

# A classifier expects a 2-D (samples, features) matrix, so each image is
# flattened into a single row of pixel values.
n_samples = digits.images.shape[0]
X = digits.images.reshape(n_samples, -1)
y = digits.target

print(X.shape)
print(y.shape)

# Hold out half of the samples for testing; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Candidate hyper-parameter grids explored by cross-validation:
# an RBF kernel (varying gamma and C) and a linear kernel (varying C only).
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]

# Metrics to optimise; each one is used in its macro-averaged form below.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    # One full grid search per metric, fitted on the training half only.
    clf = GridSearchCV(SVC(), tuned_parameters, scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on train set:", end="\n\n")
    print(clf.best_params_, end="\n\n")

    # Per-candidate cross-validation scores; the printed spread is twice the
    # standard deviation across folds.
    print("Grid scores on train set:", end="\n\n")
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for mean, std, params in zip(means, stds, cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    # Evaluate on the held-out half. predict() goes through the best
    # candidate, which GridSearchCV refits on the whole training split by
    # default (refit=True).
    print("Detailed classification report on test-set:", end="\n\n")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred), end="\n\n")


Feature matrix shape: (1797, 64)
Target vector shape: (1797,)
(1797, 64)
(1797,)
# Tuning hyper-parameters for precision

Best parameters set found on train set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on train set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.028) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.974 (+/-0.012) for {'C': 1, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 10, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 100, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report on test-

In [4]:

import numpy as np
from time import time
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
#from sklearn.utils.fixes import loguniform
from scipy.stats import loguniform

# Load the digits data directly as a (features, labels) pair
X, y = load_digits(return_X_y=True)

# Build classifier: hinge loss with an elastic-net penalty, trained by SGD.
# SGD shuffles the training data on every epoch, so random_state is pinned;
# without it the cross-validation scores of the searches run on this
# estimator change from one execution to the next.
clf = SGDClassifier(loss='hinge', penalty='elasticnet', fit_intercept=True,
                    random_state=42)

# Utility function to report top models
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys ``'rank_test_score'``, ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'``, each indexable by candidate
        position (``rank_test_score`` must support element-wise ``==``,
        e.g. a numpy array).
    n_top : int, default 3
        Ranks ``1 .. n_top`` are printed. Candidates tied on a rank are all
        shown, so more than ``n_top`` entries may appear.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate sharing this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            score_std = results['std_test_score'][idx]
            settings = results['params'][idx]

            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, score_std))
            print("Parameters: {0}".format(settings))
            print("")

# Distributions the randomized search draws candidates from:
# a uniform l1_ratio on [0, 1] and a log-uniform alpha between 1e-4 and 1.
param_dist = dict(
    average=[True, False],
    l1_ratio=stats.uniform(0, 1),
    alpha=loguniform(1e-4, 1e0),
)

# Sample a fixed budget of candidates, scored with 5-fold cross-validation
# on all available cores; the seed fixes which candidates are drawn.
n_iter_search = 20
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Time the fit and summarise the top-ranked candidates.
start = time()
random_search.fit(X, y)
elapsed = time() - start
print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % (elapsed, n_iter_search))
report(random_search.cv_results_)

# Exhaustive grid over the same three hyper-parameters:
# 2 averaging modes x 10 l1_ratio values x 5 alpha values (1e-4 ... 1).
param_grid = dict(
    average=[True, False],
    l1_ratio=np.linspace(0, 1, 10),
    alpha=10.0 ** np.arange(-4, 1),
)

# Evaluate every combination with 5-fold cross-validation on all cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# Time the fit and summarise the top-ranked candidates.
start = time()
grid_search.fit(X, y)
elapsed = time() - start
n_candidates = len(grid_search.cv_results_['params'])
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (elapsed, n_candidates))
report(grid_search.cv_results_)


RandomizedSearchCV took 6.80 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.929 (std: 0.030)
Parameters: {'alpha': 0.00011299516083106625, 'average': True, 'l1_ratio': 0.9656320330745594}

Model with rank: 2
Mean validation score: 0.924 (std: 0.030)
Parameters: {'alpha': 0.00010672476836323731, 'average': True, 'l1_ratio': 0.2912291401980419}

Model with rank: 3
Mean validation score: 0.924 (std: 0.033)
Parameters: {'alpha': 0.00030771802712506853, 'average': True, 'l1_ratio': 0.8331949117361643}

GridSearchCV took 34.16 seconds for 100 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.928 (std: 0.025)
Parameters: {'alpha': 0.001, 'average': True, 'l1_ratio': 0.2222222222222222}

Model with rank: 2
Mean validation score: 0.928 (std: 0.032)
Parameters: {'alpha': 1.0, 'average': False, 'l1_ratio': 0.0}

Model with rank: 3
Mean validation score: 0.927 (std: 0.031)
Parameters: {'alpha': 0.0001, 'average': True, 'l1_ratio': 