In [1]:
# Using the toolkit of the datasets module, import the built-in iris dataset
import time

import numpy as np
from sklearn.datasets import load_iris

In [2]:
# display the names of the keys of the set.
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [3]:
from pprint import pprint

# Display the value of the DESCR key.
pprint(iris.DESCR)

('.. _iris_dataset:\n'
 '\n'
 'Iris plants dataset\n'
 '--------------------\n'
 '\n'
 '**Data Set Characteristics:**\n'
 '\n'
 '    :Number of Instances: 150 (50 in each of three classes)\n'
 '    :Number of Attributes: 4 numeric, predictive attributes and the class\n'
 '    :Attribute Information:\n'
 '        - sepal length in cm\n'
 '        - sepal width in cm\n'
 '        - petal length in cm\n'
 '        - petal width in cm\n'
 '        - class:\n'
 '                - Iris-Setosa\n'
 '                - Iris-Versicolour\n'
 '                - Iris-Virginica\n'
 '                \n'
 '    :Summary Statistics:\n'
 '\n'
 '                    Min  Max   Mean    SD   Class Correlation\n'
 '    sepal length:   4.3  7.9   5.84   0.83    0.7826\n'
 '    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n'
 '    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n'
 '    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n'
 '\n'
 '    :Missing Attribute Values: None\n'
 '   

In [4]:
# Examine the values of the remaining keys and determine the keys that store the values of features and class labels.
X, y = iris.data, iris.target

print("Dataset size: %d  number of features: %d  number of classes: %d"
      % (X.shape[0], X.shape[1], len(np.unique(y))))

Dataset size: 150  number of features: 4  number of classes: 3


In [5]:
# Split the data set into training and test samples in the ratio of 70/30.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [6]:
# Create a linear classification model SGDClassifier and list the available model parameters.
from sklearn.linear_model import SGDClassifier


def list_sgd_parameters():
    model = SGDClassifier()
    print(model.get_params())


list_sgd_parameters()

{'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'hinge', 'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [7]:
# Create a parameter grid that includes 4 types of loss function, two types of regularizers, 5 values of the regularization coefficient from 0.0001 to 0.001, and the number of iterations from 5 to 10 with a step of 1.
from sklearn.model_selection import GridSearchCV


def grid_search_sgd():
    model = SGDClassifier()
    param_grid = {'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                  'penalty': ['l1', 'l2', 'elasticnet'],
                  'alpha': np.linspace(0.0001, 0.001, 5),
                  'n_iter_no_change': np.arange(5, 10, 1)}
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)


grid_search_sgd()

{'alpha': 0.000325, 'loss': 'squared_hinge', 'n_iter_no_change': 7, 'penalty': 'l1'}


In [8]:
# Create a GridSearchCV object, pass it the previously created classifier and parameter grid, and train it. Use accuracy as a metric. Provide for the output of the time spent on iterating over the grid.

def grid_search_sgd_time():
    model = SGDClassifier()
    param_grid = {'loss': ['hinge', 'perceptron'],
                  'penalty': ['l1', 'l2', 'elasticnet'],
                  'alpha': np.linspace(0.0001, 0.001, 5),
                  'n_iter_no_change': np.arange(5, 10, 1)}
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    start = time.time()
    grid_search.fit(X_train, y_train)
    end = time.time()
    print("Time spent: %.2f seconds" % (end - start))
    pprint(grid_search.best_estimator_)
    pprint(grid_search.best_params_)
    pprint(grid_search.best_score_)


grid_search_sgd_time()

Time spent: 2.16 seconds
SGDClassifier(alpha=0.0007750000000000001, loss='perceptron',
              n_iter_no_change=9, penalty='l1')
{'alpha': 0.0007750000000000001,
 'loss': 'perceptron',
 'n_iter_no_change': 9,
 'penalty': 'l1'}
0.9714285714285715


In [9]:
# Organize a random grid search. Print the time spent on random enumeration.
from sklearn.model_selection import RandomizedSearchCV


def random_search_sgd_time():
    model = SGDClassifier()
    param_grid = {'loss': ['hinge', 'perceptron'],
                  'penalty': ['l1', 'l2', 'elasticnet'],
                  'alpha': np.linspace(0.0001, 0.001, 5),
                  'n_iter_no_change': np.arange(5, 10, 1)}
    random_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    start = time.time()
    random_search.fit(X_train, y_train)
    end = time.time()
    print("Time spent: %.2f seconds" % (end - start))
    pprint(random_search.best_estimator_)
    pprint(random_search.best_params_)
    pprint(random_search.best_score_)


random_search_sgd_time()

Time spent: 0.19 seconds
SGDClassifier(alpha=0.001, loss='perceptron', n_iter_no_change=9, penalty='l1')
{'alpha': 0.001, 'loss': 'perceptron', 'n_iter_no_change': 9, 'penalty': 'l1'}
0.9619047619047618
