# Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow - Ch3 (Part 3)
- categories: [MachineLearning, Notes, Classification]

### 1. Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set. Hint: the KNeighborsClassifier works quite well for this task; you just need to find good hyperparameter values (try a grid search on the weights and n_neighbors hyperparameters).

In [2]:
from sklearn.datasets import fetch_openml
import numpy as np


# Download MNIST from OpenML (dataset 'mnist_784', version 1).
# as_frame=False pins the return type to NumPy arrays: scikit-learn changed the
# default from False to 'auto' (pandas containers) in 0.24, so without it the type
# of `data` / `target` would depend on the installed version.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
data, target = mnist.data, mnist.target

# The first `n_train` samples are used for training, the remainder for testing.
n_train = 60000
x_train, y_train = data[:n_train], target[:n_train]
x_test, y_test = data[n_train:], target[n_train:]

In [13]:
# following the hint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: both neighbour-weighting schemes, with k = 2..9.
param_grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': list(range(2, 10)),
}

# 5-fold cross-validated search over every combination in the grid above.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,    # run the fits on all processors
    verbose=3,    # progress messages (with multi-processing they showed up in the terminal, not the notebook)
)

knn.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9],
                         'weights': ['uniform', 'distance']},
             scoring='balanced_accuracy', verbose=3)

In [15]:
# NOTE: best_score_ is the mean cross-validated score of the winning candidate
# under the search's `scoring` (balanced accuracy, 5 folds) -- it is not an
# accuracy measured on the training set, despite the label printed below.
best_knn_params = knn.best_params_
best_knn_score = knn.best_score_
print('Best parameters are:', best_knn_params)
print('Best training accuracy:', best_knn_score)

Best parameters are: {'n_neighbors': 4, 'weights': 'distance'}
Best training accuracy: 0.9711847422265611


In [16]:
from sklearn.metrics import accuracy_score

# Predict the held-out test split with the tuned KNN search object and report
# plain accuracy against the true labels.
y_pred = knn.predict(x_test)
knn_test_accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', knn_test_accuracy)

Test accuracy: 0.9714


In [30]:
# let's also try logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# Standardize the features, then fit a logistic regression. make_pipeline names
# each step after its lowercased class name, hence the 'logisticregression__'
# prefix used in the grid below.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=2, max_iter=1000),
)

# Values of C to try: 10^-3, 10^-2, ..., 10^2.
lr_param = {
    'logisticregression__C': [10**exponent for exponent in range(-3, 3)],
}

# 5-fold cross-validated search over C, scored by balanced accuracy.
lr = GridSearchCV(
    estimator=model,
    param_grid=lr_param,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,    # run the fits on all processors
    verbose=3,    # progress messages (may not appear in the notebook when n_jobs is set)
)

lr.fit(x_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=1000,
                                                           random_state=2))]),
             n_jobs=-1,
             param_grid={'logisticregression__C': [0.001, 0.01, 0.1, 1, 10,
                                                   100]},
             scoring='balanced_accuracy', verbose=3)

In [60]:
# although the training time is much faster (~16 mins vs ~2 hr), the accuracy is not as good as KNN
# Read the winning C directly from best_params_, the same attribute the KNN and
# random-forest cells report, instead of digging it out of the refitted pipeline.
print('Best parameter(C):', lr.best_params_['logisticregression__C'])
# best_score_ is the mean cross-validated score (balanced accuracy) of that candidate.
print('Best score:', lr.best_score_)

# Evaluate on the held-out test split with plain accuracy.
y_pred = lr.predict(x_test)
print('Test accuracy:', accuracy_score(y_test, y_pred))

Best parameter(C): 0.01
Best score: 0.9213184132852655
Test accuracy: 0.9263


In [58]:
# let's also try Random Forest
from sklearn.ensemble import RandomForestClassifier


# Grid that matches the recorded run of this cell (2 candidates x 5 folds = 10 fits).
# The broader exploratory grid previously written here -- n_estimators in
# [100, 200, 500, 1000] crossed with max_depth in range(5, 50) -- is 180 candidates
# (900 fits) and does not correspond to the output shown for this cell.
param_grid = {
    'n_estimators': [1000],
    'max_depth': [29, 33],
}

# 5-fold cross-validated search, scored by balanced accuracy.
forest = GridSearchCV(
    RandomForestClassifier(random_state=2),    # fixed seed so the forests are reproducible
    param_grid=param_grid,
    n_jobs=-1,    # using all processors
    cv=5,
    scoring='balanced_accuracy',
    verbose=3    # for display grid search msg (for multi-processing, I found the printout in the terminal...)
)

forest.fit(x_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2), n_jobs=-1,
             param_grid={'max_depth': [29, 33], 'n_estimators': [1000]},
             scoring='balanced_accuracy', verbose=3)

In [62]:
# Report the winning random-forest candidate: its mean cross-validated score
# (balanced accuracy) and the hyperparameters that produced it.
best_forest_score = forest.best_score_
best_forest_params = forest.best_params_
print('The best score:', best_forest_score)
print('with parameter:', best_forest_params)

# Evaluate on the held-out test split with plain accuracy.
y_pred = forest.predict(x_test)
forest_test_accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', forest_test_accuracy)

The best score: 0.9681889511727405
with parameter: {'max_depth': 33, 'n_estimators': 1000}
Test accuracy: 0.9711
