In [1]:
import os
import gzip
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import time


In [2]:
def load_mnist(path, kind='train'):
    """Load one gzip-compressed, IDX-formatted MNIST-style split from `path`.

    Parameters
    ----------
    path : str
        Directory containing ``<kind>-labels-idx1-ubyte.gz`` and
        ``<kind>-images-idx3-ubyte.gz``.
    kind : str, default 'train'
        File-name prefix that selects the split (e.g. 'train' or 't10k').

    Returns
    -------
    images : numpy.ndarray of uint8, shape (n_samples, rows * cols)
        One flattened image per row. The image size is taken from the
        IDX3 header, so 28x28 MNIST files yield 784 columns as before.
    labels : numpy.ndarray of uint8, shape (n_samples,)

    Raises
    ------
    ValueError
        If a file is shorter than its header, or the image payload does not
        hold exactly ``len(labels) * rows * cols`` bytes.

    Notes
    -----
    Both arrays are views over immutable bytes objects and are therefore
    read-only.
    """
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte.gz' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte.gz' % kind)

    with gzip.open(labels_path, 'rb') as lbpath:
        # Skip the 8-byte IDX1 header (magic number, item count).
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        raw_images = imgpath.read()

    # IDX3 header: magic number, image count, rows, cols, each stored as a
    # big-endian 32-bit integer. Reading rows/cols here avoids hard-coding
    # the 28 * 28 = 784 pixel count.
    n_rows = int.from_bytes(raw_images[8:12], 'big')
    n_cols = int.from_bytes(raw_images[12:16], 'big')

    images = np.frombuffer(raw_images, dtype=np.uint8,
                           offset=16).reshape(len(labels), n_rows * n_cols)

    return images, labels

# Load both MNIST splits from data/: the training set and the "t10k" test set.
(X_train, y_train), (X_test, y_test) = (
    load_mnist('data/', kind=split) for split in ('train', 't10k')
)


In [3]:
# Sanity check: show the shape of every loaded array (features, then labels, per split).
tuple(arr.shape for arr in (X_train, y_train, X_test, y_test))


((60000, 784), (60000,), (10000, 784), (10000,))

In [4]:
def measure_elapsed_time(start_time):
    """Print how many seconds have passed since `start_time`.

    `start_time` is a timestamp taken earlier with `time.time()`. The
    difference to the current `time.time()` is printed with two decimals;
    nothing is returned.
    """
    print("Elapsed time: %.2f seconds" % (time.time() - start_time))

In [5]:
# Baseline: fit a random forest on the raw pixel features and time the fit.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)

# Only the call to fit() is timed; constructing the estimator is excluded.
start_time = time.time()
clf.fit(X_train, y_train)
measure_elapsed_time(start_time)


Elapsed time: 8.81 seconds


In [6]:
# Score the fitted forest on the test split and report accuracy in percent.
from sklearn.metrics import accuracy_score

y_pred = clf.predict(X_test)
print("Accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))


Accuracy: 57.79 %


In [7]:
# Reduce dimensionality with PCA, keeping as many components as needed to
# explain 95% of the variance.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)

# Fit the projection on the training split only, then reuse it for the test split.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Show the shapes of the reduced splits.
tuple(reduced.shape for reduced in (X_train_reduced, X_test_reduced))


((60000, 187), (10000, 187))

In [8]:
# Repeat the random-forest experiment on the PCA-reduced features so the fit
# time and test accuracy can be compared with the run on the raw pixels.
clf = RandomForestClassifier(
    random_state=0,
    max_depth=2,
    n_estimators=100,
)

# Time only the fit itself.
start_time = time.time()
clf.fit(X_train_reduced, y_train)
measure_elapsed_time(start_time)

# Score on the PCA-reduced test split.
y_pred = clf.predict(X_test_reduced)
print("Accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))

Elapsed time: 17.65 seconds
Accuracy: 68.35 %


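Against my expectations, the training time doubled although the dimensionality of the training set was reduced. I expected the opposite, because the model has less data to process and should therefore train faster. Dimensionality reduction does not always speed up training, however: the effect depends on the dataset, the model, and the training algorithm.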
The accuracy, on the other hand, went up with Principal Component Analysis. This is likely because the model has fewer dimensions to process and can therefore focus on the most important features, which leads to better accuracy.

In [11]:
# Softmax regression on the full X_train dataset: time the fit, then evaluate
# the resulting model on the test set. LogisticRegression is used with
# multi_class set to "multinomial".
# NOTE(review): the recorded run of this cell ends with an "iterations reached
# limit" warning, i.e. the solver stopped before converging; raising max_iter
# or scaling the inputs would be needed for a converged fit.
from sklearn.linear_model import LogisticRegression

softmax_reg = LogisticRegression(multi_class="multinomial")

# Time only the fit itself.
start_time = time.time()
softmax_reg.fit(X_train, y_train)
measure_elapsed_time(start_time)

# Report test accuracy in percent.
y_pred = softmax_reg.predict(X_test)
print("Accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))


Elapsed time: 23.86 seconds
Accuracy: 84.12 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Softmax regression on the PCA-reduced dataset: time the fit to answer
# whether training was much faster, and by how much.
# NOTE(review): the recorded run of this cell ends with an "iterations reached
# limit" warning, i.e. the solver stopped before converging; raising max_iter
# or scaling the inputs would be needed for a converged fit.
softmax_reg = LogisticRegression(multi_class="multinomial")

# Time only the fit itself.
start_time = time.time()
softmax_reg.fit(X_train_reduced, y_train)
measure_elapsed_time(start_time)

# Report test accuracy in percent on the PCA-reduced test split.
y_pred = softmax_reg.predict(X_test_reduced)
print("Accuracy: %.2f %%" % (accuracy_score(y_test, y_pred) * 100))

Elapsed time: 8.02 seconds
Accuracy: 84.25 %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
