In [112]:
import itertools

In [1]:
import numpy as np
import pandas as pd

In [29]:
import sklearn.datasets
import sklearn.neighbors
import sklearn.model_selection

In [2]:
import bokeh.plotting
import bokeh.layouts
import bokeh.io

In [4]:
# Direct Bokeh output to the notebook so that later show() calls render inline.
bokeh.io.output_notebook()

In [5]:
def rgba_from_4bit(img_4):
    """Convert a 2-D intensity image on a 0..16 scale to a packed RGBA image.

    The intensity scale is inverted, so 0 becomes white (255) and 16 becomes
    black (0); the same gray level is written to the red, green and blue
    channels and the alpha channel is set to fully opaque.  Rows are flipped
    vertically in the result.

    Parameters
    ----------
    img_4 : 2-D numpy array
        Pixel intensities, expected to lie in the range 0..16.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``uint32`` with the same shape as ``img_4``; the four
        bytes of each element hold the R, G, B and A values of one pixel.
    """
    rows, cols = img_4.shape
    packed = np.empty((rows, cols), dtype=np.uint32)
    # Byte-level view of the packed pixels: one uint8 per channel.
    channels = packed.view(dtype=np.uint8).reshape((rows, cols, 4))

    # Invert the scale: high input intensity -> dark output pixel.
    gray = 255 - img_4 / 16 * 255

    # The image would otherwise be drawn upside-down, so reverse the row order.
    flipped = gray[::-1]
    for channel in range(3):
        channels[:, :, channel] = flipped

    # Alpha: fully visible everywhere.
    channels[:, :, 3] = 255

    return packed

# Goal: build a model to classify handwritten digits
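We will use the handwritten digits data set bundled with scikit-learn (a copy of the test set of the UCI Optical Recognition of Handwritten Digits data, rather than MNIST itself).  Scikit-learn provides 1797 example digits (`DESCR` says 5620, which is the size of the full UCI data set).  A more exhaustive set of 60,000 training examples and 10,000 test cases is available as the MNIST database on [this page](http://yann.lecun.com/exdb/mnist/).  These were compiled by [Yann LeCun](http://yann.lecun.com/), Corinna Cortes, and Christopher J. C. Burges.  This would provide a more exhaustive test for our model.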

Demonstrating the following:
 - cross-validation 
 - learning curves
 - regularization

In [6]:
# Load scikit-learn's bundled handwritten-digits data set; the result exposes
# .images, .data, .target and .DESCR, which the cells below rely on.
digits = sklearn.datasets.load_digits()

In [24]:
# Print the description text that ships with the data set.
print(digits.DESCR)

Optical Recognition of Handwritten Digits Data Set

Notes
-----
Data Set Characteristics:
    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each block. This generates
an input matrix of 8x8 where each element is a

In [11]:
# Draw the first digit image as a small, axis-free figure titled with its label.
p = bokeh.plotting.figure(
    width=110,
    height=100,
    x_range=(0, 8),
    y_range=(0, 8),
    tools='',
    title='Training: {}'.format(digits.target[0]),
)
p.xaxis.visible = False
p.yaxis.visible = False

# Convert the 8x8 intensity image to RGBA and stretch it over the 8x8 plot area.
first_digit_rgba = rgba_from_4bit(digits.images[0])
p.image_rgba(image=[first_digit_rgba], x=0, y=0, dw=8, dh=8)

bokeh.plotting.show(p)

In [12]:
# Show the first `n_plot` digits side by side, each titled with its label.
n_plot = 4  # number of digits to display
w = 80      # width of each small figure
h = 80      # height of each small figure
plots = []

images_and_labels = list(zip(digits.images, digits.target))
# The position in the list is not needed, so iterate over the pairs directly.
for image, label in images_and_labels[:n_plot]:
    img_rgba = rgba_from_4bit(image)
    p = bokeh.plotting.figure(width=w, height=h, tools='',
                              x_range=(0, 8), y_range=(0, 8),
                              title='Training: {}'.format(label))
    p.xaxis.visible = p.yaxis.visible = False
    p.image_rgba(image=[img_rgba], x=0, y=0, dw=8, dh=8)
    plots.append(p)

# A single row containing all of the figures.
grid = bokeh.layouts.gridplot([plots])
bokeh.plotting.show(grid)

In [13]:
# X holds one row of pixel features per sample (each row is reshaped back to
# 8x8 further down); y holds the corresponding digit labels.
X, y = digits.data, digits.target

## Start with the K-nearest Neighbor Classifier (using default 5 neighbors)

In [27]:
# K-nearest-neighbours classifier with the library's default settings
# (5 neighbours, per the section heading above).
classifier = sklearn.neighbors.KNeighborsClassifier()

In [41]:
# Plain 80/20 split with a fixed seed; the class balance is left to chance.
train_X, test_X, train_y, test_y = sklearn.model_selection.train_test_split(
    X, y, train_size=0.8, random_state=123)

print('Without stratification:')
# Report the percentage of samples belonging to each digit class in the full
# data set and in each half of the split.
for heading, targets in (('All:\n', y), ('Training:\n', train_y), ('Test:\n', test_y)):
    print(heading, np.bincount(targets) / float(len(targets)) * 100.0)

Without stratification:
All:
 [  9.90539789  10.1279911    9.84974958  10.1836394   10.07234279
  10.1279911   10.07234279   9.96104619   9.68280467  10.01669449]
Training:
 [  9.67292971  10.29923452   9.81210856  10.43841336   9.67292971
  10.09046625   9.60334029  10.29923452   9.53375087  10.57759221]
Test:
 [ 10.83333333   9.44444444  10.           9.16666667  11.66666667
  10.27777778  11.94444444   8.61111111  10.27777778   7.77777778]


In [42]:
# Same 80/20 split and seed, but stratified on y so that the training and test
# sets keep (approximately) the class proportions of the full data set.
train_X, test_X, train_y, test_y = sklearn.model_selection.train_test_split(
    X, y, train_size=0.8, random_state=123, stratify=y)

print('With stratification:')
# Report the percentage of samples belonging to each digit class in the full
# data set and in each half of the split.
for heading, targets in (('All:\n', y), ('Training:\n', train_y), ('Test:\n', test_y)):
    print(heading, np.bincount(targets) / float(len(targets)) * 100.0)

With stratification:
All:
 [  9.90539789  10.1279911    9.84974958  10.1836394   10.07234279
  10.1279911   10.07234279   9.96104619   9.68280467  10.01669449]
Training:
 [  9.88169798  10.16005567   9.88169798  10.16005567  10.09046625
  10.09046625  10.09046625   9.9512874    9.67292971  10.02087683]
Test:
 [ 10.          10.           9.72222222  10.27777778  10.          10.27777778
  10.          10.           9.72222222  10.        ]


In [45]:
# Train on the training split, then predict the held-out test samples.
classifier.fit(train_X, train_y)
pred_y = classifier.predict(test_X)

# Accuracy = number of matching predictions divided by the test-set size.
n_correct = (pred_y == test_y).sum()
n_test = float(len(test_y))

print("Fraction Correct [Accuracy]:")
print(n_correct / n_test)

Fraction Correct [Accuracy]:
0.991666666667


In [46]:
# Boolean mask over the test set: True where the prediction matches the label.
is_correct = pred_y == test_y

# Positions (within the test set) of the hits and of the misses.
correct_idx = np.flatnonzero(is_correct)
incorrect_idx = np.flatnonzero(~is_correct)

print('Samples correctly classified:')
print(correct_idx)

print('\nSamples incorrectly classified:')
print(incorrect_idx)

Samples correctly classified:
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  22  23  24  25  26  27  28  29  31  32  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 180 181 182
 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
 237 238 239 240 241 

In [63]:
# One small figure per misclassified test sample, titled with the predicted (P)
# and actual (A) digit.
w = 80
h = 80
plots = []

missed = zip(test_X[incorrect_idx], pred_y[incorrect_idx], test_y[incorrect_idx])
for features, predicted, actual in missed:
    # Each sample is stored flat; restore the 8x8 image before converting it.
    img_rgba = rgba_from_4bit(features.reshape(8, 8))
    p = bokeh.plotting.figure(
        width=w,
        height=h,
        tools='',
        x_range=(0, 8),
        y_range=(0, 8),
        title='P: {}, A: {}'.format(predicted, actual),
    )
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.image_rgba(image=[img_rgba], x=0, y=0, dw=8, dh=8)
    plots.append(p)

# A single row containing all of the figures.
grid = bokeh.layouts.gridplot([plots])
print("Incorrect Predictions:")
bokeh.plotting.show(grid)

Incorrect Predictions:


## What if we treat this as unsupervised learning, i.e., cluster the digits without considering the labels?

In [68]:
import sklearn.cluster
import sklearn.metrics

In [66]:
# Cluster all samples into 10 groups (one per digit) without using y; the fixed
# random_state keeps the result reproducible.
kmeans = sklearn.cluster.KMeans(n_clusters=10, random_state=123)
# labels[i] is the cluster index assigned to sample i.  The indices are
# arbitrary group ids, not digit values (see the mapping worked out below).
labels = kmeans.fit_predict(X)

In [67]:
# Peek at the cluster index assigned to each sample.
labels

array([7, 4, 4, ..., 4, 3, 3], dtype=int32)

In [74]:
# NOTE: despite the variable name and the printed label, this value is the
# adjusted Rand index between the true labels and the cluster assignments,
# i.e. a clustering-agreement score rather than a classification accuracy.
accuracy = sklearn.metrics.adjusted_rand_score(y, labels)
print('Accuracy score:', accuracy)

Accuracy score: 0.665734419413


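Though I would have hoped for a better score (note that this is the adjusted Rand index, which measures the agreement between the clusters and the true labels, rather than a classification accuracy), this demonstrates the large amount of variation in the data.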

In [76]:
# Cross-tabulate true digits (first argument) against cluster indices (second
# argument).  Because the cluster indices are arbitrary, the large counts do
# not fall on the diagonal yet.
confusion_matrix = sklearn.metrics.confusion_matrix(y, labels)
print(confusion_matrix)

[[  0   0   0   0   0   0   0 177   1   0]
 [  2   0  24   0  99  55   1   0   0   1]
 [  0   3 148   2   8   2  13   1   0   0]
 [  0   7   1   9   7   0 157   0   0   2]
 [  0   6   0   0   4   7   0   0 163   1]
 [  1   0   0  48   0   0   1   0   2 130]
 [177   0   0   0   2   1   0   1   0   0]
 [  0 175   0   0   4   0   0   0   0   0]
 [  2   4   3  50 102   6   2   0   0   5]
 [  0   7   0 139   2  20   6   0   0   6]]


It would be helpful to sort out the labels to better understand which numbers were confused.  The maximum values in the confusion matrix provide the needed information to sort this out.

In [130]:
# First attempt: for every true digit (row), take the cluster that received the
# most of its samples and record cluster -> digit.
key = {}
for digit, row in enumerate(confusion_matrix):
    group = row.argmax()
    key[group] = digit
    print('group {} should be {}'.format(group, digit))

group 7 should be 0
group 4 should be 1
group 2 should be 2
group 6 should be 3
group 8 should be 4
group 9 should be 5
group 0 should be 6
group 1 should be 7
group 4 should be 8
group 3 should be 9


That almost worked, except that group 4 maps to both 1 and 8, so the second assignment overwrites the first in `key`.  We should be able to sort using the columns instead.

In [131]:
# Second attempt: for every cluster (column), take the true digit that
# contributes the most samples to it and record cluster -> digit.
dominant_digit = confusion_matrix.argmax(axis=0)
key = {}
for group in range(10):
    key[group] = dominant_digit[group]
    print('group {} should be {}'.format(group, key[group]))

group 0 should be 6
group 1 should be 7
group 2 should be 2
group 3 should be 9
group 4 should be 8
group 5 should be 1
group 6 should be 3
group 7 should be 0
group 8 should be 4
group 9 should be 5


This resolved the conflict between 1 and 8, though there is obviously much confusion between those.  Let's view the sorted matrix.

In [114]:
# Display the cluster-index -> digit mapping built in the previous cell.
key

{0: 6, 1: 7, 2: 2, 3: 9, 4: 8, 5: 1, 6: 3, 7: 0, 8: 4, 9: 5}

In [125]:
# Re-order the columns of the confusion matrix so that column j holds the
# samples assigned to the cluster that `key` identifies as digit j.
#
# Start from zeros and accumulate instead of filling an np.empty_like buffer:
# if two clusters ever map to the same digit, their counts are merged and the
# digit that no cluster maps to gets an all-zero column, rather than one column
# being silently overwritten and another left as uninitialised memory.  When
# `key` is one-to-one the result is identical to a plain column permutation.
sorted_matrix = np.zeros_like(confusion_matrix)
for group, digit in key.items():
    sorted_matrix[:, digit] += confusion_matrix[:, group]
print(sorted_matrix)

[[177   0   0   0   1   0   0   0   0   0]
 [  0  55  24   1   0   1   2   0  99   0]
 [  1   2 148  13   0   0   0   3   8   2]
 [  0   0   1 157   0   2   0   7   7   9]
 [  0   7   0   0 163   1   0   6   4   0]
 [  0   0   0   1   2 130   1   0   0  48]
 [  1   1   0   0   0   0 177   0   2   0]
 [  0   0   0   0   0   0   0 175   4   0]
 [  0   6   3   2   0   5   2   4 102  50]
 [  0  20   0   6   0   6   0   7   2 139]]


Let's check to make sure we did this right.

In [127]:
# Translate every cluster index into the digit it was matched with, keeping the
# dtype of the original label array.
sorted_labels = np.array([key[group] for group in labels], dtype=labels.dtype)

In [132]:
# Recompute the confusion matrix from the relabelled clusters; it should match
# the column-reordered matrix printed above.
sorted_confusion_matrix = sklearn.metrics.confusion_matrix(y, sorted_labels)
print(sorted_confusion_matrix)

[[177   0   0   0   1   0   0   0   0   0]
 [  0  55  24   1   0   1   2   0  99   0]
 [  1   2 148  13   0   0   0   3   8   2]
 [  0   0   1 157   0   2   0   7   7   9]
 [  0   7   0   0 163   1   0   6   4   0]
 [  0   0   0   1   2 130   1   0   0  48]
 [  1   1   0   0   0   0 177   0   2   0]
 [  0   0   0   0   0   0   0 175   4   0]
 [  0   6   3   2   0   5   2   4 102  50]
 [  0  20   0   6   0   6   0   7   2 139]]


Looks good.  So the clustering algorithm did best on 0, 4, 6, and 7, but did quite terribly on 1.

In [147]:
# Count, for every digit 0..9, how many samples truly are that digit (n_a) and
# how many were assigned to it by the relabelled clustering (n_p).
n_a = np.array([np.count_nonzero(y == digit) for digit in range(10)])
n_p = np.array([np.count_nonzero(sorted_labels == digit) for digit in range(10)])

print('number of each:   {}'.format(n_a))
print('number predicted: {}'.format(n_p))

number of each:   [178 182 177 183 181 182 181 179 174 180]
number predicted: [179  91 176 180 166 145 182 202 228 248]


We can also get this by summing the rows and columns of the confusion matrix.

In [133]:
# Sum across each row: this reproduces the "number of each" counts (n_a) above.
sorted_confusion_matrix.sum(axis=1)

array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180])

So summing along `axis=1` (across each row) gives the number of samples of each actual digit: row *i* shows how the samples of actual digit *i* were classified,

In [134]:
# Sum down each column: this reproduces the "number predicted" counts (n_p) above.
sorted_confusion_matrix.sum(axis=0)

array([179,  91, 176, 180, 166, 145, 182, 202, 228, 248])

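and summing along `axis=0` (down each column) gives the number of samples assigned to each group: column *j* shows the breakdown of which actual digits were grouped together as *j* by the clustering algorithm.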

Let's show this as a heatmap.

In [169]:
# Rescale the counts so that the largest entry becomes 16, the top of the
# intensity scale that rgba_from_4bit expects.
scaled_counts = sorted_confusion_matrix * 16 / sorted_confusion_matrix.max()

# "Positive" image: large counts are drawn dark.  "Negative" image: the scale
# is mirrored (16 - value), so large counts are drawn light.
img_pos = rgba_from_4bit(scaled_counts)
img_neg = rgba_from_4bit((scaled_counts - 16) * -1)

# Both panels share the same size and ranges; only the first carries the title.
panel_options = dict(width=200, height=200, x_range=(0, 10), y_range=(0, 10), tools='')
p_pos = bokeh.plotting.figure(title='Confusion Matrix', **panel_options)
p_neg = bokeh.plotting.figure(**panel_options)

for panel, img in ((p_pos, img_pos), (p_neg, img_neg)):
    panel.xaxis.visible = False
    panel.yaxis.visible = False
    panel.image_rgba(image=[img], x=0, y=0, dw=10, dh=10)

# Place the two panels side by side.
plots = bokeh.layouts.gridplot([[p_pos, p_neg]])
bokeh.plotting.show(plots)