### build datasets from FASHION_MNIST

In [81]:
%matplotlib inline
import sys
import d2lzh as d2l
from mxnet.gluon import data as gdata
from mxnet import autograd, nd

batch_size = 256
transformer = gdata.vision.transforms.ToTensor()
mnist_train, mnist_test = gdata.vision.FashionMNIST(train=True), gdata.vision.FashionMNIST(train=False)

if sys.platform.startswith('win'):
    num_workers = 1
else:
    num_workers = 4
    
####
!echo num_worker: {num_workers} 
####

train_iter, test_iter = \
    gdata.DataLoader(mnist_train.transform_first(transformer), batch_size, shuffle=True, num_workers=num_workers), \
    gdata.DataLoader(mnist_test.transform_first(transformer), batch_size, shuffle=True, num_workers=num_workers)

num_inputs, num_outputs = 28 * 28, 10 # each image's size = 28^2 pixel
W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros((1, num_outputs))

W.attach_grad()
b.attach_grad()

print(W, b)
print(W.shape, b.shape)

num_worker: 4

[[ 3.6774972e-03  1.1755139e-02 -5.4557086e-03 ...  2.0792931e-03
  -2.3649285e-02  9.5707532e-03]
 [-8.4031123e-04  6.1789686e-03  1.0210714e-02 ...  3.0820500e-03
  -2.3839362e-02  8.0350682e-04]
 [-1.0348832e-02  2.2596184e-03 -5.3320164e-03 ...  2.7553646e-03
   2.3408558e-03 -1.5501755e-02]
 ...
 [-9.3191833e-05 -3.9124326e-03  1.2919210e-03 ... -5.0274194e-03
  -4.7465544e-03 -9.8960726e-03]
 [-9.8233540e-03 -1.6794683e-02  1.5876664e-02 ... -4.8428690e-03
  -3.2504566e-03  9.9565303e-03]
 [ 8.9454530e-03 -2.0942942e-03  2.7219688e-03 ... -1.0475126e-02
  -3.6723365e-04 -4.8272088e-03]]
<NDArray 784x10 @cpu(0)> 
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
<NDArray 1x10 @cpu(0)>
(784, 10) (1, 10)


In [82]:
X = nd.array([[1, 2, 3],[4, 5, 6]])
print(X.sum(axis=0, keepdims=True), X.sum(axis=1, keepdims=True))



[[5. 7. 9.]]
<NDArray 1x3 @cpu(0)> 
[[ 6.]
 [15.]]
<NDArray 2x1 @cpu(0)>


In [83]:
def softmax(X):
    # X's rows = samples
    X_exp = X.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition

X = nd.random.normal(shape=(2, 5))

print(X, X.sum(axis=1))
X_prob = softmax(X)
print(X_prob, X_prob.sum(axis=1))

def net(X):
#     print(X.reshape(-1, num_inputs).shape, W.shape)
    return softmax(nd.dot(X.reshape(-1, num_inputs), W) + b)



[[-0.7114856   0.2477707   1.414129   -0.594405    0.05316189]
 [-0.42178673 -1.0713187  -0.6094676   0.42622593 -0.55523616]]
<NDArray 2x5 @cpu(0)> 
[ 0.40917096 -2.2315834 ]
<NDArray 2 @cpu(0)>

[[0.06552974 0.17101656 0.5490111  0.0736692  0.14077342]
 [0.17981592 0.09391608 0.14904566 0.4198705  0.15735182]]
<NDArray 2x5 @cpu(0)> 
[1. 1.]
<NDArray 2 @cpu(0)>


In [84]:
y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = nd.array([0, 2], dtype='int32')
print(nd.pick(y_hat, y))

def cross_entropy(y_hat, y):
    return -nd.pick(y_hat, y).log()

def accuracy(y_hat, y):
    return (y_hat.argmax(axis=1) == y.astype('float32')).mean().asscalar()


[0.1 0.5]
<NDArray 2 @cpu(0)>


In [85]:
def evaluate_net(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        y = y.astype('float32')
        acc_sum = (net(X).argmax(axis=1) == y).sum().asscalar()
        n += y.size
        return acc_sum / n


In [86]:
evaluate_net(test_iter, net)

0.11328125

### train 

In [90]:
num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size, params=None, lr=None, trainer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = .0, .0, 0
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y).sum()
        l.backward()
        if trainer is None:
            d2l.sgd(params, lr, batch_size)
        else:
            trainer.step(batch_size)
        y = y.astype('int32')
        train_l_sum += l.asscalar()
        train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
        n += y.size
        

0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
0.2134 0.5 0
