In [1]:
using Statistics  # mean()
using Random  # randperm()
using Dates

include("./Functional.jl")
include("./Layer.jl")
include("./Optimizer.jl")
include("./MNIST.jl")
;

In [2]:
# Seed the global RNG so that the `randn` weight initialisation and the `randperm`
# shuffling used further down are reproducible from run to run.
Random.seed!(2019)
;

### Load MNIST

In [3]:
# Raw images and labels as returned by the project-local MNIST module.
_x_train = MNIST.images(:train)
_x_test = MNIST.images(:test)
_y_train = MNIST.labels(:train)
_y_test = MNIST.labels(:test)

# Flatten every image into a Float64 column vector and stack the columns side by side,
# giving one sample per column. `reduce(hcat, ...)` is used instead of `hcat(cols...)`
# so that tens of thousands of arrays are not splatted into a single call; the result is
# already a `Matrix{Float64}`, so no extra `convert` is required.
x_train = reduce(hcat, [vec(Float64.(img)) for img in _x_train])
x_test = reduce(hcat, [vec(Float64.(img)) for img in _x_test])

# One-hot encode the labels over the classes 0:9 (one column per sample, judging by the
# sizes printed below).
y_train = Functional.onehot(Float64, _y_train, 0:9)
y_test = Functional.onehot(Float64, _y_test, 0:9)
@show size(x_train), size(x_test), size(y_train), size(y_test)
;

(size(x_train), size(x_test), size(y_train), size(y_test)) = ((784, 60000), (784, 10000), (10, 60000), (10, 10000))


### Define DataLoader

In [4]:
# Stream `(x_batch, y_batch)` mini-batches taken from the columns of `x` and `y`.
#
# Arguments
#   x, y        -- arrays indexed as `[:, cols]`; column `j` of `x` is paired with
#                  column `j` of `y`.
# Keywords
#   batch_size  -- maximum number of columns per batch (must be >= 1).
#   shuffle     -- when true, the columns are visited in a random order drawn with
#                  `randperm` (global RNG); `x` and `y` use the same permutation.
#
# Returns
#   A `Channel` yielding tuples `(x[:, cols], y[:, cols])`. Every batch has exactly
#   `batch_size` columns except possibly the last one, which holds the remainder.
#   Nothing is yielded when `x` has no columns.
#
# Throws
#   ArgumentError      -- if `batch_size < 1`.
#   DimensionMismatch  -- if `x` and `y` have a different number of columns.
function dataloader(x, y; batch_size=1, shuffle=false)
    batch_size >= 1 ||
        throw(ArgumentError("batch_size must be >= 1, got $(batch_size)"))
    data_size = size(x, 2)
    size(y, 2) == data_size || throw(DimensionMismatch(
        "x has $(data_size) columns but y has $(size(y, 2))"))

    function producer(c::Channel)
        # Index through a permutation instead of materialising shuffled copies of the
        # whole dataset; the batches produced are the same.
        order = shuffle ? randperm(data_size) : nothing
        for start in 1:batch_size:data_size
            # Clamp the final batch so it never exceeds `batch_size` columns.
            stop = min(start + batch_size - 1, data_size)
            cols = order === nothing ? (start:stop) : order[start:stop]
            put!(c, (x[:, cols], y[:, cols]))
        end
    end

    return Channel(producer)
end
;

In [5]:
# Two-layer fully connected network parameterised by the element type `T`.
# `predict` applies the layers in the order a1lyr -> relu1lyr -> bn -> a2lyr;
# `softmaxlyr` is only used when computing the loss.
mutable struct TwoLayerNet{T}
    a1lyr::Layer.AffineLayer{T}                  # first affine layer (W1, b1)
    relu1lyr::Layer.ReluLayer                    # ReLU applied to the first affine output
    bn::Layer.BatchNormalization                 # batch normalisation (gamma, beta)
    a2lyr::Layer.AffineLayer{T}                  # second affine layer (W2, b2)
    softmaxlyr::Layer.SoftmaxWithLossLayer{T}    # softmax-with-loss head
    # Parameter list; the constructor fills it with
    # [a1lyr.W, a1lyr.b, a2lyr.W, a2lyr.b, bn.gamma, bn.beta].
    params
end

# Construct a `TwoLayerNet{T}` with randomly initialised weights.
#
# Arguments
#   input_size   -- number of input features.
#   hidden_size  -- number of hidden units.
#   output_size  -- number of output classes.
# Keywords
#   weight_init_std -- scale applied to the standard-normal weight samples.
#
# Every parameter array is created with element type `T`. The scale is converted to
# `T` first so that the multiplication cannot promote the weights to Float64, and the
# batch-normalisation `gamma`/`beta` follow `T` as well.
function (::Type{TwoLayerNet{T}})(
    input_size::Int, hidden_size::Int, output_size::Int; weight_init_std::Real=0.01
) where T
    scale = T(weight_init_std)
    # Sampling order (W1, then W2) is kept so seeded runs stay reproducible.
    W1 = scale .* randn(T, hidden_size, input_size)
    b1 = zeros(T, hidden_size)
    W2 = scale .* randn(T, output_size, hidden_size)
    b2 = zeros(T, output_size)
    gamma = ones(T, hidden_size)
    beta = zeros(T, hidden_size)

    a1lyr = Layer.AffineLayer(W1, b1)
    relu1lyr = Layer.ReluLayer()
    bn = Layer.BatchNormalization(gamma, beta)
    a2lyr = Layer.AffineLayer(W2, b2)
    softmaxlyr = Layer.SoftmaxWithLossLayer{T}()

    # Same ordering as the gradient list returned by `gradient`.
    params = [a1lyr.W, a1lyr.b, a2lyr.W, a2lyr.b, bn.gamma, bn.beta]
    return TwoLayerNet{T}(a1lyr, relu1lyr, bn, a2lyr, softmaxlyr, params)
end

# Write the six parameter arrays in `params` back into the layers of `net`.
# Expected order: W1, b1, W2, b2, gamma, beta.
# NOTE(review): `net.params` itself is not reassigned here -- confirm callers do not
# rely on it tracking the new arrays.
function setparams(net::TwoLayerNet, params)
    first_affine = net.a1lyr
    second_affine = net.a2lyr
    batchnorm = net.bn

    first_affine.W = params[1]
    first_affine.b = params[2]
    second_affine.W = params[3]
    second_affine.b = params[4]
    batchnorm.gamma = params[5]
    batchnorm.beta = params[6]
end

# Forward pass without the loss layer: affine -> ReLU -> batch norm -> affine.
# Returns the output of the second affine layer for the input `x`.
function predict(net::TwoLayerNet{T}, x::AbstractArray{T}) where T
    activated = Layer.forward(net.relu1lyr, Layer.forward(net.a1lyr, x))
    normalized = Layer.forward(net.bn, activated)
    return Layer.forward(net.a2lyr, normalized)
end

# Run `predict` on `x` and feed the result together with the targets `t` through the
# softmax-with-loss layer; returns whatever that layer's `forward` returns (the loss).
function criterion(net::TwoLayerNet{T}, x::AbstractArray{T}, t::AbstractArray{T}) where T
    return Layer.forward(net.softmaxlyr, predict(net, x), t)
end

# Fraction of columns for which the row index of the largest prediction equals the
# row index of the largest entry of the target `t`.
#
# `argmax(A, dims=1)` yields one `CartesianIndex` (row, column) per column, so comparing
# the two results elementwise compares the winning rows column by column. This replaces
# `mapslices(argmax, ...)`, which is slow and type-unstable on tens of thousands of
# columns.
function accuracy(net::TwoLayerNet{T}, x::AbstractArray{T}, t::AbstractArray{T}) where T
    predicted = argmax(predict(net, x), dims=1)
    expected = argmax(t, dims=1)
    return mean(predicted .== expected)
end

# Compute the gradients of the loss for the batch `(x, t)`.
# A forward pass through `criterion` runs first, then the upstream gradient is pushed
# backwards through the layers in reverse order. The returned list is ordered
# [dW1, db1, dW2, db2, dgamma, dbeta].
function gradient(net::TwoLayerNet{T}, x::AbstractArray{T}, t::AbstractArray{T}) where T
    # Forward pass (the layers are expected to keep what their backward pass needs).
    criterion(net, x, t)

    # Backward pass, starting from a unit upstream gradient at the loss layer.
    upstream = Layer.backward(net.softmaxlyr, one(T))
    for lyr in (net.a2lyr, net.bn, net.relu1lyr, net.a1lyr)
        upstream = Layer.backward(lyr, upstream)
    end

    return [
        net.a1lyr.dW, net.a1lyr.db,
        net.a2lyr.dW, net.a2lyr.db,
        net.bn.dgamma, net.bn.dbeta,
    ]
end
;

### Hyperparameters

In [6]:
const iters_num = 10000
const batch_size = 100
const learning_rate = 1e-2
const train_size = size(x_train, 2) # => 60000
# Number of mini-batches per epoch. `cld` keeps this an integer count (600 rather than
# 600.0) and rounds up so that a trailing partial batch is counted too.
const iter_per_epoch = max(cld(train_size, batch_size), 1)  # => 600

# 784 input pixels, 50 hidden units, 10 output classes.
network = TwoLayerNet{Float64}(784, 50, 10)
optimizer = Optimizer.Adam(network, learning_rate)
;

### Train

In [7]:
# History buffers filled by the training loop.
train_loss_list = Vector{Float64}()
train_acc_list = Vector{Float64}()
test_acc_list = Vector{Float64}()

0-element Array{Float64,1}

In [8]:
# Train for a fixed number of epochs (2 here; `iters_num` is the full-length setting).
for epoch = 1:2
    # Start from a Float64 zero so the accumulator never changes type.
    loss = 0.0
    start_time = now()
    batches = dataloader(x_train, y_train, batch_size=600, shuffle=true)
    for (iter, (x_batch, t_batch)) in enumerate(batches)
        grads = gradient(network, x_batch, t_batch)
        Optimizer.step(optimizer, grads)
        # Loss of the batch measured after the parameter update.
        loss += criterion(network, x_batch, t_batch)
        # NOTE(review): this records the running epoch total, not the per-batch loss --
        # confirm that is what the loss history is meant to hold.
        push!(train_loss_list, loss)
        # Accuracy over the complete train and test sets after every step (expensive).
        train_acc = accuracy(network, x_train, y_train)
        test_acc = accuracy(network, x_test, y_test)
        println("$(iter) train_acc=$(train_acc) / test_acc=$(test_acc)")
    end

    # End-of-epoch summary.
    train_acc = accuracy(network, x_train, y_train)
    test_acc = accuracy(network, x_test, y_test)
    push!(train_acc_list, train_acc)
    push!(test_acc_list, test_acc)
    @show loss
    println("$(epoch-1): train_acc=$(train_acc) / test_acc=$(test_acc)")
    # Elapsed wall-clock time for the epoch in seconds (`now()` differences are in ms).
    println((now() - start_time).value / 1000)
end

1 train_acc=0.6551 / test_acc=0.656
2 train_acc=0.7339333333333333 / test_acc=0.7327
3 train_acc=0.7871 / test_acc=0.7939
4 train_acc=0.8319666666666666 / test_acc=0.8402
5 train_acc=0.8197166666666666 / test_acc=0.8232
6 train_acc=0.8518 / test_acc=0.8518
7 train_acc=0.8472833333333334 / test_acc=0.8548
8 train_acc=0.85035 / test_acc=0.8552
9 train_acc=0.8565333333333334 / test_acc=0.8586
10 train_acc=0.8698666666666667 / test_acc=0.877
11 train_acc=0.8723666666666666 / test_acc=0.8786
12 train_acc=0.8728666666666667 / test_acc=0.875
13 train_acc=0.8847666666666667 / test_acc=0.8909
14 train_acc=0.8744166666666666 / test_acc=0.8736
15 train_acc=0.8837666666666667 / test_acc=0.8903
16 train_acc=0.8967166666666667 / test_acc=0.9005
17 train_acc=0.9003833333333333 / test_acc=0.902
18 train_acc=0.90105 / test_acc=0.9046
19 train_acc=0.90205 / test_acc=0.9014
20 train_acc=0.9017666666666667 / test_acc=0.8998
21 train_acc=0.9064333333333333 / test_acc=0.9091
22 train_acc=0.9111666666666667 