# Forward pass and gradient calculation

### Forward functions

In [48]:
# Dummy 28x28 single-channel image of ones; row 2 is scaled to 5 so the input is not uniform.
input = ones(28, 28, 1)
input[2, :, 1] .*= 5

# filters format: (kernel_dim_1, kernel_dim_2, in_channel, out_channel)
filters1 = ones(3, 3, 1, 32)
filters2 = ones(3, 3, 32, 64);

# weights format: (out_dim, in_dim); 1600 = 5 * 5 * 64 features left after two conv/pool stages
linear_weights = ones(10, 1600)
linear_bias = ones(10);

# One-hot target for class 5. Built with `zeros` instead of `undef` memory multiplied by 0:
# uninitialised memory may hold NaN/Inf, and NaN * 0 is still NaN.
label = zeros(Float32, 10, 1)
label[5] = 1

1

In [49]:
# Conv 2d (cross-correlation), stride = 1, no padding, no bias.
#
# input   : (height, width, in_channel) array
# filters : (kernel_dim_1, kernel_dim_2, in_channel, out_channel) array
# returns : (height - kernel_dim_1 + 1, width - kernel_dim_2 + 1, out_channel) Float64 array
#
# Input and kernel may be rectangular. Throws DimensionMismatch when the channel count of
# `input` differs from the in_channel dimension of `filters`.
function conv2d(input, filters)
    kernel_h, kernel_w = size(filters, 1), size(filters, 2)
    in_channels, out_channels = size(filters, 3), size(filters, 4)

    # Without this check, surplus input channels would be silently ignored.
    if size(input, 3) != in_channels
        throw(DimensionMismatch(
            "input has $(size(input, 3)) channel(s) but filters expect $in_channels"))
    end

    out_h = size(input, 1) - kernel_h + 1
    out_w = size(input, 2) - kernel_w + 1
    output = zeros(out_h, out_w, out_channels)

    for n in 1:out_channels
        # Channels are accumulated in increasing order for every output element.
        for c in 1:in_channels
            kernel = view(filters, :, :, c, n)
            for j in 1:out_w
                for i in 1:out_h
                    # Views avoid copying the input window on every step.
                    window = view(input, i:(i + kernel_h - 1), j:(j + kernel_w - 1), c)
                    output[i, j, n] += sum(window .* kernel)
                end
            end
        end
    end

    return output
end

conv2d (generic function with 1 method)

In [50]:
# ReLU activation: element-wise max(0, x). Returns a new array; `input` is not modified.
function relu(input)
    rectified = broadcast(max, 0, input)
    return rectified
end

relu (generic function with 1 method)

In [51]:
# maxpool2d, assumes kernel_size == stride (non-overlapping windows).
#
# input       : (height, width, channel) array
# kernel_size : side length of the square pooling window
# returns     : (height ÷ kernel_size, width ÷ kernel_size, channel) array with the same
#               element type as `input` (previously always Float32, which silently
#               truncated Float64 activations)
#
# Trailing rows/columns that do not fill a whole window are dropped.
function maxpool2d(input, kernel_size)
    n_filters = size(input, 3)
    out_h = div(size(input, 1), kernel_size)
    out_w = div(size(input, 2), kernel_size)
    output = similar(input, out_h, out_w, n_filters)

    for n in 1:n_filters
        for j in 1:out_w
            cols = ((j - 1) * kernel_size + 1):(j * kernel_size)
            for i in 1:out_h
                rows = ((i - 1) * kernel_size + 1):(i * kernel_size)
                # A view avoids copying the window just to reduce it.
                output[i, j, n] = maximum(view(input, rows, cols, n))
            end
        end
    end

    return output
end

maxpool2d (generic function with 1 method)

In [52]:
# flatten: reshape `input` into a single-column matrix of size (number_of_elements, 1),
# keeping the elements in their stored (column-major) order.
function flatten(input)
    n_elements = length(input)
    return reshape(input, n_elements, 1)
end

flatten (generic function with 1 method)

In [53]:
# Fully connected (linear) layer: weights * input + bias.
#
# weights : (out_dim, in_dim) matrix
# input   : (in_dim, n_columns) matrix, or a vector of length in_dim
# bias    : vector of length out_dim, added to every column of the product
# returns : (out_dim, n_columns) matrix (a vector when `input` is a vector)
function linear(input, weights, bias)
    # Broadcasting the bias lets inputs with more than one column work as well; for a
    # single column it gives the same result as plain `+`.
    return weights * input .+ bias
end

linear (generic function with 1 method)

In [54]:
# log softmax using the log-sum-exp (LSE) trick: the maximum is subtracted before
# exponentiating so that exp() cannot overflow. The maximum and the sum are taken over
# ALL elements of `input`, i.e. the whole array is treated as one set of logits.
function log_softmax(input)
    peak = maximum(input)
    shifted_exps = exp.(input .- peak)
    log_normalizer = peak + log(sum(shifted_exps))
    return input .- log_normalizer
end

log_softmax (generic function with 1 method)

In [55]:
# nll_loss, format equivalent to torch.nn.NLLLoss (per the original note).
#
# y      : log-probabilities, indexed along the first dimension
# y_true : one-hot target; the first position holding 1 selects the class
# returns: -y[k] for the first k in 1:size(y)[1] with y_true[k] == 1,
#          or `nothing` when no such position exists
function nll_loss(y, y_true)
    n_classes = size(y)[1]
    k = 1
    while k <= n_classes
        y_true[k] == 1 && return -y[k]
        k += 1
    end
    return nothing
end

nll_loss (generic function with 1 method)

In [56]:
# Forward pass of the network: two (conv2d -> relu -> 2x2 max-pool) stages, then
# flatten -> linear -> log_softmax, and finally the NLL loss against `label`.
output1 = conv2d(input, filters1) |> relu |> x -> maxpool2d(x, 2)
output2 = conv2d(output1, filters2) |> relu |> x -> maxpool2d(x, 2)
preds = linear(flatten(output2), linear_weights, linear_bias) |> log_softmax
loss = nll_loss(preds, label)

2.302585093304515

### Gradient calculations

In [None]:
# nll_loss grad

In [None]:
# log_softmax grad

In [None]:
# linear grad

In [None]:
# flatten grad

In [46]:
# maxpool2d grad


In [27]:
# relu grad: gradient of relu with respect to its input.
#
# input : the array that was fed to relu in the forward pass
# grad  : upstream gradient, same shape as `input`
# returns: a NEW array equal to `grad` where input > 0 and zero elsewhere.
#
# The caller's `grad` is left untouched (the previous version zeroed entries in place,
# although the function name carries no `!`).
function relu_grad(input, grad)
    inactive = input .<= 0          # relu passes no gradient where input <= 0
    masked = copy(grad)
    masked[inactive] .= zero(eltype(grad))
    return masked
end

relu_grad (generic function with 1 method)

In [None]:
# conv2d grad