In [1]:
using HDF5
using CuArrays, CUDAnative, CUDAdrv
using Statistics
using BenchmarkTools
using Test

In [2]:
include("../../NeuralNetworkGPU.jl")

predict (generic function with 1 method)

In [3]:
TRAIN_DATA_PATH="datasets/train_catvnoncat.h5"
TEST_DATA_PATH="datasets/test_catvnoncat.h5"

"datasets/test_catvnoncat.h5"

In [4]:
train_data_x_orig = h5read(TRAIN_DATA_PATH, "train_set_x")
train_data_y_orig = h5read(TRAIN_DATA_PATH, "train_set_y")
print()

In [5]:
test_data_x_orig = h5read(TEST_DATA_PATH, "test_set_x")
test_data_y_orig = h5read(TEST_DATA_PATH, "test_set_y")
print()

In [6]:
classes = h5read(TEST_DATA_PATH, "list_classes")

2-element Array{String,1}:
 "non-cat"
 "cat"    

In [7]:
train_data_y_orig = reshape(train_data_y_orig, (1, size(train_data_y_orig)[1]))

1×209 Array{Int64,2}:
 0  0  1  0  0  0  0  1  0  0  0  1  0  …  1  0  0  1  0  0  0  0  0  0  0  0

In [8]:
test_data_y_orig = reshape(test_data_y_orig, (1, size(test_data_y_orig)[1]))

1×50 Array{Int64,2}:
 1  1  1  1  1  0  1  1  1  1  1  1  1  …  0  0  1  1  1  0  0  0  1  1  1  0

In [9]:
function reshape_image_array(image_array::Array{UInt8,4})
    return reshape(image_array, :, size(image_array)[4])
end
function reshape_image_array_reverse(image_array::Array{UInt8,2})
    return reshape(image_array, 64, 64, 3, :)
end

reshape_image_array_reverse (generic function with 1 method)

In [10]:
test_data_x = reshape_image_array(test_data_x_orig) ./ 255
test_data_x = CuArray{Float32, 2}(test_data_x)

12288×50 CuArray{Float32,2,Nothing}:
 0.619608  0.45098   1.0       0.996078  …  0.160784  0.0705882  0.521569 
 0.407843  0.431373  0.992157  0.996078     0.184314  0.0705882  0.639216 
 0.32549   0.435294  0.996078  0.952941     0.329412  0.0627451  0.294118 
 0.631373  0.537255  1.0       1.0          0.282353  0.137255   0.384314 
 0.415686  0.505882  0.992157  1.0          0.305882  0.141176   0.470588 
 0.333333  0.505882  0.996078  0.964706  …  0.439216  0.121569   0.172549 
 0.635294  0.607843  1.0       1.0          0.403922  0.211765   0.423529 
 0.419608  0.572549  0.992157  1.0          0.4       0.223529   0.517647 
 0.329412  0.568627  0.996078  0.992157     0.486275  0.203922   0.184314 
 0.639216  0.623529  1.0       1.0          0.466667  0.313726   0.462745 
 0.419608  0.596078  0.992157  1.0       …  0.447059  0.345098   0.576471 
 0.329412  0.572549  0.996078  1.0          0.505882  0.313726   0.223529 
 0.639216  0.607843  1.0       1.0          0.529412  0.407843 

In [11]:
train_data_x = reshape_image_array(train_data_x_orig) ./ 255
train_data_x = CuArray{Float32, 2}(train_data_x)

12288×209 CuArray{Float32,2,Nothing}:
 0.0666667  0.768627  0.321569  0.00392157  …  0.560784  0.0862745  0.0313726
 0.121569   0.752941  0.278431  0.0862745      0.607843  0.0941176  0.109804 
 0.219608   0.745098  0.266667  0.00784314     0.647059  0.0901961  0.207843 
 0.0862745  0.756863  0.34902   0.00392157     0.721569  0.0901961  0.054902 
 0.129412   0.729412  0.32549   0.054902       0.745098  0.0980392  0.129412 
 0.231373   0.713726  0.32549   0.00784314  …  0.776471  0.0941176  0.227451 
 0.0980392  0.737255  0.392157  0.00392157     0.556863  0.0941176  0.0745098
 0.137255   0.701961  0.384314  0.0509804      0.584314  0.101961   0.137255 
 0.243137   0.682353  0.407843  0.00392157     0.607843  0.0980392  0.239216 
 0.0980392  0.835294  0.415686  0.0156863      0.529412  0.0941176  0.0745098
 0.137255   0.760784  0.411765  0.113725    …  0.580392  0.101961   0.137255 
 0.243137   0.756863  0.454902  0.0117647      0.596078  0.105882   0.239216 
 0.105882   0.658824  0.46

In [12]:
CuArrays.memory_status()

Effective GPU memory usage: 17.50% (1.050 GiB/6.000 GiB)
CuArrays GPU memory usage: 20.000 MiB
BinnedPool usage: 20.000 MiB (20.000 MiB allocated, 0 bytes cached)
BinnedPool efficiency: 60.70% (12.141 MiB requested, 20.000 MiB allocated)


In [13]:
layer_dims = Array([12288, 7, 1])

3-element Array{Int64,1}:
 12288
     7
     1

In [15]:
parameters, activations = neural_network_dense(train_data_x, train_data_y_orig, layer_dims, 3000, 0.0075)

Cost at iteration 100 is 0.5816171
Cost at iteration 200 is 0.51456255
Cost at iteration 300 is 0.4448268
Cost at iteration 400 is 0.3819224
Cost at iteration 500 is 0.39389595
Cost at iteration 600 is 0.35948426
Cost at iteration 700 is 0.32709426
Cost at iteration 800 is 0.29524958
Cost at iteration 900 is 0.26424006
Cost at iteration 1000 is 0.22269034
Cost at iteration 1100 is 0.15306877
Cost at iteration 1200 is 0.27600452
Cost at iteration 1300 is 0.095913894
Cost at iteration 1400 is 0.080693506
Cost at iteration 1500 is 0.06868048
Cost at iteration 1600 is 0.059079718
Cost at iteration 1700 is 0.051345762
Cost at iteration 1800 is 0.045038708
Cost at iteration 1900 is 0.039841387
Cost at iteration 2000 is 0.03551355
Cost at iteration 2100 is 0.031871565
Cost at iteration 2200 is 0.028786812
Cost at iteration 2300 is 0.026154319
Cost at iteration 2400 is 0.023884112
Cost at iteration 2500 is 0.021920113
Cost at iteration 2600 is 0.020203924
Cost at iteration 2700 is 0.018698698


(Dict{String,CuArray{Float32,N,P} where P where N}("W2" => [0.689119 0.5266208 … 0.9174756 0.7191226],"W1" => [0.008565683 0.0077575734 … 0.000832319 0.0034043246; -0.02076472 0.0035111152 … -0.010130825 0.013295293; … ; 0.004605219 -0.0029152175 … 0.002857751 0.002700298; 0.0018913064 -0.008112621 … 0.0022392953 0.03170336],"b2" => [-0.019859383],"b1" => [-0.0004114479; -0.0008891643; … ; -0.0015644903; -0.0022712783]), (relu, sigmoid))

In [16]:
@btime neural_network_dense($train_data_x, $train_data_y_orig, $layer_dims, 100, 0.0075)

Cost at iteration 100 is 0.52755755
Cost at iteration 100 is 0.5076571
Cost at iteration 100 is 0.5328277
Cost at iteration 100 is 0.5458453
Cost at iteration 100 is 0.55733985
Cost at iteration 100 is 0.5410472
Cost at iteration 100 is 0.5340075
Cost at iteration 100 is 0.66955894
Cost at iteration 100 is 0.575934
Cost at iteration 100 is 0.53449607
Cost at iteration 100 is 0.6043163
Cost at iteration 100 is 0.53845406
Cost at iteration 100 is 0.53691083
Cost at iteration 100 is 0.5546509
Cost at iteration 100 is 0.5764237
Cost at iteration 100 is 0.6095794
Cost at iteration 100 is 0.55004054
Cost at iteration 100 is 0.5238911
Cost at iteration 100 is 0.5643055
Cost at iteration 100 is 0.54129964
Cost at iteration 100 is 0.5547045
Cost at iteration 100 is 0.51433074
Cost at iteration 100 is 0.545058
Cost at iteration 100 is 0.5783218
Cost at iteration 100 is 0.5001445
Cost at iteration 100 is 0.5469671
Cost at iteration 100 is 0.537676
Cost at iteration 100 is 0.50987905
Cost at itera

(Dict{String,CuArray{Float32,N,P} where P where N}("W2" => [0.18218294 0.2734602 … 0.4839038 0.103254825],"W1" => [0.0020584152 0.0030009705 … -0.0012401458 -0.022073034; -0.0045128562 0.001017429 … 0.0003041255 0.011447717; … ; -0.0010683105 -0.0035073983 … -0.0031044902 0.011571417; -0.008762885 -0.008650047 … 0.0036444594 0.002323672],"b2" => [-0.015071838],"b1" => [-0.00069132744; -0.0007818891; … ; -0.00092582987; -0.0005512133]), (relu, sigmoid))

In [17]:
# TODO: copy array to CPU while checking
predicts, accuracy = predict(train_data_x, train_data_y_orig, parameters, activations)

Accuracy is 100.0%


└ @ GPUArrays C:\Users\SarnayakPC\.julia\packages\GPUArrays\1wgPO\src\indexing.jl:16


([0.0 0.0 … 0.0 0.0], 1.0)

In [18]:
predicts, accuracy = predict(test_data_x, test_data_y_orig, parameters, activations)

Accuracy is 72.0%


([1.0 1.0 … 1.0 0.0], 0.72)

## Benchmarking

In [35]:
@btime initialize_parameters($layer_dims)

  550.300 μs (456 allocations: 1.33 MiB)


Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.216067 0.0322727 … -0.23179 -0.293956]
  "W1" => Float32[-0.000723434 -0.000757421 … 0.00656295 0.00455156; -0.0060327…
  "b2" => Float32[0.0]
  "b1" => Float32[0.0; 0.0; … ; 0.0; 0.0]

In [19]:
parameters = initialize_parameters(layer_dims)

Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.0563876 -0.25896 … 0.439327 -0.666383]
  "W1" => Float32[0.00242421 -0.0123454 … 0.00564323 -0.00678052; -0.0089758 -0…
  "b2" => Float32[0.0]
  "b1" => Float32[0.0; 0.0; … ; 0.0; 0.0]

In [21]:
activations = (relu, sigmoid)

(relu, sigmoid)

In [22]:
y, caches = forward_prop(train_data_x, parameters, activations)

(Float32[0.5304502 0.5698232 … 0.50635797 0.5009594], Dict{String,CuArray{Float32,N,P} where P where N}("A2" => [0.5304502 0.5698232 … 0.50635797 0.5009594],"A1" => [0.0 0.0 … 0.052209593 0.0; 0.06079917 0.14268337 … 0.23181947 0.13214186; … ; 0.31342584 0.69044226 … 0.1831061 0.072554246; 0.0 0.0 … 0.004285112 0.0],"A0" => [0.06666667 0.76862746 … 0.08627451 0.03137255; 0.12156863 0.7529412 … 0.09411765 0.10980392; … ; 0.0 0.3137255 … 0.019607844 0.0; 0.0 0.31764707 … 0.0 0.0],"Z2" => [0.121951744 0.2811298 … 0.025433421 0.0038376711],"Z1" => [-0.11178262 -0.2187877 … 0.052209593 -0.025232561; 0.06079917 0.14268337 … 0.23181947 0.13214186; … ; 0.31342584 0.69044226 … 0.1831061 0.072554246; -0.0073025897 -0.47237018 … 0.004285112 -0.2101373]))

In [50]:
@btime forward_prop($train_data_x, $parameters, $activations)

  61.801 μs (361 allocations: 15.42 KiB)


(Float32[0.4913846 0.48672014 … 0.507836 0.4977944], Dict{String,CuArray{Float32,N,P} where P where N}("A2" => [0.4913846 0.48672014 … 0.507836 0.4977944],"A1" => [0.30746502 0.955148 … 0.07151897 0.17900674; 0.18975675 0.6897727 … 0.14214121 0.27145642; … ; 0.0 0.0 … 0.021278799 0.0; 0.03939242 0.15467197 … 0.046604116 0.17541131],"A0" => [0.06666667 0.76862746 … 0.08627451 0.03137255; 0.12156863 0.7529412 … 0.09411765 0.10980392; … ; 0.0 0.3137255 … 0.019607844 0.0; 0.0 0.31764707 … 0.0 0.0],"Z2" => [-0.034465075 -0.053131934 … 0.03134637 -0.008822517],"Z1" => [0.30746502 0.955148 … 0.07151897 0.17900674; 0.18975675 0.6897727 … 0.14214121 0.27145642; … ; -0.47897834 -0.7099412 … 0.021278799 -0.25276715; 0.03939242 0.15467197 … 0.046604116 0.17541131]))

In [23]:
y_orig = CuArray{Float32, 2}(train_data_y_orig)
cost = cost_binary(y_orig, y)

0.70694107f0

In [24]:
@btime CuArrays.@sync cost_binary($y_orig, $y)

  109.300 μs (104 allocations: 6.97 KiB)


0.70694107f0

In [30]:
# Check the time taken in computing only the vector part of binary cost (without sum)
bin_loss(Y, Ŷ) = CuArrays.@sync Y .* log.(Ŷ) .+ (1 .- Y) .* log.(1 .- Ŷ)

bin_loss (generic function with 1 method)

In [31]:
bin_loss(y_orig, y)

1×209 CuArray{Float32,2,Nothing}:
 -0.755981  -0.843559  -0.662115  …  -0.78873  -0.705945  -0.695068

In [32]:
@benchmark bin_loss($y_orig, $y)

BenchmarkTools.Trial: 
  memory estimate:  5.83 KiB
  allocs estimate:  95
  --------------
  minimum time:     60.700 μs (0.00% GC)
  median time:      87.800 μs (0.00% GC)
  mean time:        95.031 μs (0.45% GC)
  maximum time:     13.035 ms (32.50% GC)
  --------------
  samples:          10000
  evals/sample:     1

### Benchmark Backprop

In [50]:
activations_back = (relu_back, sigmoid_back)
grads = backward_prop(y_orig, y, parameters, caches, layer_dims, activations_back)

Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "dw1" => Float32[0.00204557 0.00242011 … 0.00205558 0.00110376; -0.0133052 -0…
  "dw2" => Float32[0.0155401 0.0583706 … 0.0687734 0.00538647]
  "db2" => Float32[0.169959]
  "db1" => Float32[0.00570629; -0.041131; … ; 0.0704166; -0.0231952]

In [51]:
@benchmark backward_prop($y_orig, $y, $parameters, $caches, $layer_dims, $activations_back)

BenchmarkTools.Trial: 
  memory estimate:  1.35 MiB
  allocs estimate:  937
  --------------
  minimum time:     1.155 ms (0.00% GC)
  median time:      1.245 ms (0.00% GC)
  mean time:        1.403 ms (4.54% GC)
  maximum time:     6.330 ms (67.70% GC)
  --------------
  samples:          3553
  evals/sample:     1

In [36]:
last_da(Y, Ŷ) = CuArrays.@sync (.- Y ./ Ŷ .+ (1 .- Y) ./ (1 .- Ŷ))

last_da (generic function with 1 method)

In [37]:
last_da(y_orig, y)

1×209 CuArray{Float32,2,Nothing}:
 2.1297  2.32463  -1.93889  2.08738  …  1.98281  2.2006  2.02576  2.00384

In [38]:
@benchmark last_da($y_orig, $y)

BenchmarkTools.Trial: 
  memory estimate:  6.16 KiB
  allocs estimate:  96
  --------------
  minimum time:     65.500 μs (0.00% GC)
  median time:      84.000 μs (0.00% GC)
  mean time:        87.348 μs (0.50% GC)
  maximum time:     11.569 ms (38.02% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [39]:
# Check how much more time it takes to check length, check NaNs, etc. Result: not much
function last_da_2(y_orig, y)
    CuArrays.@sync begin
        @assert length(y_orig) == length(y)
        dA = last_da(y_orig, y)
        if all(isnan.(dA))
            println("dA was NaN!")
            dA = randn(Float32)
        end
    end
    return dA
end

last_da_2 (generic function with 1 method)

In [40]:
last_da_2(y_orig, y)

1×209 CuArray{Float32,2,Nothing}:
 2.1297  2.32463  -1.93889  2.08738  …  1.98281  2.2006  2.02576  2.00384

In [41]:
@benchmark last_da($y_orig, $y)

BenchmarkTools.Trial: 
  memory estimate:  6.16 KiB
  allocs estimate:  96
  --------------
  minimum time:     66.300 μs (0.00% GC)
  median time:      84.500 μs (0.00% GC)
  mean time:        88.526 μs (0.53% GC)
  maximum time:     12.236 ms (38.43% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [42]:
# Check each step of back prop
dZ_calc(dA, Z, activation) = CuArrays.@sync dA .* activation.(Z)
dw_calc(dZ, A_prev, m) = CuArrays.@sync 1/m .* (dZ * transpose(A_prev))
db_calc(dZ, m) = CuArrays.@sync 1/m .* sum(dZ, dims=2)

db_calc (generic function with 1 method)

In [43]:
m = size(y)[2]
dA = last_da_2(y_orig, y)
dZ = dZ_calc(dA, caches["Z2"], sigmoid_back)
dw = dw_calc(dZ, caches["A1"], m)
db = db_calc(dZ, m)

1×1 CuArray{Float64,2,Nothing}:
 0.16995878082713442

In [44]:
@btime dZ_calc($dA, $caches["Z2"], $sigmoid_back)

  55.000 μs (78 allocations: 3.17 KiB)


1×209 CuArray{Float32,2,Nothing}:
 0.53045  0.569823  -0.484241  0.520931  …  0.545579  0.506358  0.500959

In [45]:
@btime dw_calc($dZ, $caches["A1"], $m)

  75.900 μs (81 allocations: 3.02 KiB)


1×7 CuArray{Float64,2,Nothing}:
 0.0155401  0.0583706  -0.000369596  …  0.00939854  0.0687734  0.00538647

In [46]:
# this takes the longest time since it involves a sum
@btime db_calc($dZ, m)

  130.500 μs (165 allocations: 6.00 KiB)


1×1 CuArray{Float64,2,Nothing}:
 0.16995878082713442

In [48]:
CuArrays.memory_status()

Effective GPU memory usage: 18.70% (1.122 GiB/6.000 GiB)
CuArrays GPU memory usage: 22.052 MiB
BinnedPool usage: 22.052 MiB (22.044 MiB allocated, 8.000 KiB cached)
BinnedPool efficiency: 61.16% (13.486 MiB requested, 22.052 MiB allocated)


In [52]:
learning_rate = 0.000001
parameters = update_parameters(parameters, grads, layer_dims, learning_rate)

Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.0563876 -0.25896 … 0.439327 -0.666383]
  "W1" => Float32[0.0024242 -0.0123454 … 0.00564323 -0.00678052; -0.00897579 -0…
  "b2" => Float32[-1.69959e-7]
  "b1" => Float32[-5.70629e-9; 4.1131e-8; … ; -7.04166e-8; 2.31952e-8]

In [53]:
@benchmark update_parameters($parameters, $grads, $layer_dims, $learning_rate)

BenchmarkTools.Trial: 
  memory estimate:  12.83 KiB
  allocs estimate:  333
  --------------
  minimum time:     97.600 μs (0.00% GC)
  median time:      114.999 μs (0.00% GC)
  mean time:        121.925 μs (1.16% GC)
  maximum time:     8.119 ms (96.71% GC)
  --------------
  samples:          10000
  evals/sample:     1

## Custom update params kernel

Note: this kernel has now been integrated into NeuralNetworkGPU.jl (the above results are from this kernel)

In [54]:
function update_params_kernel!(params, lr, grads, m, n)
    tx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    ty = (blockIdx().y-1) * blockDim().y + threadIdx().y
#     stride_x = blockDim().x * gridDim().x
#     stride_y = blockDim().y * gridDim().y
#     @cuprintln("tx $tx, ty $ty")
    if tx <= m && ty <= n
        @inbounds params[tx, ty] = params[tx, ty] - lr * grads[tx, ty]
    end
#     @cuprintln("done $tx $ty")
    return nothing
end

update_params_kernel! (generic function with 1 method)

In [55]:
n = 10
m = 20
params_orig = rand(Float32, m, n)
params_ = CuArray(params_orig)
lr = 0.01f0
grads_orig = rand(Float32, m, n) / 10
grads_ = CuArray(grads_orig)
# TILE_WIDTH = 2
# TILE_HEIGHT = 2
# blocks = (TILE_WIDTH,TILE_HEIGHT,1)
# threads = (floor(Int,(n)/blocks[1]),
#            floor(Int,(m)/blocks[2]),
#            1)
# threads = (m, n)
function get_threads_blocks(m, n)
    nthreads = 32
    threads = (min(nthreads, m), min(nthreads, n))
    blocks = (cld(m, threads[1]), cld(n, threads[2]))
    return threads, blocks
end
threads, blocks = get_threads_blocks(m, n)

((20, 10), (1, 1))

In [56]:
# threads = (m, n)
@show blocks
@show threads
@show (m, n)

blocks = (1, 1)
threads = (20, 10)
(m, n) = (20, 10)


(20, 10)

In [57]:
CuArrays.memory_status()

Effective GPU memory usage: 18.73% (1.124 GiB/6.000 GiB)
CuArrays GPU memory usage: 23.046 MiB
BinnedPool usage: 23.046 MiB (22.546 MiB allocated, 512.004 KiB cached)
BinnedPool efficiency: 59.95% (13.816 MiB requested, 23.046 MiB allocated)


In [58]:
CuArrays.@sync @cuda blocks=blocks threads=threads update_params_kernel!(params_, lr, grads_, m, n)

In [59]:
synchronize()

In [60]:
params_correct = params_orig - lr * grads_orig

20×10 Array{Float32,2}:
 0.68085    0.238199   0.281587   …  0.682398   0.164228  0.484359
 0.229621   0.0625436  0.61031       0.0308379  0.734675  0.415978
 0.157855   0.697447   0.165305      0.818794   0.699523  0.642308
 0.0752252  0.667901   0.0436002     0.993286   0.819965  0.266314
 0.897432   0.148809   0.815467      0.61547    0.706034  0.81167 
 0.952269   0.127945   0.901322   …  0.294449   0.829268  0.711455
 0.759238   0.812402   0.0364975     0.753732   0.385238  0.706799
 0.190172   0.157707   0.785977      0.568751   0.865423  0.770159
 0.0350583  0.210437   0.322676      0.7267     0.904937  0.244403
 0.0928938  0.358988   0.740606      0.318996   0.433109  0.985708
 0.31892    0.236091   0.490164   …  0.377191   0.7841    0.764927
 0.370984   0.878873   0.907469      0.726703   0.795002  0.571635
 0.914992   0.418913   0.398795      0.0238975  0.134215  0.463585
 0.882399   0.885625   0.686047      0.12916    0.402672  0.434983
 0.307297   0.0693293  0.658459      0

In [61]:
params_

20×10 CuArray{Float32,2,Nothing}:
 0.68085    0.238199   0.281587   …  0.682398   0.164228  0.484359
 0.229621   0.0625436  0.61031       0.0308379  0.734675  0.415978
 0.157855   0.697447   0.165305      0.818794   0.699523  0.642308
 0.0752252  0.667901   0.0436002     0.993286   0.819965  0.266314
 0.897432   0.148809   0.815467      0.61547    0.706034  0.81167 
 0.952269   0.127945   0.901322   …  0.294449   0.829268  0.711455
 0.759238   0.812402   0.0364975     0.753732   0.385238  0.706799
 0.190172   0.157707   0.785977      0.568751   0.865423  0.770159
 0.0350583  0.210437   0.322676      0.7267     0.904937  0.244403
 0.0928938  0.358988   0.740606      0.318996   0.433109  0.985708
 0.31892    0.236091   0.490164   …  0.377191   0.7841    0.764927
 0.370984   0.878873   0.907469      0.726703   0.795002  0.571635
 0.914992   0.418913   0.398795      0.0238975  0.134215  0.463585
 0.882399   0.885625   0.686047      0.12916    0.402672  0.434983
 0.307297   0.0693293  0.658

In [62]:
@test params_correct == params_

[32m[1mTest Passed[22m[39m

In [63]:
# function split_two_equal(m, n)
#     total = m + n
#     if total % 2 == 0
#         return total÷2, total÷2
#     else
#         return (total÷2), (total÷2)+1
#     end
# end

# Custom function to calculate the number of threads and blocks for the kernel to execute on GPU
function get_threads_blocks(m, n)
    nthreads = 32     # There is a limit to the number of threads per block
    threads = (min(nthreads, m), min(nthreads, n))
    blocks = (cld(m, threads[1]), cld(n, threads[2]))
    return threads, blocks
end

function update_parameters_2(parameters::Dict{String, CuArray{Float32}}, grads::Dict{String, CuArray{Float32}}, layer_dims::Array{Int}, learning_rate::Number)::Dict{String, CuArray{Float32}}
    num_layers = length(layer_dims)
    for l = 1:num_layers-1
        m,n = size(parameters[string("W", l)])
        threads, blocks = get_threads_blocks(m, n)
        @show (m, n)
        @show threads
        @show blocks
        @cuda blocks=blocks threads=threads update_params_kernel!(parameters[string("W", l)], learning_rate, grads[string("dw", l)], m, n)
        
        m,n = size(parameters[string("b", l)])
        threads, blocks = get_threads_blocks(m, n)
        @show threads
        @show blocks
        @cuda blocks=blocks threads=threads update_params_kernel!(parameters[string("b", l)], learning_rate, grads[string("db", l)], m, n)
    end
    synchronize()
    return parameters
end

update_parameters_2 (generic function with 1 method)

In [64]:
layer_dims = [1000, 5, 1]
parameters = initialize_parameters(layer_dims)
X = CuArrays.rand(1000, 100)
activations = (relu, sigmoid)
y, caches = forward_prop(X, parameters, activations)
y_orig = CuArrays.rand(1, 100)
cost = cost_binary(y_orig, y)
grads = backward_prop(y_orig, y, parameters, caches, layer_dims, (relu_back, sigmoid_back))

parameters2 = copy(parameters)
parameters3 = copy(parameters)

Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.0633256 -0.336415 … 0.0218597 -0.802502]
  "W1" => Float32[-0.0240596 0.0294514 … -0.00647336 -0.0627137; -0.0275193 0.0…
  "b2" => Float32[0.0]
  "b1" => Float32[0.0; 0.0; … ; 0.0; 0.0]

In [70]:
# This is the original update_params function from NeuralNetwork.jl (but running on GPU)
update_parameters_old(parameters2, grads, layer_dims, 0.01)

Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.598389 22.8195 … 0.0218597 45.0686]
  "W1" => Float32[0.421263 0.36523 … 0.185167 0.32061; -10.5397 -9.0468 … -8.93…
  "b2" => Float32[82.1086]
  "b1" => Float32[0.621215; -21.3741; … ; 0.0; -67.1714]

In [66]:
update_parameters_2(parameters3, grads, layer_dims, 0.01)

(m, n) = (5, 1000)
threads = (5, 32)
blocks = (1, 32)
threads = (5, 1)
blocks = (1, 1)
(m, n) = (1, 5)
threads = (1, 5)
blocks = (1, 1)
threads = (1, 1)
blocks = (1, 1)


Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.0633352 -0.336 … 0.0218597 -0.80168]
  "W1" => Float32[-0.0240516 0.0294574 … -0.00646992 -0.0627068; -0.0277078 0.0…
  "b2" => Float32[0.00147303]
  "b1" => Float32[1.11371e-5; -0.000383232; … ; 0.0; -0.00120414]

In [67]:
@test parameters2 == parameters3

[32m[1mTest Passed[22m[39m

In [68]:
@btime update_parameters_old($parameters3, $grads, $layer_dims, 0.01)

  449.200 μs (629 allocations: 104.55 KiB)


Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.238747 7.25572 … 0.0218597 14.2297]
  "W1" => Float32[0.121933 0.139616 … 0.0563134 0.0629441; -3.47375 -2.95606 … …
  "b2" => Float32[26.9247]
  "b1" => Float32[0.203556; -7.00623; … ; 0.0; -22.0108]

In [69]:
@btime update_parameters($parameters2, $grads, $layer_dims, 0.01)

  83.299 μs (333 allocations: 12.83 KiB)


Dict{String,CuArray{Float32,N,P} where P where N} with 4 entries:
  "W2" => Float32[0.59838 22.8191 … 0.0218597 45.0677]
  "W1" => Float32[0.421255 0.365224 … 0.185163 0.320603; -10.5395 -9.04664 … -8…
  "b2" => Float32[82.1071]
  "b1" => Float32[0.621204; -21.3737; … ; 0.0; -67.1702]

In [71]:
# ^ That's a lot of speedup