In [7]:
using Pkg 
Pkg.activate(".")
using BenchmarkTools
using CUDA

[32m[1m  Activating[22m[39m project at `/mnt/d/Realizibility-index/bnc_julia`


In [8]:
# Callable lookup table: `xs` holds the sample positions and `ys` the value stored for
# each position. Both fields share the same container type `A`.
struct Interpolate{A}
    xs::A
    ys::A
end

# Calling `itp(x)` finds the first position in `xs` whose entry is not less than `x`,
# clamps that position to the index range of `ys`, and returns the value stored there.
# Queries outside the sampled range therefore yield the first or last entry of `ys`.
function (itp::Interpolate)(x)
    lo, hi = firstindex(itp.ys), lastindex(itp.ys)
    pos = clamp(searchsortedfirst(itp.xs, x), lo, hi)
    # `pos` was just clamped to the index range of `ys`, so the bounds check is skipped.
    return @inbounds itp.ys[pos]
end

# Host-side sample data: positions `xs_cpu` and the value stored for each position.
xs_cpu = [1.0, 2.0, 3.0]
ys_cpu = [10.0,20.0,30.0]
itp_cpu = Interpolate(xs_cpu, ys_cpu)

# Broadcast the callable over the query points: one lookup per element of `pts_cpu`.
pts_cpu = [1.1,2.3]
result_cpu = itp_cpu.(pts_cpu)

2-element Vector{Float64}:
 20.0
 30.0

In [7]:
# Problem size: 2^20 = 1_048_576 elements per vector. `N` is read by this cell and by the
# later launch-configuration cells, so it has to be defined before its first use.
N = 2^20
x_d = CUDA.fill(1.0f0, N)  # a vector stored on the GPU filled with 1.0 (Float32)
y_d = CUDA.fill(2.0f0, N)  # a vector stored on the GPU filled with 2.0
# NOTE: each benchmark sample re-runs the in-place update, so `y_d` no longer holds 2.0
# once this cell finishes. The call is not wrapped in `CUDA.@sync`, so the reported time
# presumably covers little more than queuing the operation -- compare with the
# synchronized `add_broadcast!` benchmark.
@btime $y_d .+= $x_d  # increment each element of y_d with the corresponding element of x_d

  5.137 μs (86 allocations: 2.12 KiB)


1048576-element CuArray{Float32, 1, CUDA.DeviceMemory}:
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
      ⋮
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0
 404868.0

In [12]:
using Test  # provides `@test`; the setup cell only loads Pkg, BenchmarkTools and CUDA

# Start from a known state: the `@btime` cell applied the in-place add many times, so the
# check below only holds when `y_d` is reset to 2 first (as the later test cells do).
fill!(y_d, 2)
y_d .+= x_d
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [13]:
# Add `x` to `y` element-wise, in place, inside `CUDA.@sync` so the call only returns once
# the update has completed. Always returns `nothing`.
function add_broadcast!(y, x)
    CUDA.@sync begin
        y .= y .+ x
    end
    return nothing
end

add_broadcast! (generic function with 1 method)

In [14]:
# Time the synchronized broadcast add; `$` interpolates the global arrays into the
# benchmark expression so global-variable access is not part of the measurement.
@btime add_broadcast!($y_d, $x_d)

  24.731 μs (86 allocations: 2.12 KiB)


In [15]:
# In-place element-wise add written as one sequential loop over positions 1..length(y).
# Bounds checks are disabled, so `x` must provide at least `length(y)` elements.
# Always returns `nothing`.
function gpu_add1!(y, x)
    @inbounds for k in 1:length(y)
        y[k] = y[k] + x[k]
    end
    return nothing
end
# Reset `y_d` to 2 so the check below starts from a known state.
fill!(y_d, 2)
# No `threads`/`blocks` keywords are passed to this launch.
@cuda gpu_add1!(y_d, x_d)
# Copy the result back to the host and verify every element.
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [16]:
# Launch `gpu_add1!` on `y` and `x` without an explicit launch configuration, wrapped in
# `CUDA.@sync` so the call waits for the launch to complete. The value produced by the
# `@cuda` launch is passed through as the return value.
function bench_gpu1!(y, x)
    return CUDA.@sync @cuda gpu_add1!(y, x)
end

bench_gpu1! (generic function with 1 method)

In [17]:
# Time the sequential single-loop kernel, including the wait for completion.
@btime bench_gpu1!($y_d, $x_d)

  59.946 ms (13 allocations: 256 bytes)


CUDA.HostKernel for gpu_add1!(CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1})

In [19]:
bench_gpu1!(y_d, x_d)  # run it once to force compilation
# Profile a single call; the report is split into host-side and device-side activity.
CUDA.@profile bench_gpu1!(y_d, x_d)

Profiler ran for 71.19 ms, capturing 793 events.

Host-side activity: calling CUDA APIs took 66.44 ms (93.34% of the trace)
┌──────────┬────────────┬───────┬─────────────────────┐
│[1m Time (%) [0m│[1m Total time [0m│[1m Calls [0m│[1m Name                [0m│
├──────────┼────────────┼───────┼─────────────────────┤
│   93.34% │[31m   66.44 ms [0m│     1 │[1m cuStreamSynchronize [0m│
│    0.04% │   31.95 µs │     1 │ cuLaunchKernel      │
└──────────┴────────────┴───────┴─────────────────────┘

Device-side activity: GPU was busy for 70.76 ms (99.40% of the trace)
┌──────────┬────────────┬───────┬───────────────────────────────────────────────────────────────────────┐
│[1m Time (%) [0m│[1m Total time [0m│[1m Calls [0m│[1m Name                                                                  [0m│
├──────────┼────────────┼───────┼───────────────────────────────────────────────────────────────────────┤
│   99.40% │[31m   70.76 ms [0m│     1 │[1m gpu_add1_(CuDeviceArray

In [20]:
# Same profile with `trace=true`, which reports each call individually (ID, start, time).
CUDA.@profile trace=true bench_gpu1!(y_d, x_d)

Profiler ran for 72.61 ms, capturing 793 events.

Host-side activity: calling CUDA APIs took 67.28 ms (92.67% of the trace)
┌─────┬──────────┬──────────┬────────┬─────────────────────┐
│[1m  ID [0m│[1m    Start [0m│[1m     Time [0m│[1m Thread [0m│[1m Name                [0m│
├─────┼──────────┼──────────┼────────┼─────────────────────┤
│  13 │ 27.79 µs │ 28.92 µs │      1 │ cuLaunchKernel      │
│ 789 │  5.11 ms │[31m 67.28 ms [0m│      2 │[1m cuStreamSynchronize [0m│
└─────┴──────────┴──────────┴────────┴─────────────────────┘

Device-side activity: GPU was busy for 72.28 ms (99.56% of the trace)
┌────┬──────────┬──────────┬─────────┬────────┬──────┬───────────────────────────────────────────────────────────────────────┐
│[1m ID [0m│[1m    Start [0m│[1m     Time [0m│[1m Threads [0m│[1m Blocks [0m│[1m Regs [0m│[1m Name                                                                  [0m│
├────┼──────────┼──────────┼─────────┼────────┼──────┼──────────────────

In [21]:
# In-place add where the threads of a block share the loop: each thread starts at its own
# thread index and advances by the block size until it runs past the end of `y`.
# Always returns `nothing`.
function gpu_add2!(y, x)
    first_i = threadIdx().x    # only the `x` dimension is needed for linear indexing
    step_i = blockDim().x
    @inbounds for i in first_i:step_i:length(y)
        y[i] = y[i] + x[i]
    end
    return nothing
end

# Reset `y_d` to 2, launch with 256 threads, then verify the result on the host.
fill!(y_d, 2)
@cuda threads=256 gpu_add2!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [22]:
# Launch `gpu_add2!` with 256 threads inside `CUDA.@sync`, so the call waits for the
# launch to complete. The value produced by the `@cuda` launch is returned.
function bench_gpu2!(y, x)
    return CUDA.@sync @cuda threads=256 gpu_add2!(y, x)
end

bench_gpu2! (generic function with 1 method)

In [23]:
# Time the 256-thread launch of `gpu_add2!`, including the wait for completion.
@btime bench_gpu2!($y_d, $x_d)

  615.262 μs (13 allocations: 256 bytes)


CUDA.HostKernel for gpu_add2!(CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1})

In [24]:
# In-place add spread over every thread of every block: the starting position combines
# the block index, block size and thread index, and each thread then advances by the
# total number of launched threads (`gridDim().x * blockDim().x`).
# Always returns `nothing`.
function gpu_add3!(y, x)
    start = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    total = blockDim().x * gridDim().x
    @inbounds for i in start:total:length(y)
        y[i] = y[i] + x[i]
    end
    return nothing
end

# One block per 256 elements, rounded up.
numblocks = ceil(Int, N/256)

# Reset `y_d`, launch across `numblocks` blocks of 256 threads, and verify on the host.
fill!(y_d, 2)
@cuda threads=256 blocks=numblocks gpu_add3!(y_d, x_d)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [25]:
# Launch `gpu_add3!` with 256 threads per block and enough blocks to cover `length(y)`
# (rounded up), inside `CUDA.@sync` so the call waits for the launch to complete.
# The value produced by the `@cuda` launch is returned.
function bench_gpu3!(y, x)
    nblocks = cld(length(y), 256)
    return CUDA.@sync @cuda threads=256 blocks=nblocks gpu_add3!(y, x)
end

bench_gpu3! (generic function with 1 method)

In [26]:
# Time the multi-block launch of `gpu_add3!`, including the wait for completion.
@btime bench_gpu3!($y_d, $x_d)

  26.504 μs (12 allocations: 240 bytes)


CUDA.HostKernel for gpu_add3!(CuDeviceVector{Float32, 1}, CuDeviceVector{Float32, 1})

In [27]:
# Profile one multi-block launch with `trace=true` to list each call individually.
CUDA.@profile trace=true bench_gpu3!(y_d, x_d)

Profiler ran for 145.74 µs, capturing 333 events.

Host-side activity: calling CUDA APIs took 52.15 µs (35.79% of the trace)
┌─────┬───────────┬──────────┬─────────────────────┐
│[1m  ID [0m│[1m     Start [0m│[1m     Time [0m│[1m Name                [0m│
├─────┼───────────┼──────────┼─────────────────────┤
│  13 │  38.47 µs │[31m 34.49 µs [0m│[1m cuLaunchKernel      [0m│
│ 329 │ 141.51 µs │ 775.0 ns │ cuStreamSynchronize │
└─────┴───────────┴──────────┴─────────────────────┘

Device-side activity: GPU was busy for 43.58 µs (29.91% of the trace)
┌────┬───────────┬──────────┬─────────┬────────┬──────┬───────────────────────────────────────────────────────────────────────┐
│[1m ID [0m│[1m     Start [0m│[1m     Time [0m│[1m Threads [0m│[1m Blocks [0m│[1m Regs [0m│[1m Name                                                                  [0m│
├────┼───────────┼──────────┼─────────┼────────┼──────┼──────────────────────────────────────────────────────────────────────

In [28]:
# `launch=false` yields the kernel object without running it, so that its launch
# configuration can be queried first.
kernel = @cuda launch=false gpu_add3!(y_d, x_d)
config = launch_configuration(kernel.fun)
# Never request more threads than there are elements; round the block count up so that
# `threads * blocks` covers all `N` elements.
threads = min(N, config.threads)
blocks = cld(N, threads)

1366

In [29]:
# Reset `y_d`, then call the kernel object directly with the `threads` and `blocks`
# globals passed as keyword arguments, and verify the result on the host.
fill!(y_d, 2)
kernel(y_d, x_d; threads, blocks)
@test all(Array(y_d) .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [30]:
# Build the `gpu_add3!` kernel object without launching it, query its launch
# configuration, and derive the launch shape from it: threads are capped at `length(y)`
# and the block count is rounded up to cover every element. The kernel call is wrapped in
# `CUDA.@sync`, and its value is returned.
function bench_gpu4!(y, x)
    n = length(y)
    compiled = @cuda launch=false gpu_add3!(y, x)
    cfg = launch_configuration(compiled.fun)
    nthreads = min(n, cfg.threads)
    nblocks = cld(n, nthreads)
    return CUDA.@sync compiled(y, x; threads=nthreads, blocks=nblocks)
end

bench_gpu4! (generic function with 1 method)

In [31]:
# Time the launch that derives its configuration from `launch_configuration`.
@btime bench_gpu4!($y_d, $x_d)

  29.244 μs (22 allocations: 400 bytes)


In [32]:
# Variant of the block-strided in-place add that first prints, from every thread, its
# thread index and the block size. Always returns `nothing`.
function gpu_add2_print!(y, x)
    stride = blockDim().x
    index = threadIdx().x    # only the `x` dimension is needed for linear indexing
    # Device-side print; the number shown after "block" is the block size `blockDim().x`.
    @cuprintln("thread $index, block $stride")
    @inbounds for i in index:stride:length(y)
        y[i] = y[i] + x[i]
    end
    return nothing
end

# Launch the printing kernel with 16 threads, so one line is printed per thread.
@cuda threads=16 gpu_add2_print!(y_d, x_d)
# Wait for the launched work before the cell finishes.
synchronize()

thread 1, block 16
thread 2, block 16
thread 3, block 16
thread 4, block 16
thread 5, block 16
thread 6, block 16
thread 7, block 16
thread 8, block 16
thread 9, block 16
thread 10, block 16
thread 11, block 16
thread 12, block 16
thread 13, block 16
thread 14, block 16
thread 15, block 16
thread 16, block 16
