In [1]:
using CUDA, BenchmarkTools

In [2]:
CUDA.memory_status() 

Effective GPU memory usage: 4.69% (379.938 MiB/7.915 GiB)
CUDA allocator usage: 0 bytes
binned usage: 0 bytes (0 bytes allocated, 0 bytes cached)


In [3]:
# Host buffer holding 10^9 bytes' worth of Float32 values (4 bytes each).
# Integer division keeps the element count an exact Int without a Float64 round-trip.
cpu = rand(Float32, 10^9 ÷ 4)
# Stored as `cpu_bytes` rather than `size`: a global named `size` would shadow
# `Base.size` for every later cell in this session.
cpu_bytes = Base.summarysize(cpu)

1000000040

In [4]:
gpu = CuArray{Float32}(undef, floor(Int, (10^9)/4));

In [5]:
CUDA.memory_status() 

Effective GPU memory usage: 16.46% (1.303 GiB/7.915 GiB)
CUDA allocator usage: 953.674 MiB
binned usage: 16 bytes (16 bytes allocated, 0 bytes cached)
Discrepancy of 953.674 MiB between memory pool and allocator!


In [6]:
# Time the host -> device copy. `$` splices the global arrays into the benchmark
# so the measurement does not include untyped-global lookup/dispatch.
@btime copyto!($gpu, $cpu);

  105.030 ms (0 allocations: 0 bytes)


In [7]:
# Time the device -> host copy. `$` splices the global arrays into the benchmark
# so the measurement does not include untyped-global lookup/dispatch.
@btime copyto!($cpu, $gpu);

  112.806 ms (0 allocations: 0 bytes)


In [None]:
floor(Int, (10^9)/4)

In [None]:
?trunc

In [None]:
VERSION, Base.Threads.nthreads(), Base.Sys.CPU_THREADS

In [None]:
using CUDA, BenchmarkTools, Humanize
import Humanize: digitsep

In [None]:
@assert CUDA.functional(true)

In [None]:
CUDA.memory_status() 

In [None]:
N = 10^9
digitsep(N)

In [None]:
cpu_a = rand(Float32, N);

In [None]:
digitsep(Base.summarysize(cpu_a))

In [None]:
cpu_b = Array{Float32}(undef, N);

In [None]:
CUDA.memory_status() 

In [None]:
4 / 0.42

In [None]:
# Time a device -> host download into a freshly allocated Array.
# `$gpu` interpolates the global so it is not looked up as an untyped global per evaluation.
@benchmark cpu2 = Array($gpu)

In [None]:
# Time a device -> host copy into the preallocated `cpu_b`; globals are `$`-interpolated.
@benchmark copyto!($cpu_b, $gpu)

In [None]:
# Time a host -> host copy as a baseline; globals are `$`-interpolated.
@benchmark copyto!($cpu_b, $cpu_a)

In [None]:
# Same host -> host copy, reported as a single minimum time; globals are `$`-interpolated.
@btime copyto!($cpu_b, $cpu_a);

In [None]:
4 / 0.3

In [None]:
?copyto!

In [None]:
?Array

In [None]:
include("sorting.jl")

In [None]:
using DataStructures, Random, Test

In [None]:
"""
    test_quicksort(T, f, N)

Build `N` host values by applying `f` to each of `1:N` and converting the result
to `T`, upload them to a `CuArray`, sort that array in place with `quicksort!`,
and `@test` that the data copied back to the host equals `sort` of the host values.

`quicksort!` is not defined in this cell; presumably it comes from the
`include("sorting.jl")` cell (verify).
"""
function test_quicksort(T, f, N)
    host = [T(f(i)) for i in 1:N]
    device = CuArray(host)
    quicksort!(device)
    @test Array(device) == sort(host)
end

In [None]:
test_quicksort(Float32, x -> rand(Float32), 10000)

In [None]:
N = 2^20
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0

# Increment each element of y by the matching element of x, so y holds 3.0f0 everywhere.
y .+= x


In [None]:
using Test
# Passes only when every element of y is exactly 3.0f0, i.e. after x (all 1.0f0)
# has been added exactly once to a y that was filled with 2.0f0.
@test all(y .== 3.0f0)

In [None]:
"""
    sequential_add_A!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, using
a plain `for` loop over `eachindex(y, x)`. The element update is marked
`@inbounds`. Returns `nothing`.
"""
function sequential_add_A!(y, x)
    for idx in eachindex(y, x)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return nothing
end

In [None]:
"""
    sequential_add_B!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, using
a plain `for` loop over `eachindex(y, x)`. Unlike `sequential_add_A!`, the
element update is not marked `@inbounds`. Returns `nothing`.
"""
function sequential_add_B!(y, x)
    for idx in eachindex(y, x)
        y[idx] = y[idx] + x[idx]
    end
    return nothing
end

In [None]:
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0
# `$` interpolates the global arrays so the benchmark measures the loop itself rather
# than untyped-global access. Each evaluation adds x to y again, so y keeps growing.
@benchmark sequential_add_A!($y, $x)

In [None]:
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0
# `$` interpolates the global arrays so the benchmark measures the loop itself rather
# than untyped-global access. Each evaluation adds x to y again, so y keeps growing.
@benchmark sequential_add_B!($y, $x)

In [None]:
"""
    parallel_add_A!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, with
the loop over `eachindex(y, x)` run under `Threads.@threads`. Each iteration
reads and writes only its own index. The element update is marked `@inbounds`.
Returns `nothing`.
"""
function parallel_add_A!(y, x)
    Threads.@threads for idx in eachindex(y, x)
        @inbounds y[idx] = y[idx] + x[idx]
    end
    return nothing
end

In [None]:
"""
    parallel_add_B!(y, x)

Add each element of `x` to the element of `y` at the same index, in place, with
the loop over `eachindex(y, x)` run under `Threads.@threads`. Each iteration
reads and writes only its own index. Unlike `parallel_add_A!`, the element
update is not marked `@inbounds`. Returns `nothing`.
"""
function parallel_add_B!(y, x)
    Threads.@threads for idx in eachindex(y, x)
        y[idx] = y[idx] + x[idx]
    end
    return nothing
end

In [None]:
# `$` interpolates the global arrays so the benchmark does not pay for untyped-global access.
@benchmark parallel_add_A!($y, $x)

In [None]:
# `$` interpolates the global arrays so the benchmark does not pay for untyped-global access.
@benchmark parallel_add_B!($y, $x)