In [28]:
using CUDA, BenchmarkTools

In [29]:
VERSION, Base.Threads.nthreads(), Base.Sys.CPU_THREADS

(v"1.5.3", 8, 8)

In [None]:
include("sorting.jl")

In [None]:
using DataStructures, Random, Test

In [None]:

"""
    test_quicksort(T, f, N)

Check `quicksort!` on a device array against `sort` on the host.

Builds `N` values by applying `f` to each of `1:N` and converting the result to `T`,
copies them into a `CuArray`, sorts that copy in place with `quicksort!`, and uses
`@test` to compare the copied-back result with `sort` of the host data.
"""
function test_quicksort(T, f, N)
    host = [T(f(k)) for k in 1:N]
    device = CuArray(host)
    quicksort!(device)
    expected = sort(host)
    @test Array(device) == expected
end


In [None]:
# Ten uniformly distributed Float32 samples; `rand(T, n)` fills the vector directly
# instead of mapping a closure that ignores its argument over `1:10`.
a = rand(Float32, 10)

In [None]:
quicksort!(a)

In [None]:
test_quicksort(Float32, x -> rand(Float32), 10000)

In [38]:
N = 2^20
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0


1048576-element Array{Float32,1}:
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 ⋮
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0
 2.0

In [39]:
y .+= x  

1048576-element Array{Float32,1}:
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 ⋮
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0
 3.0

In [3]:
using Test
@test all(y .== 3.0f0)

[32m[1mTest Passed[22m[39m

In [40]:
"""
    sequential_add_A!(y, x)

Add every element of `x` to the matching element of `y`, in place, using a plain
single-threaded loop.

The loop indices come from `eachindex(y, x)`, and the element accesses are marked
`@inbounds`. Returns `nothing`.
"""
function sequential_add_A!(y, x)
    shared = eachindex(y, x)
    for k in shared
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

sequential_add_A! (generic function with 1 method)

In [41]:
"""
    sequential_add_B!(y, x)

Add every element of `x` to the matching element of `y`, in place, using a plain
single-threaded loop.

Same loop as `sequential_add_A!`, but the element accesses are not marked
`@inbounds`. Returns `nothing`.
"""
function sequential_add_B!(y, x)
    shared = eachindex(y, x)
    for k in shared
        y[k] = y[k] + x[k]
    end
    return nothing
end

sequential_add_B! (generic function with 1 method)

In [48]:
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0
# `x` and `y` are non-constant globals: interpolate them with `$` so the benchmark
# times the kernel itself rather than the handling of untyped global variables.
# Note that `y` keeps accumulating across samples because the kernel works in place.
@benchmark sequential_add_A!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     264.052 μs (0.00% GC)
  median time:      276.700 μs (0.00% GC)
  mean time:        297.786 μs (0.00% GC)
  maximum time:     985.406 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [49]:
x = fill(1.0f0, N)  # a vector filled with 1.0 (Float32)
y = fill(2.0f0, N)  # a vector filled with 2.0
# `x` and `y` are non-constant globals: interpolate them with `$` so the benchmark
# times the kernel itself rather than the handling of untyped global variables.
# Note that `y` keeps accumulating across samples because the kernel works in place.
@benchmark sequential_add_B!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     607.036 μs (0.00% GC)
  median time:      618.194 μs (0.00% GC)
  mean time:        653.162 μs (0.00% GC)
  maximum time:     1.440 ms (0.00% GC)
  --------------
  samples:          7643
  evals/sample:     1

In [50]:
"""
    parallel_add_A!(y, x)

Add every element of `x` to the matching element of `y`, in place, with the loop
over `eachindex(y, x)` split across threads by `Threads.@threads`.

The element accesses are marked `@inbounds`. Returns `nothing`.
"""
function parallel_add_A!(y, x)
    Threads.@threads for k in eachindex(y, x)
        @inbounds y[k] = y[k] + x[k]
    end
    return nothing
end

parallel_add_A! (generic function with 1 method)

In [51]:
"""
    parallel_add_B!(y, x)

Add every element of `x` to the matching element of `y`, in place, with the loop
over `eachindex(y, x)` split across threads by `Threads.@threads`.

Same loop as `parallel_add_A!`, but the element accesses are not marked `@inbounds`.
Returns `nothing`.
"""
function parallel_add_B!(y, x)
    Threads.@threads for k in eachindex(y, x)
        y[k] = y[k] + x[k]
    end
    return nothing
end

parallel_add_B! (generic function with 1 method)

In [52]:
# Interpolate the global arrays with `$` so the benchmark times the threaded kernel
# itself rather than the handling of untyped global variables.
@benchmark parallel_add_A!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  5.52 KiB
  allocs estimate:  41
  --------------
  minimum time:     171.017 μs (0.00% GC)
  median time:      190.863 μs (0.00% GC)
  mean time:        269.680 μs (0.11% GC)
  maximum time:     8.251 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [53]:
# Interpolate the global arrays with `$` so the benchmark times the threaded kernel
# itself rather than the handling of untyped global variables.
@benchmark parallel_add_B!($y, $x)

BenchmarkTools.Trial: 
  memory estimate:  5.52 KiB
  allocs estimate:  41
  --------------
  minimum time:     190.281 μs (0.00% GC)
  median time:      205.716 μs (0.00% GC)
  mean time:        306.835 μs (0.09% GC)
  maximum time:     8.385 ms (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1