# Broadcasting vs loops

In [1]:
import Pkg; Pkg.add("LoopVectorization")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Work/jlProjects/QEDjl-project/forks/QEDprocesses.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Work/jlProjects/QEDjl-project/forks/QEDprocesses.jl/Manifest.toml`


In [2]:
import Pkg; Pkg.add("StructArrays")

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/Work/jlProjects/QEDjl-project/forks/QEDprocesses.jl/Project.toml`
[32m[1m  No Changes[22m[39m to `~/Work/jlProjects/QEDjl-project/forks/QEDprocesses.jl/Manifest.toml`


In [3]:
using BenchmarkTools
using QEDbase
using Random
using LoopVectorization
using StructArrays

In [4]:
@inline function compute_generic(a::AbstractVector{T},b::AbstractVector{T}) where {T}
    sum(a)*sum(b)
end

compute_generic (generic function with 1 method)

In [5]:
RNG = MersenneTwister(1234)
NA = 20

A = [SFourMomentum(rand(RNG),0,0,rand(RNG)) for _ in 1:NA]
#A = [rand(RNG)  for _ in 1:NA]
B = [SFourMomentum(0,rand(RNG),rand(RNG),0) for _ in 1:NA]
#B = [rand(RNG) for _ in 1:NA]

20-element Vector{SFourMomentum}:
 [0.0, 0.046288741031345504, 0.6983555060532487, 0.0]
 [0.0, 0.3651093677271471, 0.3024777928234499, 0.0]
 [0.0, 0.3725754415996787, 0.15050782744925795, 0.0]
 [0.0, 0.14732938279328955, 0.2834013103457036, 0.0]
 [0.0, 0.40495283364883794, 0.49953074411487797, 0.0]
 [0.0, 0.6588147837334961, 0.5156272179795256, 0.0]
 [0.0, 0.26071522632820776, 0.5955204840509289, 0.0]
 [0.0, 0.2924615242315285, 0.2885798506061561, 0.0]
 [0.0, 0.6181597973815087, 0.6642598175011505, 0.0]
 [0.0, 0.7535081177709988, 0.03688418241886171, 0.0]
 [0.0, 0.6437042811826996, 0.40142056533714965, 0.0]
 [0.0, 0.5250572942486489, 0.6120098074984683, 0.0]
 [0.0, 0.43257652982765626, 0.0822070287962946, 0.0]
 [0.0, 0.19905799020907944, 0.5760819730593403, 0.0]
 [0.0, 0.21817706596841413, 0.3620355262053865, 0.0]
 [0.0, 0.20472832290217324, 0.93298350850828, 0.0]
 [0.0, 0.8272627957034728, 0.09929915955881308, 0.0]
 [0.0, 0.6342997886044144, 0.1327153585755645, 0.0]
 [0.0, 0.775194150

In [6]:
@benchmark compute_generic($A,$B)

BenchmarkTools.Trial: 10000 samples with 996 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m25.309 ns[22m[39m … [35m68.022 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m25.435 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m25.566 ns[22m[39m ± [32m 0.975 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▅[39m█[39m█[34m▆[39m[39m▄[32m▄[39m[39m▃[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[34m█[39m[39m█

## Loops

In [7]:
function compute_loop(a::AbstractVector{T},b::AbstractVector{T}) where {T}
    compute_generic(a,b)
end

function compute_loop(a::AbstractMatrix{T},b::AbstractVector{T}) where {T}
    res = Vector{Float64}(undef,size(a,2))
    for i in 1:size(a,2)
        res[i] = compute_loop(view(a,:,i),b)
    end
    return res
end

function compute_loop(a::AbstractVector{T},b::AbstractMatrix{T}) where {T}
    res = Vector{Float64}(undef,size(b,2))
    for i in 1:size(b,2)
        res[i] = compute_loop(a,view(b,:,i))
    end
    return res
end



function compute_loop(a::AbstractMatrix{T},b::AbstractMatrix{T}) where {T}
    res = Matrix{Float64}(undef,size(a,2),size(b,2))    
    for i in 1:size(a,2)
        for j in 1:size(b,2)
            res[i,j] = compute_loop(view(a,:,i),view(b,:,j))
        end
    end
    return res
end

compute_loop (generic function with 4 methods)

In [8]:
compute_loop(Amat,Bmat)

LoadError: UndefVarError: `Amat` not defined

In [9]:
@benchmark compute_loop($A,$B)

BenchmarkTools.Trial: 10000 samples with 996 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m25.937 ns[22m[39m … [35m 1.142 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m26.062 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m26.290 ns[22m[39m ± [32m11.177 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▆[39m█[39m█[34m▅[39m[39m▃[39m▄[39m▃[32m▁[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▂[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[34m█[39m[39m█

In [45]:
RNG = MersenneTwister(1234)
NA = 20
NPA = 1000
NPB = 8000

Amat = reshape([SFourMomentum(rand(RNG),0,0,rand(RNG)) for i in 1:NA for j in 1:NPA],NA,NPA)
#Amat = reshape([rand(RNG) for i in 1:NA for j in 1:NPA],NA,NPA)
Bmat = reshape([SFourMomentum(0,rand(RNG),rand(RNG),0) for i in 1:NA for j in 1:NPB],NA,NPB)
#Bmat = reshape([rand(RNG) for i in 1:NA for j in 1:NPB],NA,NPB)

20×8000 Matrix{SFourMomentum}:
 [0.0, 0.370647, 0.605047, 0.0]    …  [0.0, 0.233952, 0.061902, 0.0]
 [0.0, 0.0738437, 0.145856, 0.0]      [0.0, 0.287189, 0.661897, 0.0]
 [0.0, 0.742915, 0.189013, 0.0]       [0.0, 0.983818, 0.102123, 0.0]
 [0.0, 0.974105, 0.733856, 0.0]       [0.0, 0.843953, 0.954599, 0.0]
 [0.0, 0.0802842, 0.520608, 0.0]      [0.0, 0.357858, 0.337542, 0.0]
 [0.0, 0.697297, 0.263278, 0.0]    …  [0.0, 0.233228, 0.873637, 0.0]
 [0.0, 0.183473, 0.70236, 0.0]        [0.0, 0.888544, 0.404108, 0.0]
 [0.0, 0.363925, 0.500057, 0.0]       [0.0, 0.499502, 0.101281, 0.0]
 [0.0, 0.794438, 0.86062, 0.0]        [0.0, 0.745506, 0.198773, 0.0]
 [0.0, 0.90015, 0.607183, 0.0]        [0.0, 0.0135882, 0.0497879, 0.0]
 [0.0, 0.910705, 0.279019, 0.0]    …  [0.0, 0.333372, 0.249483, 0.0]
 [0.0, 0.836186, 0.00902656, 0.0]     [0.0, 0.0915456, 0.0857297, 0.0]
 [0.0, 0.765127, 0.748125, 0.0]       [0.0, 0.0554674, 0.729182, 0.0]
 [0.0, 0.757855, 0.225406, 0.0]       [0.0, 0.479271, 0.347711, 0.0

In [25]:
@benchmark compute_loop($Amat,$B)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m26.375 μs[22m[39m … [35m 47.791 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m26.666 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m26.911 μs[22m[39m ± [32m855.480 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m [39m█[39m [39m [34m [39m[39m [39m [39m [32m [39m[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▃[39m▆[39m█[39m▇[39m▄

In [12]:
@benchmark compute_loop($A,$Bmat)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m21.083 μs[22m[39m … [35m 43.792 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m21.292 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m21.585 μs[22m[39m ± [32m833.156 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▄[39m▆[39m█[34m▆[39m[39m▆[39m▂[39m▄[39m▃[32m [39m[39m [39m▁[39m▄[39m▅[39m▆[39m▃[39m▃[39m▃[39m▄[39m▂[39m▂[39m▂[39m▁[39m▁[39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[34m█[39m

In [13]:
@benchmark compute_loop($Amat,$Bmat)

BenchmarkTools.Trial: 218 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m22.467 ms[22m[39m … [35m 24.949 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 7.24%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m22.741 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m22.956 ms[22m[39m ± [32m576.240 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.86% ± 2.17%

  [39m [39m [39m [39m [39m▃[39m▇[39m█[34m▇[39m[39m▄[39m▃[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▄[39m▁[39m▄[39m█[

## Broadcast

In [81]:
using Base.Threads

In [85]:
@inline function compute_broadcast(a::AbstractVector{T},b::AbstractVector{T}) where {T<:QEDbase.AbstractFourMomentum}
    compute_generic(a,b)
end

@inline function compute_broadcast(a::AbstractMatrix{T},b::AbstractVector{T}) where {T<:QEDbase.AbstractFourMomentum}
    return compute_broadcast.(eachcol(a),Ref(b))
end

function compute_broadcast(a::AbstractVecOrMat{T},b::AbstractMatrix{T}) where {T<:QEDbase.AbstractFourMomentum}
    return compute_broadcast.(Ref(a),eachcol(b))
end

compute_broadcast (generic function with 3 methods)

In [86]:
#compute_broadcast(Amat,Bmat)
#compute_broadcast(A,Bmat)
compute_broadcast(Amat,B)
#compute_broadcast(A,B)

[33m[1m│ [22m[39m`LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.
[33m[1m└ [22m[39m[90m@ LoopVectorization ~/.julia/packages/LoopVectorization/7gWfp/src/condense_loopset.jl:1148[39m


1000-element Vector{Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [54]:
@benchmark compute_broadcast($Amat,$Bmat)

BenchmarkTools.Trial: 23 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m217.010 ms[22m[39m … [35m225.324 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 3.26%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m221.310 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m1.27%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m221.216 ms[22m[39m ± [32m  2.450 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m1.31% ± 1.07%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m [39m [34m [39m[32m [39m[39m▃[39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m▁[39m▇[39m▁

In [55]:
@benchmark compute_loop($Amat,$Bmat)

BenchmarkTools.Trial: 22 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m225.945 ms[22m[39m … [35m232.898 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 1.87%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m228.562 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.08%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m228.684 ms[22m[39m ± [32m  1.778 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.74% ± 0.83%

  [39m▁[39m [39m [39m [39m [39m▁[39m▁[39m▁[39m▁[39m [39m▁[39m [39m▁[39m [39m█[39m [39m [39m [39m [39m█[34m [39m[39m [39m [39m [32m [39m[39m [39m▁[39m▁[39m▁[39m [39m [39m [39m▁[39m [39m [39m▁[39m [39m▁[39m▁[39m▁[39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m 
  [39m█[39m▁[39m▁[39m▁

# AVX

In [18]:
function compute_avx(a::AbstractVector{T},b::AbstractVector{T}) where {T}
    compute_generic(a,b)
end

function compute_avx(a::AbstractMatrix{T},b::AbstractVector{T}) where {T}
    res = Vector{Float64}(undef,size(a,2))
    @turbo for i in 1:size(a,2)
        res[i] = compute_avx(view(a,:,i),b)
    end
    return res
end

function compute_avx(a::AbstractVector{T},b::AbstractMatrix{T}) where {T}
    res = Vector{Float64}(undef,size(b,2))
    @turbo for i in 1:size(b,2)
        res[i] = compute_avx(a,view(b,:,i))
    end
    return res
end



function compute_avx(a::AbstractMatrix{T},b::AbstractMatrix{T}) where {T}
    res = Matrix{Float64}(undef,size(a,2),size(b,2))    
    @turbo for i in 1:size(a,2)
        for j in 1:size(b,2)
            res[i,j] = compute_avx(view(a,:,i),view(b,:,j))
        end
    end
    return res
end

compute_avx (generic function with 4 methods)

In [19]:
@benchmark compute_avx($Amat,$B)

[33m[1m│ [22m[39m`LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.
[33m[1m└ [22m[39m[90m@ Main ~/.julia/packages/LoopVectorization/7gWfp/src/condense_loopset.jl:1148[39m


BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m26.083 μs[22m[39m … [35m50.083 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m26.375 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m26.820 μs[22m[39m ± [32m 1.208 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▂[39m▆[39m█[39m▆[34m▄[39m[39m▃[39m▂[39m [39m [39m▅[32m▆[39m[39m▄[39m▅[39m▅[39m▄[39m▃[39m▃[39m▁[39m▂[39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m█[39m█[39m█[34m█[39m[39

In [20]:
@benchmark compute_loop($Amat,$B)

BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m26.333 μs[22m[39m … [35m47.875 μs[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m26.625 μs              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m26.991 μs[22m[39m ± [32m 1.051 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m [39m▂[39m█[39m [34m [39m[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m█[39m█[39m▆[34m▃[39m[39

## AVX using StructArrays

In [21]:
As = StructVector(A)
@show typeof(As)
Bs = StructVector(B)
@show typeof(Bs)
Amats = StructArray(Amat)
@show typeof(Amats)
Bmats = StructArray(Bmat)
@show typeof(Bmats)

typeof(As) = StructVector{SFourMomentum, NamedTuple{(:E, :px, :py, :pz), NTuple{4, Vector{Float64}}}, Int64}
typeof(Bs) = StructVector{SFourMomentum, NamedTuple{(:E, :px, :py, :pz), NTuple{4, Vector{Float64}}}, Int64}
typeof(Amats) = StructArray{SFourMomentum, 2, NamedTuple{(:E, :px, :py, :pz), NTuple{4, Matrix{Float64}}}, Int64}
typeof(Bmats) = StructArray{SFourMomentum, 2, NamedTuple{(:E, :px, :py, :pz), NTuple{4, Matrix{Float64}}}, Int64}


StructArray{SFourMomentum, 2, NamedTuple{(:E, :px, :py, :pz), NTuple{4, Matrix{Float64}}}, Int64}

In [22]:
@benchmark compute_avx($Amats,$Bmats)

[33m[1m│ [22m[39m`LoopVectorization.check_args` on your inputs failed; running fallback `@inbounds @fastmath` loop instead.
[33m[1m└ [22m[39m[90m@ Main ~/.julia/packages/LoopVectorization/7gWfp/src/condense_loopset.jl:1148[39m


BenchmarkTools.Trial: 11 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m456.681 ms[22m[39m … [35m460.994 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m9.81% … 10.07%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m457.628 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m9.79%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m458.296 ms[22m[39m ± [32m  1.361 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m9.89% ±  0.16%

  [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m█[34m▁[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m▁[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m 
  [39m█[39m▁[39m▁[39

In [23]:
@benchmark compute_loop($Amat,$Bmat)

BenchmarkTools.Trial: 218 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m22.580 ms[22m[39m … [35m 24.975 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 8.33%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m22.800 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m23.033 ms[22m[39m ± [32m569.104 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.87% ± 2.21%

  [39m [39m [39m▇[39m▆[39m█[39m▇[34m▄[39m[39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▂[39m▅[39m█[39m█[39m█[

In [60]:
 Broadcast.DefaultArrayStyle{0}()


Base.Broadcast.DefaultArrayStyle{0}()

In [112]:
function testsum(N)
    s = 0
    for i in 1:N
        s+=i
    end 
    return s
end

function testsum_threads(N)
    s = 0
    @threads for i in 1:N
        s+=1
    end 
    return s
end

testsum_threads (generic function with 1 method)

In [113]:
@benchmark testsum(100000)

BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.500 ns[22m[39m … [35m16.500 ns[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.583 ns              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.572 ns[22m[39m ± [32m 0.178 ns[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.00% ± 0.00%

  [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m█[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[

In [114]:
@benchmark testsum_threads(100000)

BenchmarkTools.Trial: 4174 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.102 ms[22m[39m … [35m  4.792 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 76.11%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.111 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.197 ms[22m[39m ± [32m398.782 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m5.00% ± 10.33%

  [34m█[39m[39m▆[32m▄[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [34m█[39m[39m█[32m█[39m[39m█[39