# Exercise: SIMD Data Dependency

Consider the following loop involving four vectors `a`,`b`,`c`, and `d`:

In [3]:
const LOOP_ITERATIONS = 8192
const N = LOOP_ITERATIONS + 2

"naive loop"
function loop_naive!(a, b, c, d)
    @inbounds for i in 1:LOOP_ITERATIONS
        a[i] = a[i] + b[i]
        b[i+2] = c[i] + d[i]
    end
end

a = rand(Float32, N)
b = rand(Float32, N)
c = rand(Float32, N)
d = rand(Float32, N)

loop_naive!(a,b,c,d)

This loop is hard to auto-vectorize because it has a **data-dependency**: we're reading and writing elements of the vector `b`.

**Task 1**: Check the native code produced for `loop_naive!(a,b,c,d)` and convince yourself that the Julia compiler hasn't vectorized this code. (There shouldn't be any usage of `ymm` or `zmm` registers etc.)

In [4]:
# TODO
@code_native loop_naive!(a,b,c,d)

	[0m.text
	[0m.file	[0m"loop_naive!"
	[0m.globl	[0m"japi1_loop_naive!_762"         [90m# -- Begin function japi1_loop_naive!_762[39m
	[0m.p2align	[33m4[39m[0m, [33m0x90[39m
	[0m.type	[0m"japi1_loop_naive!_762"[0m,[0m@function
[91m"japi1_loop_naive!_762":[39m                [90m# @"japi1_loop_naive!_762"[39m
[90m; ┌ @ /home/javier/JuliaUCL24/exercises/Day2/2_SIMD_datadep/simd_datadep.ipynb:5 within `loop_naive!`[39m
[90m# %bb.0:                                # %top[39m
	[96m[1mpush[22m[39m	[0mrbp
	[96m[1mmov[22m[39m	[0mrbp[0m, [0mrsp
	[96m[1mpush[22m[39m	[0mr15
	[96m[1mpush[22m[39m	[0mr14
	[96m[1mpush[22m[39m	[0mr13
	[96m[1mpush[22m[39m	[0mr12
	[96m[1mpush[22m[39m	[0mrbx
	[96m[1mmov[22m[39m	[95mqword[39m [95mptr[39m [33m[[39m[0mrbp [0m- [33m56[39m[33m][39m[0m, [0mrsi
	[96m[1mmov[22m[39m	[0mrax[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrsi[33m][39m
	[96m[1mmov[22m[39m	[0mrcx[0m, [95mqwo

[0m*[0mrdi [0m+ [33m24[39m[33m][39m [90m# xmm1 = mem[0],zero[39m
[90m; │└[39m
[90m; │┌ @ float.jl:409 within `+`[39m
	[96m[1mvaddps[22m[39m	[0mxmm0[0m, [0mxmm0[0m, [0mxmm1
[90m; │└[39m
[90m; │┌ @ array.jl:1021 within `setindex!`[39m
	[96m[1mvmovlps[22m[39m	[95mqword[39m [95mptr[39m [33m[[39m[0mrcx [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m32[39m[33m][39m[0m, [0mxmm0
	[96m[1madd[22m[39m	[0mrdi[0m, [33m8[39m
	[96m[1mcmp[22m[39m	[0mrdi[0m, [33m8192[39m
	[96m[1mjne[22m[39m	[0m.LBB0_8
	[96m[1mjmp[22m[39m	[0m.LBB0_9
[91m.LBB0_5:[39m                                [90m# %L2.preheader[39m
	[96m[1mxor[22m[39m	[0medi[0m, [0medi
	[0m.p2align	[33m4[39m[0m, [33m0x90[39m
[91m.LBB0_6:[39m                                [90m# %L2[39m
                                        [90m# =>This Inner Loop Header: Depth=1[39m
[90m; │└[39m
[90m; │ @ /home/javier/JuliaUCL24/exercises/Day2/2_SIMD_datadep/simd_datadep.ipynb


**Task 2**: Implement the same loop in `loop_naive_simd!` and try to force SIMD-vectorization with the corresponding performance macro. (You shall keep the `@inbounds` as well.)

In [8]:
"naive loop + try force SIMD"
function loop_naive_simd!(a, b, c, d)
    @inbounds @simd for i in 1:LOOP_ITERATIONS
        a[i] = a[i] + b[i]
        b[i+2] = c[i] + d[i]
    end
end

loop_naive_simd!

**Task 3**: Check the native code of `loop_naive_simd!`. Has the code improved? The learning here is that just putting `@simd` in front of a loop and hoping for the best isn't a particularly good strategy 😉

In [9]:
@code_native loop_naive_simd!(a,b,c,d)

	[0m.text
	[0m.file	[0m"loop_naive_simd!"
	[0m.globl	[0m"japi1_loop_naive_simd!_992"    [90m# -- Begin function japi1_loop_naive_simd!_992[39m
	[0m.p2align	[33m4[39m[0m, [33m0x90[39m
	[0m.type	[0m"japi1_loop_naive_simd!_992"[0m,[0m@function
[91m"japi1_loop_naive_simd!_992":[39m           [90m# @"japi1_loop_naive_simd!_992"[39m
[90m; ┌ @ /home/javier/JuliaUCL24/exercises/Day2/2_SIMD_datadep/simd_datadep.ipynb:2 within `loop_naive_simd!`[39m
[90m# %bb.0:                                # %top[39m
	[96m[1mpush[22m[39m	[0mrbp
	[96m[1mmov[22m[39m	[0mrbp[0m, [0mrsp
	[96m[1mpush[22m[39m	[0mr15
	[96m[1mpush[22m[39m	[0mr14
	[96m[1mpush[22m[39m	[0mr13
	[96m[1mpush[22m[39m	[0mr12
	[96m[1mpush[22m[39m	[0mrbx
	[96m[1mmov[22m[39m	[95mqword[39m [95mptr[39m [33m[[39m[0mrbp [0m- [33m56[39m[33m][39m[0m, [0mrsi
	[96m[1mmov[22m[39m	[0mrax[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrsi[33m][39m
	[96m[1mmov[22m[


[90m; ││┌ @ essentials.jl:13 within `getindex`[39m
	[96m[1mvmovsd[22m[39m	[0mxmm0[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrdx [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m24[39m[33m][39m [90m# xmm0 = mem[0],zero[39m
	[96m[1mvmovsd[22m[39m	[0mxmm1[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m24[39m[33m][39m [90m# xmm1 = mem[0],zero[39m
[90m; ││└[39m
[90m; ││┌ @ float.jl:409 within `+`[39m
	[96m[1mvaddps[22m[39m	[0mxmm0[0m, [0mxmm0[0m, [0mxmm1
[90m; ││└[39m
[90m; ││┌ @ array.jl:1021 within `setindex!`[39m
	[96m[1mvmovlps[22m[39m	[95mqword[39m [95mptr[39m [33m[[39m[0mrcx [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m32[39m[33m][39m[0m, [0mxmm0
	[96m[1madd[22m[39m	[0mrdi[0m, [33m8[39m
[90m; ││└[39m
[90m; ││ @ simdloop.jl:78 within `macro expansion`[39m
[90m; ││┌ @ int.jl:87 within `+`[39m
	[96m[1mcmp[22m[39m	[0mrdi[0m, [33m8192[39m
	[96m[1mjne[22m[39m	[0m.LB

**Task 4**: Benchmark and compare the variants. What do you observe?


In [10]:
using BenchmarkTools

@btime loop_naive!($a,$b,$c,$d)
@btime loop_naive_simd!($a,$b,$c,$d)

  2.719 μs (0 allocations: 0 bytes)


  2.814 μs (0 allocations: 0 bytes)



**Task 5**: Take a closer look at the loop. Can you "resolve" the data-dependency issue by splitting up the loop into two separate loops? Implement this improved version in the functions below. Use `@simd` for the loops in the second function. (Again, keep `@inbounds` for all loops in both functions.)

In [11]:
"optimized loop"
function loop_opt!(a, b, c, d)

    @inbounds for i in 1:LOOP_ITERATIONS
        a[i] = a[i] + b[i]
    end

    @inbounds for i in 1:LOOP_ITERATIONS
        b[i+2] = c[i] + d[i]
    end

end

"optimized loop + `@simd`"

function loop_opt_simd!(a, b, c, d)
    @inbounds @simd for i in 1:LOOP_ITERATIONS
        a[i] = a[i] + b[i]
    end

    @inbounds @simd for i in 1:LOOP_ITERATIONS
        b[i+2] = c[i] + d[i]
    end
    
end

loop_opt_simd! (generic function with 1 method)

**Task 6**: Benchmark those new variants as well.
  * How do they compare to each other?
  * Did the SIMD performance macro help? (Hint: It shouldn't.)
  * How does the performance compare to the unoptimized variants above?

In [12]:
@btime loop_opt!($a,$b,$c,$d)
@btime loop_opt_simd!($a,$b,$c,$d)

  1.399 μs (0 allocations: 0 bytes)


  1.404 μs (0 allocations: 0 bytes)



**Task 7**: Check the native code of e.g. `loop_opt_simd!`. Did it vectorize properly? (Look e.g. for `ymm` and `zmm` registers as well as a block of `vaddps` instructions. Note though, that this is system-dependent.)

In [14]:
@code_native loop_opt_simd!(a,b,c,d)

	[0m.text
	[0m.file	[0m"loop_opt_simd!"
	[0m.globl	[0m"japi1_loop_opt_simd!_1232"     [90m# -- Begin function japi1_loop_opt_simd!_1232[39m
	[0m.p2align	[33m4[39m[0m, [33m0x90[39m
	[0m.type	[0m"japi1_loop_opt_simd!_1232"[0m,[0m@function
[91m"japi1_loop_opt_simd!_1232":[39m            [90m# @"japi1_loop_opt_simd!_1232"[39m
[90m; ┌ @ /home/javier/JuliaUCL24/exercises/Day2/2_SIMD_datadep/simd_datadep.ipynb:16 within `loop_opt_simd!`[39m
[90m# %bb.0:                                # %top[39m
	[96m[1mpush[22m[39m	[0mrbp
	[96m[1mmov[22m[39m	[0mrbp[0m, [0mrsp
	[96m[1mmov[22m[39m	[95mqword[39m [95mptr[39m [33m[[39m[0mrbp [0m- [33m8[39m[33m][39m[0m, [0mrsi
	[96m[1mmov[22m[39m	[0mrax[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrsi[33m][39m
	[96m[1mmov[22m[39m	[0mrdi[0m, [95mqword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m8[39m[33m][39m
	[96m[1mmov[22m[39m	[0mrcx[0m, [95mqword[39m [95mptr[39m [33m[[39

[0mymm3[0m, [0mymm3[0m, [95mymmword[39m [95mptr[39m [33m[[39m[0mrax [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m96[39m[33m][39m
[90m; ││└[39m
[90m; ││┌ @ array.jl:1021 within `setindex!`[39m
	[96m[1mvmovups[22m[39m	[95mymmword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi[33m][39m[0m, [0mymm0
	[96m[1mvmovups[22m[39m	[95mymmword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m32[39m[33m][39m[0m, [0mymm1
	[96m[1mvmovups[22m[39m	[95mymmword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m64[39m[33m][39m[0m, [0mymm2
	[96m[1mvmovups[22m[39m	[95mymmword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi [0m+ [33m96[39m[33m][39m[0m, [0mymm3
[90m; ││└[39m
[90m; ││┌ @ essentials.jl:13 within `getindex`[39m
	[96m[1mvmovups[22m[39m	[0mymm0[0m, [95mymmword[39m [95mptr[39m [33m[[39m[0mrsi [0m+ [33m4[39m[0m*[0mrdi [0m+ [3


[90m; ││└[39m
[90m; ││┌ @ float.jl:409 within `+`[39m
	[96m[1mvaddss[22m[39m	[0mxmm0[0m, [0mxmm0[0m, [95mdword[39m [95mptr[39m [33m[[39m[0mrdx [0m+ [33m4[39m[0m*[0mrsi[33m][39m
[90m; ││└[39m
[90m; ││┌ @ array.jl:1021 within `setindex!`[39m
	[96m[1mvmovss[22m[39m	[95mdword[39m [95mptr[39m [33m[[39m[0mrax [0m+ [33m4[39m[0m*[0mrsi [0m+ [33m8[39m[33m][39m[0m, [0mxmm0
[90m; ││└[39m
[90m; ││┌ @ essentials.jl:13 within `getindex`[39m
	[96m[1mvmovss[22m[39m	[0mxmm0[0m, [95mdword[39m [95mptr[39m [33m[[39m[0mrcx [0m+ [33m4[39m[0m*[0mrsi [0m+ [33m4[39m[33m][39m [90m# xmm0 = mem[0],zero,zero,zero[39m
[90m; ││└[39m
[90m; ││┌ @ float.jl:409 within `+`[39m
	[96m[1mvaddss[22m[39m	[0mxmm0[0m, [0mxmm0[0m, [95mdword[39m [95mptr[39m [33m[[39m[0mrdx [0m+ [33m4[39m[0m*[0mrsi [0m+ [33m4[39m[33m][39m
[90m; ││└[39m
[90m; ││┌ @ array.jl:1021 within `setindex!`[39m
	[96m[1mvmovss[22m[39m	[95md