In [1]:
using Revise
using SnpArrays
using LinearAlgebra
using Random
using LoopVectorization
using MendelIHT
using BenchmarkTools

┌ Info: Precompiling SnpArrays [4e780e97-f5bf-4111-9dc4-b70aaf691b06]
└ @ Base loading.jl:1317
┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1317


# $C = A^tB$ correctness

If we want to center/scale the SnpArray, we have
$$
C_{ij} = \sum_{k} \left(\frac{A_{ki} - \mu_i}{\sigma_i}\right)B_{kj} = \sum_{k} \frac{A_{ki}B_{kj} - \mu_iB_{kj}}{\sigma_i}
$$
+ $C = n \times p$
+ $A = m \times n$
+ $B = m \times p$
+ SnpArray model = Additive, dominant, recessive

In [2]:
model = ADDITIVE_MODEL
center = true
scale = true
impute = true
m = 4098
n = 1025
p = 1025
x = simulate_random_snparray(undef, m, n)
if impute
    for j in 1:n, i in 1:m
        rand() < 0.01 && (x[i, j] = 0x01) # create ~1% missings
    end
end

A = SnpLinAlg{Float64}(x, model=model, impute=impute, center=center, scale=scale)
B = rand(m, p)
C = zeros(n, p)
LinearAlgebra.mul!(C, Transpose(A), B)

Ctrue = convert(Matrix{Float64}, x, impute=impute, model=model, center=center, scale=scale)' * B
@show all(isapprox(C, Ctrue, atol=1e-7))

all(isapprox(C, Ctrue, atol = 1.0e-7)) = true


true

## Speed: SnpLinAlg-(matrix) vs multiple SnpLinAlg-vector

In [3]:
# Baseline for C = A'B: instead of one SnpLinAlg-matrix product, run one
# SnpLinAlg-vector product per column of B, i.e. C[:, j] = A' * B[:, j].
# Each column of `out` is overwritten in place through a view; nothing is returned.
function adhoc_mul!(
    out::AbstractMatrix,
    st::Union{Transpose{T, SnpLinAlg{T}}, Adjoint{T, SnpLinAlg{T}}},
    v::AbstractMatrix) where T
    for j in 1:size(v, 2)
        SnpArrays.mul!(view(out, :, j), st, view(v, :, j))
    end
    return nothing
end

adhoc_mul! (generic function with 1 method)

### r = 2

In [4]:
n = 5340   # number of samples
p = 24523  # number of SNPs
r = 2      # number of traits
x = simulate_random_snparray(undef, n, p)

# test correctness
A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=true, scale=true)
B = rand(n, r)
C = zeros(p, r)
Ctest = zeros(p, r)
LinearAlgebra.mul!(C, Transpose(A), B)
adhoc_mul!(Ctest, Transpose(A), B)
all(isapprox(C, Ctest, atol=1e-7))

true

In [63]:
@benchmark LinearAlgebra.mul!($C, $(A'), $B) # SnpLinAlg-matrix

BenchmarkTools.Trial: 
  memory estimate:  272 bytes
  allocs estimate:  1
  --------------
  minimum time:     65.141 ms (0.00% GC)
  median time:      67.397 ms (0.00% GC)
  mean time:        67.514 ms (0.00% GC)
  maximum time:     81.520 ms (0.00% GC)
  --------------
  samples:          75
  evals/sample:     1

In [64]:
@benchmark adhoc_mul!($Ctest, $(A'), $B) # multiple SnpLinAlg-vector

BenchmarkTools.Trial: 
  memory estimate:  352 bytes
  allocs estimate:  2
  --------------
  minimum time:     152.332 ms (0.00% GC)
  median time:      154.193 ms (0.00% GC)
  mean time:        155.205 ms (0.00% GC)
  maximum time:     179.908 ms (0.00% GC)
  --------------
  samples:          33
  evals/sample:     1

In [65]:
Afloat = convert(Matrix{Float64}, A)
BLAS.set_num_threads(8)
@benchmark LinearAlgebra.mul!($C, $(Afloat'), $B) # BLAS with 8 threads

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     58.193 ms (0.00% GC)
  median time:      59.450 ms (0.00% GC)
  mean time:        59.974 ms (0.00% GC)
  maximum time:     73.805 ms (0.00% GC)
  --------------
  samples:          84
  evals/sample:     1

In [66]:
BLAS.set_num_threads(1)
@benchmark LinearAlgebra.mul!($C, $(Afloat'), $B) # BLAS with 1 thread

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     172.730 ms (0.00% GC)
  median time:      182.772 ms (0.00% GC)
  mean time:        184.433 ms (0.00% GC)
  maximum time:     201.468 ms (0.00% GC)
  --------------
  samples:          28
  evals/sample:     1

### r = 5

In [68]:
n = 5340   # number of samples
p = 24523  # number of SNPs
r = 5      # number of traits
x = simulate_random_snparray(undef, n, p)

# test correctness
A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=true, scale=true)
B = rand(n, r)
C = zeros(p, r)
Ctest = zeros(p, r)
LinearAlgebra.mul!(C, Transpose(A), B)
adhoc_mul!(Ctest, Transpose(A), B)
all(isapprox(C, Ctest, atol=1e-7))

true

In [69]:
@benchmark LinearAlgebra.mul!($C, $(A'), $B) # SnpLinAlg-matrix

BenchmarkTools.Trial: 
  memory estimate:  272 bytes
  allocs estimate:  1
  --------------
  minimum time:     150.262 ms (0.00% GC)
  median time:      153.544 ms (0.00% GC)
  mean time:        154.391 ms (0.00% GC)
  maximum time:     163.649 ms (0.00% GC)
  --------------
  samples:          33
  evals/sample:     1

In [70]:
@benchmark adhoc_mul!($Ctest, $(A'), $B) # multiple SnpLinAlg-vector

BenchmarkTools.Trial: 
  memory estimate:  880 bytes
  allocs estimate:  5
  --------------
  minimum time:     388.541 ms (0.00% GC)
  median time:      398.383 ms (0.00% GC)
  mean time:        397.483 ms (0.00% GC)
  maximum time:     411.374 ms (0.00% GC)
  --------------
  samples:          13
  evals/sample:     1

In [71]:
Afloat = convert(Matrix{Float64}, A)
BLAS.set_num_threads(8)
@benchmark LinearAlgebra.mul!($C, $(Afloat'), $B) # BLAS with 8 threads

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     60.062 ms (0.00% GC)
  median time:      61.299 ms (0.00% GC)
  mean time:        61.655 ms (0.00% GC)
  maximum time:     68.966 ms (0.00% GC)
  --------------
  samples:          82
  evals/sample:     1

In [72]:
BLAS.set_num_threads(1)
@benchmark LinearAlgebra.mul!($C, $(Afloat'), $B) # BLAS with 1 thread

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     193.392 ms (0.00% GC)
  median time:      201.759 ms (0.00% GC)
  mean time:        203.793 ms (0.00% GC)
  maximum time:     223.175 ms (0.00% GC)
  --------------
  samples:          25
  evals/sample:     1

# $C = AB$ correctness

If we want to center/scale the SnpArray, we have
$$
C_{ij} = \sum_{k} \left(\frac{A_{ik} - \mu_k}{\sigma_k}\right)B_{kj} = \sum_{k} \frac{A_{ik}B_{kj} - \mu_kB_{kj}}{\sigma_k}
$$
+ $C = m \times p$
+ $A = m \times n$
+ $B = n \times p$
+ SnpArray model = Additive, dominant, recessive

In [40]:
model = ADDITIVE_MODEL
center = true
scale = true
impute = true
m = 4098
n = 1025
p = 1025
x = simulate_random_snparray(undef, m, n)
if impute
    for j in 1:n, i in 1:m
        rand() < 0.01 && (x[i, j] = 0x01) # create ~1% missings
    end
end

A = SnpLinAlg{Float64}(x, model=model, impute=impute, center=center, scale=scale)
B = ones(n, p)
C = zeros(m, p)
LinearAlgebra.mul!(C, A, B)

Ctrue = convert(Matrix{Float64}, x, impute=impute, model=model, center=center, scale=scale) * B
@show all(C .≈ Ctrue)

all(C .≈ Ctrue) = true


true

## Speed: SnpLinAlg-(matrix) vs multiple SnpLinAlg-vector

In [3]:
# Baseline for C = AB: instead of one SnpLinAlg-matrix product, run one
# SnpLinAlg-vector product per column of B, i.e. C[:, j] = A * B[:, j].
# Each column of `out` is overwritten in place through a view; nothing is returned.
function adhoc_mul!(
    out::AbstractMatrix,
    st::AbstractSnpLinAlg,
    v::AbstractMatrix)
    for j in 1:size(v, 2)
        SnpArrays.mul!(view(out, :, j), st, view(v, :, j))
    end
    return nothing
end

adhoc_mul! (generic function with 1 method)

### r = 2

In [41]:
n = 5340   # number of samples
p = 24523  # number of SNPs
r = 2      # number of traits
x = simulate_random_snparray(undef, n, p)

# test correctness
A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=true, center=true, scale=true)
B = ones(p, r)
C = zeros(n, r)
Ctest = zeros(n, r)
LinearAlgebra.mul!(C, A, B)
adhoc_mul!(Ctest, A, B)
all(Ctest .≈ C)

true

In [107]:
@benchmark LinearAlgebra.mul!($C, $A, $B) # SnpLinAlg-matrix

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  1
  --------------
  minimum time:     79.696 ms (0.00% GC)
  median time:      82.416 ms (0.00% GC)
  mean time:        83.003 ms (0.00% GC)
  maximum time:     96.007 ms (0.00% GC)
  --------------
  samples:          61
  evals/sample:     1

In [109]:
@benchmark adhoc_mul!($Ctest, $A, $B) # multiple SnpLinAlg-vector

BenchmarkTools.Trial: 
  memory estimate:  192 bytes
  allocs estimate:  2
  --------------
  minimum time:     114.517 ms (0.00% GC)
  median time:      122.829 ms (0.00% GC)
  mean time:        122.406 ms (0.00% GC)
  maximum time:     126.932 ms (0.00% GC)
  --------------
  samples:          41
  evals/sample:     1

In [31]:
Afloat = convert(Matrix{Float64}, A)
BLAS.set_num_threads(8)
@benchmark LinearAlgebra.mul!($C, $Afloat, $B) # BLAS with 8 threads

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     45.535 ms (0.00% GC)
  median time:      46.517 ms (0.00% GC)
  mean time:        46.781 ms (0.00% GC)
  maximum time:     53.563 ms (0.00% GC)
  --------------
  samples:          107
  evals/sample:     1

In [32]:
BLAS.set_num_threads(1)
@benchmark LinearAlgebra.mul!($C, $Afloat, $B) # BLAS with 1 thread

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     132.407 ms (0.00% GC)
  median time:      136.632 ms (0.00% GC)
  mean time:        138.222 ms (0.00% GC)
  maximum time:     153.705 ms (0.00% GC)
  --------------
  samples:          37
  evals/sample:     1

### r = 5

In [34]:
n = 5340   # number of samples
p = 24523  # number of SNPs
r = 5      # number of traits
x = simulate_random_snparray(undef, n, p)

# test correctness: the SnpLinAlg-matrix product must agree with the
# column-by-column SnpLinAlg-vector products computed by adhoc_mul!
A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=true, center=true, scale=true)
B = ones(p, r)
C = zeros(n, r)
Ctest = zeros(n, r)
LinearAlgebra.mul!(C, A, B)
adhoc_mul!(Ctest, A, B)
all(Ctest .≈ C)

true

In [35]:
@benchmark LinearAlgebra.mul!($C, $A, $B) # SnpLinAlg-matrix

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  1
  --------------
  minimum time:     132.495 ms (0.00% GC)
  median time:      135.236 ms (0.00% GC)
  mean time:        135.548 ms (0.00% GC)
  maximum time:     147.035 ms (0.00% GC)
  --------------
  samples:          37
  evals/sample:     1

In [36]:
@benchmark adhoc_mul!($Ctest, $A, $B) # multiple SnpLinAlg-vector

BenchmarkTools.Trial: 
  memory estimate:  480 bytes
  allocs estimate:  5
  --------------
  minimum time:     230.470 ms (0.00% GC)
  median time:      236.705 ms (0.00% GC)
  mean time:        295.087 ms (0.00% GC)
  maximum time:     699.488 ms (0.00% GC)
  --------------
  samples:          19
  evals/sample:     1

In [37]:
Afloat = convert(Matrix{Float64}, A)
BLAS.set_num_threads(8)
@benchmark LinearAlgebra.mul!($C, $Afloat, $B) # BLAS with 8 threads

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     45.511 ms (0.00% GC)
  median time:      49.890 ms (0.00% GC)
  mean time:        51.769 ms (0.00% GC)
  maximum time:     68.548 ms (0.00% GC)
  --------------
  samples:          97
  evals/sample:     1

In [38]:
BLAS.set_num_threads(1)
@benchmark LinearAlgebra.mul!($C, $Afloat, $B) # BLAS with 1 thread

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     148.367 ms (0.00% GC)
  median time:      154.699 ms (0.00% GC)
  mean time:        155.224 ms (0.00% GC)
  maximum time:     166.649 ms (0.00% GC)
  --------------
  samples:          33
  evals/sample:     1

## AX on-the-fly centering vs centering at the end

In [10]:
n = 5340   # number of samples
p = 24523  # number of SNPs
r = 2      # number of traits
x = simulate_random_snparray(undef, n, p)

# test correctness
A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=true, center=true, scale=true)
B = rand(p, r)
C = zeros(n, r)
LinearAlgebra.mul!(C, A, B)
Ctrue = convert(Matrix{Float64}, x, impute=true, center=true, scale=true) * B
all(Ctrue .≈ C)

true

In [11]:
# on the fly
@benchmark LinearAlgebra.mul!($C, $A, $B) seconds=30

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  1
  --------------
  minimum time:     108.810 ms (0.00% GC)
  median time:      112.315 ms (0.00% GC)
  mean time:        112.493 ms (0.00% GC)
  maximum time:     127.028 ms (0.00% GC)
  --------------
  samples:          267
  evals/sample:     1

In [9]:
# centering at the end
@benchmark LinearAlgebra.mul!($C, $A, $B) seconds=30

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  1
  --------------
  minimum time:     121.378 ms (0.00% GC)
  median time:      125.050 ms (0.00% GC)
  mean time:        125.445 ms (0.00% GC)
  maximum time:     138.678 ms (0.00% GC)
  --------------
  samples:          240
  evals/sample:     1

## $Ax$ bug

In [84]:
# before fixing issue 90, any n between 4097 and 4099 doesn't work!
n = 4099
p = 1000
x = simulate_random_snparray(undef, n, p, min_ma=0)

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=false, scale=false)
b = ones(p)
c = A * b
ctrue = convert(Matrix{Float64}, A) * b
@test all(c .≈ ctrue)

rows_filled = 4099
M = 1024, Miter = 1, Mrem = 3
N = 1000, Niter = 0, Nrem = 1000


[32m[1mTest Passed[22m[39m

In [17]:
using Revise
using SnpArrays
using LinearAlgebra
using Random
using LoopVectorization
using MendelIHT
using Test
using BenchmarkTools

# after fixing issue 90, if n is 1024, 2048 or 3072, still won't work!
n = 4097
p = 2048
x = simulate_random_snparray(undef, n, p) #defined in my package `MendelIHT.jl`

x[1, 2048] = 0x01
x[2048, 1] = 0x01

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=true, center=false, scale=false)
b = ones(p)
c = A * b
ctrue = convert(Matrix{Float64}, x, impute=true, center=false, scale=false) * b
@test all(c .≈ ctrue)

[32m[1mTest Passed[22m[39m

## $A^tx$ bug

In [61]:
# before fixing issue 90, any n between 8193 and 8195 doesn't work!
n = 8193
p = 1000
x = simulate_random_snparray(undef, n, p, min_ma=0)

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=false, scale=false)
b = ones(n)
c = A' * b
ctrue = convert(Matrix{Float64}, A)' * b
@test all(c .≈ ctrue)

rows_filled = 8193
M = 2048, Miter = 1, Mrem = 1
N = 1000, Niter = 0, Nrem = 1000


[32m[1mTest Passed[22m[39m

In [89]:
using Revise
using SnpArrays
using LinearAlgebra
using Random
using LoopVectorization
using MendelIHT
using Test
using BenchmarkTools

# after fixing issue 90, if n is 2048, 4096 or 6144, still won't work!
n = 8192+6144
p = 2048
x = simulate_random_snparray(undef, n, p) #defined in my package `MendelIHT.jl`

# x[1, 8193] = 0x01
# x[8193, 1] = 0x01

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=true, center=false, scale=false)
b = ones(n)
c = A' * b
ctrue = convert(Matrix{Float64}, x, impute=true, center=false, scale=false)' * b
@test all(c .≈ ctrue)

rows_filled = 14336
M = 3584, Miter = 1, Mrem = 6144
N = 2048, Niter = 1, Nrem = 0


[32m[1mTest Passed[22m[39m

In [4]:
model = ADDITIVE_MODEL
center = true
scale = true
impute = true
n = 8193
p = 1000
x = simulate_random_snparray(undef, n, p, min_ma=0)
if impute
    for j in 1:p, i in 1:n
        rand() < 0.01 && (x[i, j] = 0x01) # create ~1% missings
    end
end

A = SnpLinAlg{Float64}(x, model=model, impute=impute, center=center, scale=scale)
b = rand(n)
c = zeros(p)
LinearAlgebra.mul!(c, Transpose(A), b)

ctrue = convert(Matrix{Float64}, x, impute=impute, model=model, center=center, scale=scale)' * b
@show all(isapprox(c, ctrue, atol=1e-7))

all(isapprox(c, ctrue, atol = 1.0e-7)) = true


true

## Unroll for AtX

In [3]:
using Revise
using LinearAlgebra
using Random
using LoopVectorization
using BenchmarkTools

# Plain-Julia reference kernel (no @avx) used to validate the vectorised variants.
#
# Every byte of `s` is read as four 2-bit codes, lowest bits first. A code
# contributes a dosage of 0 (codes 0 and 1), 1 (code 2) or 2 (code 3), and
#     out[snp, col] = sum of dosage * V[row, col]
# where the code in slot `slot` of packed row `blk` pairs with V row 4*(blk - 1) + slot.
#
# `out` is zeroed first and then filled in place; the function returns `nothing`.
# Only the first `size(s, 1) >> 2` packed rows of `s` (hence the first
# `4 * (size(s, 1) >> 2)` rows of `V`) are visited.
function my_gemm_noturbo!(out, s::Matrix{UInt8}, V)
    fill!(out, zero(eltype(out)))
    nblocks = size(s, 1) >> 2

    for col in 1:size(V, 2), snp in 1:size(s, 2), blk in 1:nblocks
        packed = s[blk, snp]
        firstrow = 4 * (blk - 1)
        for slot in 1:4
            code = (packed >> (2 * (slot - 1))) & 3
            dosage = (code >= 2) + (code == 3)
            out[snp, col] += dosage * V[firstrow + slot, col]
        end
    end
    # TODO handle remainder (the `size(s, 1) & 3` rows left over after the blocks above)
    return nothing
end

# unrolled loop crashes julia
# (the run recorded in this notebook completed and matched my_gemm_noturbo!, so the
# crash presumably depends on the LoopVectorization version -- TODO confirm)
#
# @avx version of my_gemm_noturbo!: same loop nest, with the inner `for p in 1:4`
# left for LoopVectorization to unroll. Each byte of `s` is read as four 2-bit
# codes (lowest bits first); code 2 adds 1 * V[row, c] and code 3 adds 2 * V[row, c]
# to out[i, c], where row = 4 * (l - 1) + p.
#
# `out` is zeroed before accumulating, exactly as my_gemm_noturbo! does, so the result
# no longer depends on the previous contents of `out` and re-running a comparison
# cell does not silently accumulate into an already-filled buffer.
function my_gemm_unroll!(out, s::Matrix{UInt8}, V)
    fill!(out, zero(eltype(out)))
    srows = size(s, 1)
    scols = size(s, 2)
    Vcols = size(V, 2)
    k = srows >> 2 # number of packed rows of `s` visited; rows 1:4k of V are read

    @avx for c in 1:Vcols
        for i in 1:scols
            for l in 1:k
                block = s[l, i]
                # unrolled loop
                for p in 1:4
                    Aij = (block >> (2 * (p - 1))) & 3
                    out[i, c] += ((Aij >= 2) + (Aij == 3)) * V[4 * (l - 1) + p, c]
                end
            end
        end
    end
    # TODO handle remainder
end

# manually unroll gives wrong answers
# NOTE(review): before this function zeroed `out` itself, calling it more than once on
# the same buffer accumulated into the previous result, which alone makes a comparison
# against my_gemm_noturbo! fail. Whether the recorded mismatch also reflects a
# LoopVectorization issue with the hand-unrolled body is not established here -- re-check.
#
# @avx version of my_gemm_noturbo! with the inner `for p in 1:4` written out by hand.
# Each byte of `s` is read as four 2-bit codes (lowest bits first); code 2 adds
# 1 * V[row, c] and code 3 adds 2 * V[row, c] to out[i, c], where row = 4 * (l - 1) + p.
function my_gemm_manuel_unroll!(out, s::Matrix{UInt8}, V)
    # Zero the accumulator first, exactly as my_gemm_noturbo! does.
    fill!(out, zero(eltype(out)))
    srows = size(s, 1)
    scols = size(s, 2)
    Vcols = size(V, 2)
    k = srows >> 2 # number of packed rows of `s` visited; rows 1:4k of V are read

    @avx for c in 1:Vcols
        for i in 1:scols
            for l in 1:k
                block = s[l, i]
                # unrolled loop
                p = 1
                Aij = (block >> (2 * (p - 1))) & 3
                out[i, c] += ((Aij >= 2) + (Aij == 3)) * V[4 * (l - 1) + p, c]
                p = 2
                Aij = (block >> (2 * (p - 1))) & 3
                out[i, c] += ((Aij >= 2) + (Aij == 3)) * V[4 * (l - 1) + p, c]
                p = 3
                Aij = (block >> (2 * (p - 1))) & 3
                out[i, c] += ((Aij >= 2) + (Aij == 3)) * V[4 * (l - 1) + p, c]
                p = 4
                Aij = (block >> (2 * (p - 1))) & 3
                out[i, c] += ((Aij >= 2) + (Aij == 3)) * V[4 * (l - 1) + p, c]
            end
        end
    end
    # TODO handle remainder
end

out_true = zeros(10, 10)
out_test1 = zeros(10, 10)
out_test2 = zeros(10, 10)
s = rand(UInt8, 100, 10)
V = rand(400, 10);

my_gemm_noturbo!(out_true, s, V)
my_gemm_unroll!(out_test1, s, V); all(out_test1 .≈ out_true)

true

In [23]:
my_gemm_manuel_unroll!(out_test2, s, V); all(out_test2 .≈ out_true)

false

In [30]:
using Revise
using LinearAlgebra
using Random
using LoopVectorization
using BenchmarkTools

# Accumulate into column `c` of `out` the contribution of ONE packed row of `s`
# (row 1) against the first `size(V, 1)` rows of `V`.
#
# For each i in 1:size(out, 1), the byte s[1, i] is read as 2-bit codes, lowest bits
# first. A code contributes 0.0 (code 0), μ[i] (code 1), 1.0 (code 2) or 2.0 (code 3);
# that weight is multiplied by V[p, c] and by σinv[i] and added to out[i, c].
#
# `out` is NOT zeroed here: the function only adds to what is already stored.
# Presumably this is the "remainder" kernel for the last, partially filled packed row,
# called with one-row views of `s` and the trailing rows of `V` (compare the
# commented-out `_snparray_AtX_additive_rem` call sites in this notebook) -- TODO confirm.
# NOTE(review): looks like `size(V, 1)` is expected to be at most 4 (one byte holds four
# 2-bit codes) -- confirm against the caller.
function my_gemm_rem!(out, s, V, μ, σinv, c)
    maxp = size(V, 1)
    l = 1 # fixed block index, so the V row index 4 * (l - 1) + p below reduces to p
    @avx for i in 1:size(out, 1)
        block = s[1, i]
        for p in 1:maxp
            Aij = (block >> (2 * (p - 1))) & 3
            out[i, c] += (((Aij >= 2) * 1.0 + (Aij == 3) * 1.0 + (Aij == 1) * μ[i]) *
                V[4 * (l - 1) + p, c]) * σinv[i]
        end
    end
end

# Plain-Julia reference kernel (no @avx) for the variant that substitutes μ for code 1.
#
# `srows` is the number of 2-bit entries per column of `s` (four per byte), `scols` the
# number of columns of `s` to process and `Vcols` the number of columns of `V`.
# Every byte of `s` is read as four 2-bit codes, lowest bits first; a code contributes
# 0 (code 0), μ[snp] (code 1), 1 (code 2) or 2 (code 3) times V[row, col] to
# out[snp, col], where the code in slot `slot` of packed row `blk` pairs with
# V row 4*(blk - 1) + slot.
#
# `out` is zeroed first and then filled in place; the function returns `nothing`.
# Only the `srows >> 2` complete packed rows are visited.
function my_gemm_noturbo!(out, s::Matrix{UInt8}, V, srows, scols, Vcols, μ)
    fill!(out, 0)
    nblocks = srows >> 2

    for col in 1:Vcols, snp in 1:scols, blk in 1:nblocks
        packed = s[blk, snp]
        firstrow = 4 * (blk - 1)
        for slot in 1:4
            code = (packed >> (2 * (slot - 1))) & 3
            weight = (code >= 2) + (code == 3) + (code == 1) * μ[snp]
            out[snp, col] += weight * V[firstrow + slot, col]
        end
    end
    # TODO handle the `srows & 3` leftover entries; the intended call was
    #     _snparray_AtX_additive_rem(out, @view(s[k + 1:k + 1, :]),
    #         @view(V[4k + 1:end, :]), c, μ, σinv, c)
    # for each c in 1:Vcols, with k = srows >> 2.
    return nothing
end

# @avx version of the 7-argument my_gemm_noturbo!: the loop nest is the same, only
# wrapped in LoopVectorization's @avx.
#
# `srows` is the number of 2-bit entries per column of `s` (four per byte), `scols` the
# number of columns of `s` to process and `Vcols` the number of columns of `V`.
# Every byte of `s` is read as four 2-bit codes, lowest bits first; a code contributes
# 0 (code 0), μ[i] (code 1), 1 (code 2) or 2 (code 3) times V[4 * (l - 1) + p, c] to
# out[i, c]. `out` is zeroed first; the function returns `nothing`.
#
# NOTE(review): the output recorded in this notebook shows this version disagreeing
# with my_gemm_noturbo! on identical inputs, so treat it as a reproduction of a
# miscompiled/mis-vectorised loop rather than as a working kernel.
function my_gemm!(out, s::Matrix{UInt8}, V, srows, scols, Vcols, μ)
    k = srows >> 2   # number of complete packed rows (4 entries per byte)
    rem = srows & 3  # leftover entries; only referenced by the disabled code below
    fill!(out, 0)

    @avx for c in 1:Vcols
        for i in 1:scols
            for l in 1:k
                block = s[l, i]
                for p in 1:4
                    Aij = (block >> (2 * (p - 1))) & 3
                    out[i, c] += (((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[i]) *
                        V[4 * (l - 1) + p, c])
                end
            end
        end
    end
# Remainder handling is disabled; `_snparray_AtX_additive_rem` and `σinv` are not
# defined in this cell.
#     if rem != 0
#         for c in 1:Vcols
#             _snparray_AtX_additive_rem(out, @view(s[k + 1:k + 1, :]), 
#                 @view(V[4k + 1:end, :]), c, μ, σinv, c)
#         end
#     end
    nothing
end

my_gemm! (generic function with 2 methods)

In [31]:
out_true = zeros(100, 100)
out_test1 = zeros(100, 100)
out_test2 = zeros(100, 100)
μ = rand(100)
s = rand(UInt8, 100, 100)
V = rand(400, 100);

my_gemm_noturbo!(out_true, s, V, 400, 100, 100, μ)
my_gemm!(out_test1, s, V, 400, 100, 100, μ); all(out_test1 .≈ out_true)


false

In [32]:
out_test1[5], out_true[5]

(155.5004938609506, 142.38538989036144)