In [1]:
using Revise
using SnpArrays
using LinearAlgebra
using Random
using LoopVectorization
using MendelIHT

┌ Info: Precompiling SnpArrays [4e780e97-f5bf-4111-9dc4-b70aaf691b06]
└ @ Base loading.jl:1317
┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1317


## No center/scale/impute

In [None]:
k = srows >> 2 # fast div(srows, 4)
rem = srows & 3 # fast rem(srows, 4)
packedstride = size(s, 1)

# out[i, k] = s[i, j] * V[j, k] for j in 1:n
@avx for k in 1:size(s, 2)
    for j in 1:size(V, 2)
        for i in 1:size(s, 1)
            l = 2 * ((i-1) & 3)
            block = s[(j-1) * packedstride + ((i-1) >> 2) + 1]
            Aij = (block >> l) & 3
            out[i, k] += $expr
        end
    end
end

In [22]:
# x = SnpArray(SnpArrays.datadir("EUR_subset.bed"))
n = 500
p = 1024
q = 1024
x = simulate_random_snparray(undef, n, p)

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=false, scale=false)
B = ones(p, q)
C = zeros(n, q)
LinearAlgebra.mul!(C, A, B)
C

M = 125, Miter = 0, Mrem = 125
N = 1024, Niter = 1, Nrem = 0
P = 1024, Piter = 1, Prem = 0


LoadError: UndefVarError: _ftn! not defined

In [21]:
Ctrue = convert(Matrix{Float64}, A) * B

500×1024 Matrix{Float64}:
 516.0  516.0  516.0  516.0  516.0  …  516.0  516.0  516.0  516.0  516.0
 517.0  517.0  517.0  517.0  517.0     517.0  517.0  517.0  517.0  517.0
 479.0  479.0  479.0  479.0  479.0     479.0  479.0  479.0  479.0  479.0
 516.0  516.0  516.0  516.0  516.0     516.0  516.0  516.0  516.0  516.0
 499.0  499.0  499.0  499.0  499.0     499.0  499.0  499.0  499.0  499.0
 474.0  474.0  474.0  474.0  474.0  …  474.0  474.0  474.0  474.0  474.0
 511.0  511.0  511.0  511.0  511.0     511.0  511.0  511.0  511.0  511.0
 493.0  493.0  493.0  493.0  493.0     493.0  493.0  493.0  493.0  493.0
 504.0  504.0  504.0  504.0  504.0     504.0  504.0  504.0  504.0  504.0
 488.0  488.0  488.0  488.0  488.0     488.0  488.0  488.0  488.0  488.0
 498.0  498.0  498.0  498.0  498.0  …  498.0  498.0  498.0  498.0  498.0
 523.0  523.0  523.0  523.0  523.0     523.0  523.0  523.0  523.0  523.0
 516.0  516.0  516.0  516.0  516.0     516.0  516.0  516.0  516.0  516.0
   ⋮                     

## Simple gemm kernel

see https://github.com/JuliaSIMD/LoopVectorization.jl/issues/136

In [18]:
# runs, but not testes for correctness
function LinearAlgebra.mul!(C::Matrix{T}, A::Matrix{UInt8}, B::Vector{T}) where T 
    packedstride = size(A, 1)
    m = size(A, 1)
    n = size(A, 2)
    q = size(B, 2)
    # C[i, k] = A[i, j] * B[j, k] for j in 1:n
    @avx for k in 1:n
        for j in 1:q
            for i in 1:m
                l = 2 * ((i-1) & 3)
                block = A[(j-1) * packedstride + ((i-1) >> 2) + 1]
                Aij = (block >> l) & 3
                C[i, k] += (((Aij >= 2) + (Aij >= 3))) * B[j, k]
            end
        end
    end
    y
end
n = 1024
C = rand(n>>2, n)
B = rand(n, n)
A = rand(UInt8, (n>>2) + (n%4 != 0), n)
mul!(C, A, B)
C

256×1024 Matrix{Float64}:
 66434.3  64444.8  64477.0  64325.0  …  64434.7  64684.9  65697.5  67257.2
 64257.3  61606.1  63287.7  63319.2     63468.6  62763.4  62964.8  63074.6
 67281.3  66249.3  64499.8  66071.7     66680.0  64896.3  67123.5  67308.1
 63134.7  63196.3  62403.8  62460.3     64281.3  62370.7  63617.7  63397.0
 66956.1  66625.4  65609.6  64534.9     64802.6  65023.1  66008.6  66759.5
 67794.7  64426.7  64144.5  64350.9  …  64788.2  63908.4  64972.4  64790.6
 67356.6  64806.9  64644.5  66182.2     65452.9  64286.9  66074.6  66276.3
 66572.2  64912.0  65080.6  64689.8     65311.9  64538.2  65481.4  66282.2
 65137.1  63702.4  64275.9  64050.9     65489.8  63791.7  64423.3  65201.1
 64235.2  63802.3  63511.9  62459.4     63186.4  62899.6  62262.1  64026.0
 66430.6  66459.6  65568.3  65449.1  …  66250.1  64175.8  66636.0  66110.1
 68341.8  66758.4  66542.2  65753.8     66790.0  67162.7  67627.1  67548.5
 64793.8  64138.0  63631.7  64048.2     64150.4  63272.4  65139.7  65746.7