In [1]:
using Revise
using SnpArrays
using LinearAlgebra
using Random
using LoopVectorization
using MendelIHT

┌ Info: Precompiling SnpArrays [4e780e97-f5bf-4111-9dc4-b70aaf691b06]
└ @ Base loading.jl:1317
┌ Info: Precompiling MendelIHT [921c7187-1484-5754-b919-5d3ed9ac03c4]
└ @ Base loading.jl:1317


## No center/scale/impute

In [43]:
# x = SnpArray(SnpArrays.datadir("EUR_subset.bed"))
m = 1000
n = 1025
p = 1024
x = simulate_random_snparray(undef, m, n)

A = SnpLinAlg{Float64}(x, model=ADDITIVE_MODEL, impute=false, center=false, scale=false)
B = ones(n, p)
C = zeros(m, p)
LinearAlgebra.mul!(C, A, B)

M = 250, Miter = 0, Mrem = 250
N = 1025, Niter = 1, Nrem = 1
P = 1024, Piter = 1, Prem = 0


In [44]:
Ctrue = convert(Matrix{Float64}, x, model=ADDITIVE_MODEL) * B
@show all(C .≈ Ctrue)

all(C .≈ Ctrue) = true


true

## Simple gemm kernel

see https://github.com/JuliaSIMD/LoopVectorization.jl/issues/136

In [30]:
# runs, but not testes for correctness
function LinearAlgebra.mul!(C::Matrix{T}, A::Matrix{UInt8}, B::Vector{T}) where T 
    packedstride = size(A, 1)
    m = size(A, 1)
    n = size(A, 2)
    q = size(B, 2)
    # C[i, c] = A[i, j] * B[j, c] for j in 1:n
    @avx for c in 1:n
        for j in 1:q
            for i in 1:m
                l = 2 * ((i-1) & 3)
                block = A[(j-1) * packedstride + ((i-1) >> 2) + 1]
                Aij = (block >> l) & 3
                C[i, c] += (((Aij >= 2) + (Aij >= 3))) * B[j, c]
            end
        end
    end
    y
end
n = 1024
C = rand(n>>2, n)
B = rand(n, n)
A = rand(UInt8, (n>>2) + (n%4 != 0), n)
mul!(C, A, B)
C

256×1024 Matrix{Float64}:
 66012.8  66641.7  65025.1  64186.5  …  62550.3  66977.2  62757.8  64928.9
 65071.0  66994.6  64153.1  63523.2     59848.2  65712.0  60407.1  65453.8
 63063.4  64976.7  62558.5  62178.4     60108.9  64247.2  59379.2  63561.1
 67164.3  69165.1  66512.6  65626.3     62287.9  66759.9  63090.0  66733.7
 64344.0  67225.5  62390.2  62855.4     61037.4  66694.4  60443.2  63421.1
 66974.4  69101.0  65191.4  65199.6  …  62378.3  66368.3  62947.4  65874.6
 67436.5  67708.1  66113.9  65659.2     62817.5  68048.3  64262.6  67750.7
 67885.3  67859.0  66351.9  64965.0     63276.4  67816.0  63294.5  68540.1
 66367.7  69754.9  65240.1  65074.8     63377.6  66524.2  62105.1  65863.2
 66904.8  68131.1  65629.9  64834.6     61812.3  65100.7  62354.6  67237.5
 64348.9  66867.6  64281.4  62161.7  …  63048.0  65335.2  62803.6  64806.2
 63451.1  65376.0  63215.3  61829.6     58754.5  63450.4  60032.9  63124.4
 68083.8  66956.7  65964.5  65147.8     62862.5  66294.5  64135.6  67155.6