diff --git a/docs/SnpArrays.ipynb b/docs/SnpArrays.ipynb index e80e9435..5cb07a4d 100644 --- a/docs/SnpArrays.ipynb +++ b/docs/SnpArrays.ipynb @@ -266,7 +266,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 87.555 μs (64 allocations: 390.03 KiB)\n" + " 94.032 μs (64 allocations: 390.03 KiB)\n" ] } ], @@ -647,11 +647,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x00 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 19, @@ -679,12 +679,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x00 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 20, @@ -705,11 +704,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - " 0x02 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x02 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 21, @@ -736,11 +735,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `convert` and `copyto!`\n", + "### `convert` and `copyto!`\n", "\n", "Most common usage of SnpArray is to convert genotypes to numeric values for statistical analysis. Conversion rule depends on genetic models (additive, dominant, or recessive), centering, scaling, or imputation.\n", "\n", - "### `convert`\n", + "#### `convert`\n", "\n", "`convert` function has 4 keyword arguments: `model`, `center`, `scale`, and `impute`.\n", "\n", @@ -1068,7 +1067,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### `copyto!`\n", + "#### `copyto!`\n", "\n", "`copyto!` is the in-place version of `convert`. 
It takes the same keyword arguments (`model`, `center`, `scale`, `impute`) as `convert`." ] @@ -1136,7 +1135,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 3.719 μs (0 allocations: 0 bytes)\n" + " 3.655 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1207,7 +1206,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 7.138 μs (0 allocations: 0 bytes)\n" + " 7.205 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1278,7 +1277,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 6.860 μs (0 allocations: 0 bytes)\n" + " 6.856 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1303,7 +1302,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 50.433 ms (10150 allocations: 475.78 KiB)\n" + " 50.329 ms (10150 allocations: 475.78 KiB)\n" ] } ], @@ -1333,7 +1332,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 64.308 ms (0 allocations: 0 bytes)\n" + " 64.344 ms (0 allocations: 0 bytes)\n" ] } ], @@ -1346,9 +1345,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Summaries\n", + "### Summaries\n", "\n", - "### Counts\n", + "#### Counts\n", "\n", "Counts of each the four possible values for each column are returned by `counts`.`" ] @@ -1396,7 +1395,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 6.047 ns (0 allocations: 0 bytes)\n" + " 6.043 ns (0 allocations: 0 bytes)\n" ] } ], @@ -1408,7 +1407,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Minor allele frequencies\n", + "#### Minor allele frequencies\n", "\n", "Minor allele frequencies (MAF) for each SNP." ] @@ -1516,7 +1515,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### `mean` and `var`\n", + "#### `mean` and `var`\n", "\n", "The package provides methods for the generics `mean` and `var` from the `Statistics` package." 
] @@ -1600,7 +1599,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 17.328 μs (2 allocations: 79.39 KiB)\n" + " 17.263 μs (2 allocations: 79.39 KiB)\n" ] } ], @@ -1665,7 +1664,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Missing rate\n", + "#### Missing rate\n", "\n", "Proportion of missing genotypes" ] @@ -1766,7 +1765,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Location of the missing values\n", + "#### Location of the missing values\n", "\n", "The positions of the missing data are evaluated by" ] @@ -1826,7 +1825,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 43.661 ms (19274 allocations: 1.80 MiB)\n" + " 43.843 ms (19274 allocations: 1.80 MiB)\n" ] } ], @@ -1894,7 +1893,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Genetic relationship matrix\n", + "### Genetic relationship matrix\n", "\n", "`grm` function computes the empirical kinship matrix using either the classical genetic relationship matrix, `grm(A, model=:GRM)`, or the method of moment method, `grm(A, model=:MoM)`, or the robust method, `grm(A, model=:Robust)`. \n", "\n", @@ -1958,7 +1957,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 502.287 ms (25 allocations: 28.95 MiB)\n" + " 505.403 ms (25 allocations: 28.95 MiB)\n" ] } ], @@ -2028,7 +2027,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 315.064 ms (26 allocations: 14.60 MiB)\n" + " 312.411 ms (26 allocations: 14.60 MiB)\n" ] } ], @@ -2148,7 +2147,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Filtering\n", + "### Filtering\n", "\n", "Before GWAS, we often need to filter SNPs and/or samples according to genotyping success rates, minor allele frequencies, and Hardy-Weinberg Equilibrium test. 
This can be achieved by the `filter` function.\n", "\n", @@ -2213,7 +2212,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 150.800 ms (11460 allocations: 171.28 MiB)\n" + " 148.931 ms (11460 allocations: 171.28 MiB)\n" ] } ], @@ -2235,7 +2234,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Filter Plink files\n", + "#### Filter Plink files\n", "\n", "Filter a set of Plink files according to row indices and column indices. By result, filtered Plink files are saved as `srcname.filtered.bed`, `srcname.filtered.fam`, and `srcname.filtered.bim`, where `srcname` is the source Plink file name. You can also specify destimation file name using keyword `des`." ] @@ -2507,15 +2506,10 @@ { "data": { "text/plain": [ - "7-element Array{String,1}:\n", + "3-element Array{String,1}:\n", " \"./tmp_hcat_arr_1.bed\"\n", " \"./tmp_hvcat_arr_1.bed\"\n", - " \"./tmp_vcat_arr_1.bed\"\n", - " \"./tmp_vcat_arr_2.bed\"\n", - " \"./tmp_vcat_arr_3.bed\"\n", - " \"./tmp_vcat_arr_4.bed\"\n", - " \"./tmp_vcat_arr_5.bed\"" - + " \"./tmp_vcat_arr_1.bed\"" ] }, "execution_count": 71, @@ -2629,12 +2623,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SnpBitMatrix\n", + "## Linear Algebra\n", "\n", - "In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved by the `SnpBitMatrix` type. The implementation assumes:\n", + "In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved in three different `struct`s:\n", "\n", - "1. the SnpArray does not have missing genotypes, and\n", - "2. the matrix corresponding to SnpArray is the matrix of A2 allele counts.\n", + "1. Direct operations on a plink-formatted `SnpArray`: `SnpLinAlg`\n", + "2. Operations on transformed `BitMatrix`es: `SnpBitMatrix`\n", + "3. 
Direct operations on a plink-formatted data on an Nvidia GPU: `CuSnpArray`.\n", + "\n", + "`SnpLinAlg` and `SnpBitMatrix` use Chris Elrod's [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) internally. It is much faster on machines with AVX support. `CuSnpArray` uses [CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/) internally.\n", + "\n", + "The implementation assumes that the matrix corresponding to SnpArray is the matrix of the A2 allele counts. `SnpLinAlg` and `CuSnpArray` impute any missing genotype with its column mean by default. They can also configured to impute missing genotypes with zero. `SnpBitMatrix` can only impute missing values with zero. \n", "\n", "### Constructor\n", "\n", @@ -2691,7 +2690,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To instantiate a SnpBitMatrix based on SnpArray," + "To instantiate a SnpLinAlg based on SnpArray," ] }, { @@ -2700,6 +2699,8 @@ "metadata": {}, "outputs": [], "source": [ + "const EURsla = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true);\n", + "const EURsla_ = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true, impute=false);\n", "const EURbm = SnpBitMatrix{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true);" ] }, @@ -2707,9 +2708,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpBitMatrix acts like a Float64 matrix.\n", + "The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpLinAlg acts like a Float64 matrix.\n", + "SnpLinAlg directly uses the SnpArray for computation. 
\n", "\n", - "The memory usage of the SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`." + "On the other hand, memory usage of SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`." ] }, { @@ -2720,7 +2722,7 @@ { "data": { "text/plain": [ - "(6876757, 6421960)" + "(6876757, 8177221, 6421960)" ] }, "execution_count": 78, @@ -2729,16 +2731,16 @@ } ], "source": [ - "Base.summarysize(EUR), Base.summarysize(EURbm)" + "Base.summarysize(EUR), Base.summarysize(EURsla), Base.summarysize(EURbm)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Linear algebra\n", + "### `mul!()`\n", "\n", - "A SnpBitMatrix acts similar to a regular matrix and responds to `size`, `eltype`, and SnpBitMatrix-vector multiplications." + "SnpLinAlg and SnpBitMatrix act similar to a regular matrix and responds to `size`, `eltype`, and matrix-vector multiplications." ] }, { @@ -2750,6 +2752,9 @@ "name": "stdout", "output_type": "stream", "text": [ + "size(EURsla) = (379, 54051)\n", + "eltype(EURsla) = Float64\n", + "typeof(EURsla) <: AbstractMatrix = true\n", "size(EURbm) = (379, 54051)\n", "eltype(EURbm) = Float64\n", "typeof(EURbm) <: AbstractMatrix = true\n" @@ -2757,6 +2762,10 @@ } ], "source": [ + "@show size(EURsla)\n", + "@show eltype(EURsla)\n", + "@show typeof(EURsla) <: AbstractMatrix;\n", + "\n", "@show size(EURbm)\n", "@show eltype(EURbm)\n", "@show typeof(EURbm) <: AbstractMatrix;" @@ -2766,31 +2775,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "SnpBitMatrix-vector multiplication is mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray." 
+ "Matrix-vector multiplications with SnpLinAlg and SnpBitMatrix are mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray." ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.837775572500693e-11" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "using LinearAlgebra\n", "v1 = randn(size(EUR, 1))\n", "v2 = randn(size(EUR, 2))\n", - "A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true)\n", - "norm(EURbm * v2 - A * v2)" + "A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true);" ] }, { @@ -2801,7 +2798,7 @@ { "data": { "text/plain": [ - "5.399567080480323e-12" + "3.255157000354014e-11" ] }, "execution_count": 81, @@ -2810,14 +2807,7 @@ } ], "source": [ - "norm(EURbm' * v1 - A' * v1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, the Float64 matrix fits into memory so the SnpBitMatrix-vector multiplication is much slower than Matrix{Float64}-vector multiplication (highly optimized BLAS)." 
+ "norm(EURsla * v2 - A * v2)" ] }, { @@ -2826,17 +2816,18 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 8.495 ms (0 allocations: 0 bytes)\n" - ] + "data": { + "text/plain": [ + "5.398163705112499e-12" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "out1 = Vector{Float64}(undef, size(EUR, 1))\n", - "out2 = Vector{Float64}(undef, size(EUR, 2))\n", - "@btime(mul!($out1, $EURbm, $v2));" + "norm(EURsla' * v1 - A' *v1)" ] }, { @@ -2845,15 +2836,18 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 7.844 ms (0 allocations: 0 bytes)\n" - ] + "data": { + "text/plain": [ + "3.590044545281484e-11" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "@btime(mul!($out1, $A, $v2));" + "norm(EURbm * v2 - A * v2)" ] }, { @@ -2862,39 +2856,39 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 6.612 ms (1 allocation: 16 bytes)\n" - ] + "data": { + "text/plain": [ + "8.54121862883797e-12" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "@btime(mul!($out2, $transpose($EURbm), $v1));" + "norm(EURbm' * v1 - A' * v1)" ] }, { - "cell_type": "code", - "execution_count": 85, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 3.011 ms (0 allocations: 0 bytes)\n" - ] - } - ], "source": [ - "@btime(mul!($out2, $transpose($A), $v1));" + "See Linear Algebra page for performance comparison among BLAS, SnpLinAlg, and SnpBitMatrix. In general, SnpLinAlg and SnpBitMatrix operations are at least twice faster than the corresponding Matrix{Float64}-vector multiplication. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In another test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 folder faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory." + "In general, computing $Ax$ is more effective in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory, and can impute missing values with column means. See Linear Algebra page for more information." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 fold faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory." ] }, { @@ -2906,7 +2900,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -2916,7 +2910,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -2925,7 +2919,7 @@ "2600" ] }, - "execution_count": 87, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -2936,7 +2930,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 87, "metadata": {}, "outputs": [ { @@ -2957,16 +2951,16 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "8.987658251177042e-14" + "5.05574647736198e-14" ] }, - "execution_count": 89, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -2981,16 +2975,16 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1.6136185547316357e-14" + "1.4647267077134765e-14" ] }, - "execution_count": 90, + "execution_count": 89, "metadata": {}, 
"output_type": "execute_result" } @@ -3003,77 +2997,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Two alternatives: SnpLinAlg and CuSnpArray\n", - "\n", - "Basically, creating SnpBitMatrix doubles the memory usage. If it becomes an issue, direct linear algebra operations from a SnpArray is possible. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### SnpLinAlg" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [], - "source": [ - "const EURla = SnpLinAlg{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale=true);" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 6.880 ms (264 allocations: 14.03 KiB)\n" - ] - } - ], - "source": [ - "v1 = randn(size(EUR, 1))\n", - "v2 = randn(size(EUR, 2))\n", - "out1 = Vector{Float64}(undef, size(EUR, 1))\n", - "out2 = Vector{Float64}(undef, size(EUR, 2))\n", - "@btime mul!($out1, $EURla, $v2);" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 9.760 ms (3 allocations: 128 bytes)\n" - ] - } - ], - "source": [ - "@btime mul!($out2, transpose($EURla), $v1);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory. See Linear Algebra page for more information." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### CuSnpArray" + "### GPU support: CuSnpArray" ] }, { @@ -3085,7 +3009,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 90, "metadata": {}, "outputs": [ { @@ -3101,6 +3025,10 @@ "source": [ "ENV[\"JULIA_CUDA_USE_BINARYBUILDER\"] = \"false\" # will use local CUDA installation\n", "using CUDA, Adapt\n", + "out1 = randn(size(EUR, 1))\n", + "out2 = randn(size(EUR, 2))\n", + "v1 = randn(size(EUR, 1))\n", + "v2 = randn(size(EUR, 2))\n", "v1_d = adapt(CuVector{Float64}, v1) # sends data to GPU\n", "v2_d = adapt(CuVector{Float64}, v2)\n", "out1_d = adapt(CuVector{Float64}, out1)\n", @@ -3111,7 +3039,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 91, "metadata": {}, "outputs": [ { @@ -3127,7 +3055,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 18.361 ms (253 allocations: 7.69 KiB)\n" + " 18.204 ms (253 allocations: 7.69 KiB)\n" ] } ], @@ -3137,14 +3065,14 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 626.435 μs (465 allocations: 15.00 KiB)\n" + " 544.722 μs (465 allocations: 15.00 KiB)\n" ] } ], @@ -3168,16 +3096,16 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4.38122146476498e-11" + "2.2306544292754367e-11" ] }, - "execution_count": 97, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -3204,7 +3132,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 94, "metadata": {}, "outputs": [ { @@ -3239,7 +3167,7 @@ ")" ] }, - "execution_count": 98, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -3259,7 +3187,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 95, "metadata": {}, "outputs": [ { @@ -3294,7 +3222,7 @@ 
")" ] }, - "execution_count": 99, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -3305,7 +3233,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 96, "metadata": {}, "outputs": [ { @@ -3340,7 +3268,7 @@ ")" ] }, - "execution_count": 100, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -3351,7 +3279,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 97, "metadata": {}, "outputs": [ { @@ -3386,7 +3314,7 @@ ")" ] }, - "execution_count": 101, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -3404,7 +3332,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 98, "metadata": {}, "outputs": [ { @@ -3439,7 +3367,7 @@ ")" ] }, - "execution_count": 102, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } @@ -3459,7 +3387,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -3474,7 +3402,7 @@ " \"18\" => SnpData(people: 379, snps: 12242,…" ] }, - "execution_count": 103, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -3492,7 +3420,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 100, "metadata": {}, "outputs": [ { @@ -3527,7 +3455,7 @@ ")" ] }, - "execution_count": 104, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -3538,7 +3466,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -3547,7 +3475,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 102, "metadata": {}, "outputs": [ { @@ -3558,7 +3486,7 @@ " \"2\" => SnpData(people: 201, snps: 54051,…" ] }, - "execution_count": 106, + "execution_count": 102, "metadata": {}, "output_type": "execute_result" } @@ -3578,7 +3506,7 @@ }, { "cell_type": "code", - "execution_count": 107, + 
"execution_count": 103, "metadata": {}, "outputs": [ { @@ -3613,7 +3541,7 @@ ")" ] }, - "execution_count": 107, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -3624,7 +3552,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -3659,7 +3587,7 @@ ")" ] }, - "execution_count": 108, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -3670,7 +3598,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 105, "metadata": {}, "outputs": [ { @@ -3705,7 +3633,7 @@ ")" ] }, - "execution_count": 109, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -3716,7 +3644,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 106, "metadata": {}, "outputs": [ { @@ -3751,7 +3679,7 @@ ")" ] }, - "execution_count": 110, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } @@ -3762,7 +3690,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 107, "metadata": {}, "outputs": [ { @@ -3797,7 +3725,7 @@ ")" ] }, - "execution_count": 111, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -3808,7 +3736,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 108, "metadata": {}, "outputs": [ { @@ -3843,7 +3771,7 @@ ")" ] }, - "execution_count": 112, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -3863,16 +3791,16 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 0.066388 seconds (117.18 k allocations: 7.904 MiB)\n", - " 0.049178 seconds (125.97 k allocations: 11.520 MiB)\n", - " 0.051638 seconds (134.18 k allocations: 8.737 MiB)\n" + " 0.067004 seconds (117.20 k allocations: 7.905 MiB)\n", + " 0.102031 seconds (125.97 k allocations: 11.520 MiB, 51.01% gc 
time)\n", + " 0.050454 seconds (134.18 k allocations: 8.737 MiB)\n" ] }, { @@ -3907,7 +3835,7 @@ ")" ] }, - "execution_count": 113, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } @@ -3925,17 +3853,17 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 0.143816 seconds (783.53 k allocations: 36.258 MiB)\n", - " 0.575327 seconds (1.19 M allocations: 61.140 MiB)\n", - " 0.003704 seconds (8 allocations: 4.897 MiB)\n", - " 0.000827 seconds (8 allocations: 1.650 MiB)\n" + " 0.146038 seconds (783.53 k allocations: 36.252 MiB)\n", + " 0.568753 seconds (1.19 M allocations: 61.158 MiB)\n", + " 0.003368 seconds (8 allocations: 4.897 MiB)\n", + " 0.001259 seconds (8 allocations: 1.650 MiB)\n" ] }, { @@ -3970,7 +3898,7 @@ ")" ] }, - "execution_count": 114, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -3990,7 +3918,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 111, "metadata": {}, "outputs": [ { @@ -3999,7 +3927,7 @@ "Process(`\u001b[4mcp\u001b[24m \u001b[4m/home/kose/.julia/dev/SnpArrays/src/../data/mouse.fam\u001b[24m \u001b[4mmouse_reorder.fam\u001b[24m`, ProcessExited(0))" ] }, - "execution_count": 115, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -4013,7 +3941,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 112, "metadata": {}, "outputs": [ { @@ -4022,7 +3950,7 @@ "(1940, 10150)" ] }, - "execution_count": 116, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -4042,7 +3970,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -4053,7 +3981,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 114, "metadata": {}, "outputs": [ { @@ -4075,12 +4003,12 @@ "│ Row │ fid │ iid │ 
father │ mother │ sex │ phenotype │\n", "│ │ Abstrac… │ AbstractS… │ AbstractStr… │ AbstractStr… │ Abstrac… │ Abstract… │\n", "├─────┼──────────┼────────────┼──────────────┼──────────────┼──────────┼───────────┤\n", - "│ 1 │ 1_5 │ A053629514 │ F5.3:E2.2(6) │ F5.3:A3.2(5) │ 1 │ -9 │\n", - "│ 2 │ 1_18 │ A067077564 │ G1.5:B5.4(5) │ G1.5:F5.4(5) │ 1 │ -9 │\n", - "│ 3 │ 1_2 │ A048029086 │ G1.3:B1.2(3) │ G1.3:F3.2(3) │ 2 │ -9 │\n", - "│ 4 │ 1_1 │ A048097274 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │\n", - "│ 5 │ 1_15 │ A063796366 │ H2.5:C3.4(3) │ H2.5:G4.4(2) │ 2 │ -9 │\n", - "│ 6 │ 1_45 │ A084280094 │ 6.6:F1.5(6) │ 6.6:G1.5(1) │ 1 │ -9 │\n", + "│ 1 │ 1_3 │ A067289112 │ E2.4:H3.3(2) │ E2.4:D4.3(2) │ 1 │ -9 │\n", + "│ 2 │ 1_1 │ A048031769 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │\n", + "│ 3 │ 1_2 │ A084286354 │ H4.4:C2.3(5) │ H4.4:G3.3(7) │ 1 │ -9 │\n", + "│ 4 │ 1_3 │ A048072266 │ G5.2:B5.1(4) │ G5.2:F5.1(2) │ 2 │ -9 │\n", + "│ 5 │ 1_3 │ A063311623 │ A5.4:D4.3(5) │ A5.4:H1.3(6) │ 1 │ -9 │\n", + "│ 6 │ 1_54 │ A063011268 │ B3.5:E4.4(2) │ B3.5:A3.4(4) │ 1 │ -9 │\n", "…,\n", "srcbed: mouse_reorder.bed\n", "srcbim: mouse_reorder.bim\n", @@ -4088,7 +4016,7 @@ ")" ] }, - "execution_count": 118, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } @@ -4106,7 +4034,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/SnpLinAlg.ipynb b/docs/SnpLinAlg.ipynb index 0b448f90..60563dda 100644 --- a/docs/SnpLinAlg.ipynb +++ b/docs/SnpLinAlg.ipynb @@ -96,11 +96,13 @@ "source": [ "EUR_100_bm = SnpBitMatrix{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false)\n", "EUR_100_sla = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false)\n", - "EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=true, scale=true);\n", + "EUR_100_sla_ = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, 
impute=false)\n", + "EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=false, scale=false);\n", "\n", "EUR_101_bm = SnpBitMatrix{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false)\n", "EUR_101_sla = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false)\n", - "EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=true, scale=true);" + "EUR_101_sla_ = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false, impute=false)\n", + "EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=false, scale=false);" ] }, { @@ -121,7 +123,8 @@ "source": [ "ENV[\"JULIA_CUDA_USE_BINARYBUILDER\"] = \"false\"\n", "using CUDA\n", - "EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false);" + "EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false);\n", + "EUR_100_cu_ = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false);" ] }, { @@ -148,6 +151,7 @@ "outputs": [], "source": [ "v1 = randn(size(EUR_100, 1))\n", + "v1_ = randn(size(EUR_100, 1))\n", "v2 = randn(size(EUR_100, 2));" ] }, @@ -170,12 +174,12 @@ " memory estimate: 0 bytes\n", " allocs estimate: 0\n", " --------------\n", - " minimum time: 321.207 ms (0.00% GC)\n", - " median time: 347.112 ms (0.00% GC)\n", - " mean time: 375.497 ms (0.00% GC)\n", - " maximum time: 769.390 ms (0.00% GC)\n", + " minimum time: 383.853 ms (0.00% GC)\n", + " median time: 417.986 ms (0.00% GC)\n", + " mean time: 431.205 ms (0.00% GC)\n", + " maximum time: 530.847 ms (0.00% GC)\n", " --------------\n", - " samples: 15\n", + " samples: 12\n", " evals/sample: 1" ] }, @@ -208,10 +212,10 @@ " memory estimate: 0 bytes\n", " allocs estimate: 0\n", " --------------\n", - " minimum time: 1.745 s (0.00% GC)\n", - " median time: 1.905 s (0.00% GC)\n", - " mean time: 2.144 s (0.00% GC)\n", - " maximum time: 2.782 s (0.00% 
GC)\n", + " minimum time: 1.849 s (0.00% GC)\n", + " median time: 2.064 s (0.00% GC)\n", + " mean time: 2.047 s (0.00% GC)\n", + " maximum time: 2.229 s (0.00% GC)\n", " --------------\n", " samples: 3\n", " evals/sample: 1" @@ -231,7 +235,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Direct linear algebra on a SnpArray: " + "Direct linear algebra on a SnpArray, with mean imputation: " ] }, { @@ -243,15 +247,15 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 8.23 KiB\n", - " allocs estimate: 158\n", + " memory estimate: 38.33 KiB\n", + " allocs estimate: 1616\n", " --------------\n", - " minimum time: 1.014 s (0.00% GC)\n", - " median time: 1.021 s (0.00% GC)\n", - " mean time: 1.020 s (0.00% GC)\n", - " maximum time: 1.027 s (0.00% GC)\n", + " minimum time: 1.241 s (0.00% GC)\n", + " median time: 1.355 s (0.00% GC)\n", + " mean time: 1.352 s (0.00% GC)\n", + " maximum time: 1.457 s (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 4\n", " evals/sample: 1" ] }, @@ -268,7 +272,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The below is the benchmark for SnpBitMatrix:" + "With zero imputation:" ] }, { @@ -280,13 +284,13 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 0 bytes\n", - " allocs estimate: 0\n", + " memory estimate: 38.33 KiB\n", + " allocs estimate: 1616\n", " --------------\n", - " minimum time: 1.041 s (0.00% GC)\n", - " median time: 1.056 s (0.00% GC)\n", - " mean time: 1.058 s (0.00% GC)\n", - " maximum time: 1.076 s (0.00% GC)\n", + " minimum time: 1.017 s (0.00% GC)\n", + " median time: 1.037 s (0.00% GC)\n", + " mean time: 1.034 s (0.00% GC)\n", + " maximum time: 1.045 s (0.00% GC)\n", " --------------\n", " samples: 5\n", " evals/sample: 1" @@ -298,30 +302,58 @@ } ], "source": [ - "@benchmark (LinearAlgebra.mul!($v1, $EUR_100_bm, $v2))" + "@benchmark LinearAlgebra.mul!($v1, $EUR_100_sla_, $v2)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ - "Now let's try CUDA. The device is Nvidia Titan V." + "Indeed, we are paying some price for mean imputation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The below is the benchmark for SnpBitMatrix (always zero-imputed):" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "BenchmarkTools.Trial: \n", + " memory estimate: 0 bytes\n", + " allocs estimate: 0\n", + " --------------\n", + " minimum time: 1.045 s (0.00% GC)\n", + " median time: 1.051 s (0.00% GC)\n", + " mean time: 1.051 s (0.00% GC)\n", + " maximum time: 1.058 s (0.00% GC)\n", + " --------------\n", + " samples: 5\n", + " evals/sample: 1" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "using Adapt" + "@benchmark (LinearAlgebra.mul!($v1, $EUR_100_bm, $v2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Moving data to GPU: " + "At first glance, the result from SnpBitMatrix might look better than SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8." 
] }, { @@ -330,8 +362,8 @@ "metadata": {}, "outputs": [], "source": [ - "v1_d = adapt(CuArray{Float64}, v1)\n", - "v2_d = adapt(CuArray{Float64}, v2);" + "v1 = randn(size(EUR_101, 1))\n", + "v2 = randn(size(EUR_101, 2));" ] }, { @@ -339,28 +371,19 @@ "execution_count": 15, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Warning: `Target(triple::String)` is deprecated, use `Target(; triple = triple)` instead.\n", - "│ caller = ip:0x0\n", - "└ @ Core :-1\n" - ] - }, { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 3.28 KiB\n", - " allocs estimate: 130\n", + " memory estimate: 44.13 KiB\n", + " allocs estimate: 1722\n", " --------------\n", - " minimum time: 22.138 ms (0.00% GC)\n", - " median time: 22.356 ms (0.00% GC)\n", - " mean time: 22.352 ms (0.00% GC)\n", - " maximum time: 23.825 ms (0.00% GC)\n", + " minimum time: 1.259 s (0.00% GC)\n", + " median time: 1.268 s (0.00% GC)\n", + " mean time: 1.268 s (0.00% GC)\n", + " maximum time: 1.277 s (0.00% GC)\n", " --------------\n", - " samples: 224\n", + " samples: 4\n", " evals/sample: 1" ] }, @@ -370,15 +393,7 @@ } ], "source": [ - "using BenchmarkTools\n", - "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_cu, $v2_d)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The speedup is obvious. 
Let's check correctness:" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2)" ] }, { @@ -389,7 +404,17 @@ { "data": { "text/plain": [ - "true" + "BenchmarkTools.Trial: \n", + " memory estimate: 44.13 KiB\n", + " allocs estimate: 1722\n", + " --------------\n", + " minimum time: 1.038 s (0.00% GC)\n", + " median time: 1.043 s (0.00% GC)\n", + " mean time: 1.043 s (0.00% GC)\n", + " maximum time: 1.051 s (0.00% GC)\n", + " --------------\n", + " samples: 5\n", + " evals/sample: 1" ] }, "execution_count": 16, @@ -398,60 +423,62 @@ } ], "source": [ - "isapprox(collect(v1_d), v1)" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla_, $v2)" ] }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "EUR_100_mat_d = adapt(CuArray, EUR_100_mat);" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 2.58 KiB\n", - " allocs estimate: 85\n", + " memory estimate: 0 bytes\n", + " allocs estimate: 0\n", " --------------\n", - " minimum time: 78.002 ms (0.00% GC)\n", - " median time: 80.142 ms (0.00% GC)\n", - " mean time: 80.055 ms (0.00% GC)\n", - " maximum time: 83.111 ms (0.00% GC)\n", + " minimum time: 1.544 s (0.00% GC)\n", + " median time: 1.561 s (0.00% GC)\n", + " mean time: 1.561 s (0.00% GC)\n", + " maximum time: 1.577 s (0.00% GC)\n", " --------------\n", - " samples: 63\n", + " samples: 4\n", " evals/sample: 1" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_mat_d, $v2_d)" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU." + "Now let's try CUDA. 
The device is Nvidia Titan V." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "using Adapt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At first glance, the result from SnpBitMatrix might look similar to SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8." + "Moving data to GPU: " ] }, { @@ -460,8 +487,12 @@ "metadata": {}, "outputs": [], "source": [ - "v1 = randn(size(EUR_101, 1))\n", - "v2 = randn(size(EUR_101, 2));" + "v1 = randn(size(EUR_100, 1))\n", + "v1_ = randn(size(EUR_100, 1))\n", + "v2 = randn(size(EUR_100, 2));\n", + "v1_d = adapt(CuArray{Float64}, v1)\n", + "v1_d_ = similar(v1_d)\n", + "v2_d = adapt(CuArray{Float64}, v2);" ] }, { @@ -469,19 +500,28 @@ "execution_count": 20, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Warning: `Target(triple::String)` is deprecated, use `Target(; triple = triple)` instead.\n", + "│ caller = ip:0x0\n", + "└ @ Core :-1\n" + ] + }, { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 14.03 KiB\n", - " allocs estimate: 264\n", + " memory estimate: 3.28 KiB\n", + " allocs estimate: 130\n", " --------------\n", - " minimum time: 1.022 s (0.00% GC)\n", - " median time: 1.024 s (0.00% GC)\n", - " mean time: 1.024 s (0.00% GC)\n", - " maximum time: 1.028 s (0.00% GC)\n", + " minimum time: 22.319 ms (0.00% GC)\n", + " median time: 22.429 ms (0.00% GC)\n", + " mean time: 22.722 ms (0.00% GC)\n", + " maximum time: 26.938 ms (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 220\n", " evals/sample: 1" ] }, @@ -491,7 +531,15 @@ } ], "source": [ - "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2)" + "using BenchmarkTools\n", + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_cu, $v2_d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For CuSnpArray, the 
additional cost for mean imputation is negligible." ] }, { @@ -503,15 +551,15 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 0 bytes\n", - " allocs estimate: 0\n", + " memory estimate: 3.28 KiB\n", + " allocs estimate: 130\n", " --------------\n", - " minimum time: 1.187 s (0.00% GC)\n", - " median time: 1.188 s (0.00% GC)\n", - " mean time: 1.191 s (0.00% GC)\n", - " maximum time: 1.197 s (0.00% GC)\n", + " minimum time: 22.243 ms (0.00% GC)\n", + " median time: 22.431 ms (0.00% GC)\n", + " mean time: 22.805 ms (0.00% GC)\n", + " maximum time: 57.050 ms (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 220\n", " evals/sample: 1" ] }, @@ -521,7 +569,73 @@ } ], "source": [ - "@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2)" + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d_, $EUR_100_cu_, $v2_d)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "EUR_100_mat_d = adapt(CuArray, EUR_100_mat);" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BenchmarkTools.Trial: \n", + " memory estimate: 2.58 KiB\n", + " allocs estimate: 85\n", + " --------------\n", + " minimum time: 75.951 ms (0.00% GC)\n", + " median time: 76.516 ms (0.00% GC)\n", + " mean time: 77.909 ms (0.00% GC)\n", + " maximum time: 82.096 ms (0.00% GC)\n", + " --------------\n", + " samples: 65\n", + " evals/sample: 1" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_mat_d, $v2_d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The speedup is obvious, CuSnpArrays is 30-50x faster than on CPU, and using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "true" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isapprox(v1_d, v1_d_)" ] }, { @@ -533,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -545,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -555,16 +669,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 662.131 ms (0.00% GC)\n", - " median time: 678.294 ms (0.00% GC)\n", - " mean time: 701.543 ms (0.00% GC)\n", - " maximum time: 790.350 ms (0.00% GC)\n", + " minimum time: 842.187 ms (0.00% GC)\n", + " median time: 882.157 ms (0.00% GC)\n", + " mean time: 913.111 ms (0.00% GC)\n", + " maximum time: 1.049 s (0.00% GC)\n", " --------------\n", - " samples: 8\n", + " samples: 6\n", " evals/sample: 1" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -575,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -585,16 +699,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 605.351 ms (0.00% GC)\n", - " median time: 618.706 ms (0.00% GC)\n", - " mean time: 620.553 ms (0.00% GC)\n", - " maximum time: 658.692 ms (0.00% GC)\n", + " minimum time: 618.434 ms (0.00% GC)\n", + " median time: 630.293 ms (0.00% GC)\n", + " mean time: 689.063 ms (0.00% GC)\n", + " maximum time: 881.782 ms (0.00% GC)\n", " --------------\n", - " samples: 9\n", + " samples: 8\n", " evals/sample: 1" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -605,7 +719,7 @@ }, { "cell_type": "code", - "execution_count": 25, + 
"execution_count": 28, "metadata": {}, "outputs": [ { @@ -615,16 +729,16 @@ " memory estimate: 3.08 KiB\n", " allocs estimate: 118\n", " --------------\n", - " minimum time: 26.926 ms (0.00% GC)\n", - " median time: 27.330 ms (0.00% GC)\n", - " mean time: 27.494 ms (0.00% GC)\n", - " maximum time: 31.492 ms (0.00% GC)\n", + " minimum time: 26.710 ms (0.00% GC)\n", + " median time: 26.930 ms (0.00% GC)\n", + " mean time: 27.350 ms (0.00% GC)\n", + " maximum time: 32.306 ms (0.00% GC)\n", " --------------\n", - " samples: 182\n", + " samples: 183\n", " evals/sample: 1" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -635,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -644,7 +758,7 @@ "true" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -665,7 +779,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -675,16 +789,16 @@ " memory estimate: 128 bytes\n", " allocs estimate: 3\n", " --------------\n", - " minimum time: 667.369 ms (0.00% GC)\n", - " median time: 681.264 ms (0.00% GC)\n", - " mean time: 697.866 ms (0.00% GC)\n", - " maximum time: 789.988 ms (0.00% GC)\n", + " minimum time: 850.050 ms (0.00% GC)\n", + " median time: 862.818 ms (0.00% GC)\n", + " mean time: 953.739 ms (0.00% GC)\n", + " maximum time: 1.391 s (0.00% GC)\n", " --------------\n", - " samples: 8\n", + " samples: 6\n", " evals/sample: 1" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -695,7 +809,37 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"BenchmarkTools.Trial: \n", + " memory estimate: 128 bytes\n", + " allocs estimate: 3\n", + " --------------\n", + " minimum time: 673.269 ms (0.00% GC)\n", + " median time: 693.418 ms (0.00% GC)\n", + " mean time: 700.729 ms (0.00% GC)\n", + " maximum time: 784.130 ms (0.00% GC)\n", + " --------------\n", + " samples: 8\n", + " evals/sample: 1" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@benchmark LinearAlgebra.mul!($v2, transpose($EUR_101_sla_), $v1)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -705,16 +849,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 612.441 ms (0.00% GC)\n", - " median time: 624.015 ms (0.00% GC)\n", - " mean time: 641.973 ms (0.00% GC)\n", - " maximum time: 770.845 ms (0.00% GC)\n", + " minimum time: 620.830 ms (0.00% GC)\n", + " median time: 637.698 ms (0.00% GC)\n", + " mean time: 656.708 ms (0.00% GC)\n", + " maximum time: 798.543 ms (0.00% GC)\n", " --------------\n", " samples: 8\n", " evals/sample: 1" ] }, - "execution_count": 29, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/src/index.md b/docs/src/index.md index e82efafb..7f1127c3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -167,7 +167,7 @@ Because the file is memory-mapped opening the file and accessing the data is fas @btime(SnpArray(SnpArrays.datadir("mouse.bed"))); ``` - 87.555 μs (64 allocations: 390.03 KiB) + 94.032 μs (64 allocations: 390.03 KiB) By default, the memory-mapped file is read only, changing entries is not allowed. 
@@ -430,11 +430,11 @@ tmpbf = SnpArray(undef, 5, 3) 5×3 SnpArray: - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x00 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -449,11 +449,11 @@ tmpbf = SnpArray("tmp.bed", tmpbf) 5×3 SnpArray: - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x00 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -467,11 +467,11 @@ tmpbf 5×3 SnpArray: - 0x02 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x02 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -481,11 +481,11 @@ tmpbf rm("tmp.bed", force=true) ``` -## `convert` and `copyto!` +### `convert` and `copyto!` Most common usage of SnpArray is to convert genotypes to numeric values for statistical analysis. Conversion rule depends on genetic models (additive, dominant, or recessive), centering, scaling, or imputation. -### `convert` +#### `convert` `convert` function has 4 keyword arguments: `model`, `center`, `scale`, and `impute`. @@ -730,7 +730,7 @@ convert(Vector{Float64}, @view(mouse[:, end]), center=true, scale=true, impute=t -### `copyto!` +#### `copyto!` `copyto!` is the in-place version of `convert`. It takes the same keyword arguments (`model`, `center`, `scale`, `impute`) as `convert`. 
@@ -780,7 +780,7 @@ copyto!(v, @view(mouse[:, 1])) @btime(copyto!($v, $@view(mouse[:, 1]))); ``` - 3.719 μs (0 allocations: 0 bytes) + 3.655 μs (0 allocations: 0 bytes) Copy columns using defaults @@ -830,7 +830,7 @@ copyto!(v2, @view(mouse[:, 1:2])) @btime(copyto!($v2, $@view(mouse[:, 1:2]))); ``` - 7.138 μs (0 allocations: 0 bytes) + 7.205 μs (0 allocations: 0 bytes) Center and scale @@ -879,7 +879,7 @@ copyto!(v, @view(mouse[:, 1]), center=true, scale=true) @btime(copyto!($v, $(@view(mouse[:, 1])), center=true, scale=true)); ``` - 6.860 μs (0 allocations: 0 bytes) + 6.856 μs (0 allocations: 0 bytes) Looping over all columns @@ -895,7 +895,7 @@ end @btime(loop_test($v, $mouse)) ``` - 50.433 ms (10150 allocations: 475.78 KiB) + 50.329 ms (10150 allocations: 475.78 KiB) Copy whole SnpArray @@ -906,12 +906,12 @@ M = similar(mouse, Float64) @btime(copyto!($M, $mouse)); ``` - 64.308 ms (0 allocations: 0 bytes) + 64.344 ms (0 allocations: 0 bytes) -## Summaries +### Summaries -### Counts +#### Counts Counts of each the four possible values for each column are returned by `counts`.` @@ -941,10 +941,10 @@ The counts by column and by row are cached in the `SnpArray` object. Accesses af @btime(counts($mouse, dims=1)); ``` - 6.047 ns (0 allocations: 0 bytes) + 6.043 ns (0 allocations: 0 bytes) -### Minor allele frequencies +#### Minor allele frequencies Minor allele frequencies (MAF) for each SNP. @@ -1026,7 +1026,7 @@ minorallele(mouse) -### `mean` and `var` +#### `mean` and `var` The package provides methods for the generics `mean` and `var` from the `Statistics` package. @@ -1076,7 +1076,7 @@ These methods make use of the cached column or row counts and thus are very fast @btime(mean($mouse, dims=1)); ``` - 17.328 μs (2 allocations: 79.39 KiB) + 17.263 μs (2 allocations: 79.39 KiB) The column-wise or row-wise standard deviations are returned by `std`. 
@@ -1119,7 +1119,7 @@ std(mouse, dims=2) -### Missing rate +#### Missing rate Proportion of missing genotypes @@ -1199,7 +1199,7 @@ missingrate(mouse, 2) -### Location of the missing values +#### Location of the missing values The positions of the missing data are evaluated by @@ -1246,7 +1246,7 @@ mp = missingpos(mouse) @btime(missingpos($mouse)); ``` - 43.661 ms (19274 allocations: 1.80 MiB) + 43.843 ms (19274 allocations: 1.80 MiB) So, for example, the number of missing data values in each column can be evaluated as @@ -1279,7 +1279,7 @@ view(counts(mouse, dims=1), 2:2, :) -## Genetic relationship matrix +### Genetic relationship matrix `grm` function computes the empirical kinship matrix using either the classical genetic relationship matrix, `grm(A, model=:GRM)`, or the method of moment method, `grm(A, model=:MoM)`, or the robust method, `grm(A, model=:Robust)`. @@ -1330,7 +1330,7 @@ grm(mouse, method=:GRM) @btime(grm($mouse, method=:GRM)); ``` - 502.287 ms (25 allocations: 28.95 MiB) + 505.403 ms (25 allocations: 28.95 MiB) Using Float32 (single precision) potentially saves memory usage and computation time. @@ -1378,7 +1378,7 @@ grm(mouse, method=:GRM, t=Float32) @btime(grm($mouse, method=:GRM, t=Float32)); ``` - 315.064 ms (26 allocations: 14.60 MiB) + 312.411 ms (26 allocations: 14.60 MiB) By default, `grm` exlcude SNPs with minor allele frequency below 0.01. This can be changed by the keyword argument `minmaf`. @@ -1463,7 +1463,7 @@ grm(mouse, cinds=1:2:size(mouse, 2)) -## Filtering +### Filtering Before GWAS, we often need to filter SNPs and/or samples according to genotyping success rates, minor allele frequencies, and Hardy-Weinberg Equilibrium test. This can be achieved by the `filter` function. 
@@ -1502,7 +1502,7 @@ count(rowmask), count(colmask) @btime(SnpArrays.filter($mouse, min_success_rate_per_row=0.999, min_success_rate_per_col=0.999)); ``` - 150.800 ms (11460 allocations: 171.28 MiB) + 148.931 ms (11460 allocations: 171.28 MiB) One may use the `rowmask` and `colmask` to filter and save filtering result as Plink files. @@ -1510,7 +1510,7 @@ One may use the `rowmask` and `colmask` to filter and save filtering result as P SnpArrays.filter(SnpArrays.datadir("mouse"), rowmask, colmask) ``` -### Filter Plink files +#### Filter Plink files Filter a set of Plink files according to row indices and column indices. By result, filtered Plink files are saved as `srcname.filtered.bed`, `srcname.filtered.fam`, and `srcname.filtered.bim`, where `srcname` is the source Plink file name. You can also specify destimation file name using keyword `des`. @@ -1694,14 +1694,10 @@ readdir(glob"tmp_*", ".") - 7-element Array{String,1}: + 3-element Array{String,1}: "./tmp_hcat_arr_1.bed" "./tmp_hvcat_arr_1.bed" "./tmp_vcat_arr_1.bed" - "./tmp_vcat_arr_2.bed" - "./tmp_vcat_arr_3.bed" - "./tmp_vcat_arr_4.bed" - "./tmp_vcat_arr_5.bed" @@ -1770,12 +1766,17 @@ rm(SnpArrays.datadir("mouse.test.vcat.bed"), force=true) rm(SnpArrays.datadir("mouse.test.hvcat.bed"), force=true) ``` -## SnpBitMatrix +## Linear Algebra + +In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved in three different `struct`s: -In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved by the `SnpBitMatrix` type. The implementation assumes: +1. Direct operations on a plink-formatted `SnpArray`: `SnpLinAlg` +2. Operations on transformed `BitMatrix`es: `SnpBitMatrix` +3. Direct operations on a plink-formatted data on an Nvidia GPU: `CuSnpArray`. -1. the SnpArray does not have missing genotypes, and -2. 
the matrix corresponding to SnpArray is the matrix of A2 allele counts. +`SnpLinAlg` and `SnpBitMatrix` use Chris Elrod's [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) internally. It is much faster on machines with AVX support. `CuSnpArray` uses [CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/) internally. + +The implementation assumes that the matrix corresponding to SnpArray is the matrix of the A2 allele counts. `SnpLinAlg` and `CuSnpArray` impute any missing genotype with its column mean by default. They can also be configured to impute missing genotypes with zero. `SnpBitMatrix` can only impute missing values with zero. ### Constructor @@ -1819,112 +1820,118 @@ const EUR = SnpArray(SnpArrays.datadir("EUR_subset.bed")) -To instantiate a SnpBitMatrix based on SnpArray, +To instantiate a SnpLinAlg based on SnpArray, ```julia +const EURsla = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true); +const EURsla_ = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true, impute=false); const EURbm = SnpBitMatrix{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true); ``` -The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpBitMatrix acts like a Float64 matrix. +The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpLinAlg acts like a Float64 matrix. +SnpLinAlg directly uses the SnpArray for computation. -The memory usage of the SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`. 
+On the other hand, memory usage of SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`. ```julia -Base.summarysize(EUR), Base.summarysize(EURbm) +Base.summarysize(EUR), Base.summarysize(EURsla), Base.summarysize(EURbm) ``` - (6876757, 6421960) + (6876757, 8177221, 6421960) -### Linear algebra +### `mul!()` -A SnpBitMatrix acts similar to a regular matrix and responds to `size`, `eltype`, and SnpBitMatrix-vector multiplications. +SnpLinAlg and SnpBitMatrix act similar to a regular matrix and responds to `size`, `eltype`, and matrix-vector multiplications. ```julia +@show size(EURsla) +@show eltype(EURsla) +@show typeof(EURsla) <: AbstractMatrix; + @show size(EURbm) @show eltype(EURbm) @show typeof(EURbm) <: AbstractMatrix; ``` + size(EURsla) = (379, 54051) + eltype(EURsla) = Float64 + typeof(EURsla) <: AbstractMatrix = true size(EURbm) = (379, 54051) eltype(EURbm) = Float64 typeof(EURbm) <: AbstractMatrix = true -SnpBitMatrix-vector multiplication is mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray. +Matrix-vector multiplications with SnpLinAlg and SnpBitMatrix are mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray. 
```julia using LinearAlgebra v1 = randn(size(EUR, 1)) v2 = randn(size(EUR, 2)) -A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true) -norm(EURbm * v2 - A * v2) +A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true); +``` + + +```julia +norm(EURsla * v2 - A * v2) ``` - 2.837775572500693e-11 + 3.255157000354014e-11 ```julia -norm(EURbm' * v1 - A' * v1) +norm(EURsla' * v1 - A' *v1) ``` - 5.399567080480323e-12 - + 5.398163705112499e-12 -In this example, the Float64 matrix fits into memory so the SnpBitMatrix-vector multiplication is much slower than Matrix{Float64}-vector multiplication (highly optimized BLAS). ```julia -out1 = Vector{Float64}(undef, size(EUR, 1)) -out2 = Vector{Float64}(undef, size(EUR, 2)) -@btime(mul!($out1, $EURbm, $v2)); +norm(EURbm * v2 - A * v2) ``` - 8.495 ms (0 allocations: 0 bytes) -```julia -@btime(mul!($out1, $A, $v2)); -``` + 3.590044545281484e-11 - 7.844 ms (0 allocations: 0 bytes) ```julia -@btime(mul!($out2, $transpose($EURbm), $v1)); +norm(EURbm' * v1 - A' * v1) ``` - 6.612 ms (1 allocation: 16 bytes) -```julia -@btime(mul!($out2, $transpose($A), $v1)); -``` + 8.54121862883797e-12 + - 3.011 ms (0 allocations: 0 bytes) +See Linear Algebra page for performance comparison among BLAS, SnpLinAlg, and SnpBitMatrix. In general, SnpLinAlg and SnpBitMatrix operations are at least twice as fast as the corresponding Matrix{Float64}-vector multiplication. -In another test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 folder faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory. +In general, computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory, and can impute missing values with column means. See Linear Algebra page for more information. 
+ +In a test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 fold faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory. `SnpBitMatrix` can be created from a subarray of SnpArray. @@ -1970,7 +1977,7 @@ norm(EURsubbm * v2 - A * v2) - 8.987658251177042e-14 + 5.05574647736198e-14 @@ -1982,43 +1989,11 @@ norm(EURsubbm' * v1 - A' * v1) - 1.6136185547316357e-14 - - - -### Two alternatives: SnpLinAlg and CuSnpArray - -Basically, creating SnpBitMatrix doubles the memory usage. If it becomes an issue, direct linear algebra operations from a SnpArray is possible. - -#### SnpLinAlg - - -```julia -const EURla = SnpLinAlg{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale=true); -``` - - -```julia -v1 = randn(size(EUR, 1)) -v2 = randn(size(EUR, 2)) -out1 = Vector{Float64}(undef, size(EUR, 1)) -out2 = Vector{Float64}(undef, size(EUR, 2)) -@btime mul!($out1, $EURla, $v2); -``` - - 6.880 ms (264 allocations: 14.03 KiB) - - -```julia -@btime mul!($out2, transpose($EURla), $v1); -``` - - 9.760 ms (3 allocations: 128 bytes) + 1.4647267077134765e-14 -Computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory. See Linear Algebra page for more information. -#### CuSnpArray +### GPU support: CuSnpArray On machines with Nvidia GPU, matrix-vector multiplications can be performed on it via CuSnpArray. The input vectors should be CuVectors. 
@@ -2026,6 +2001,10 @@ On machines with Nvidia GPU, matrix-vector multiplications can be performed on i ```julia ENV["JULIA_CUDA_USE_BINARYBUILDER"] = "false" # will use local CUDA installation using CUDA, Adapt +out1 = randn(size(EUR, 1)) +out2 = randn(size(EUR, 2)) +v1 = randn(size(EUR, 1)) +v2 = randn(size(EUR, 2)) v1_d = adapt(CuVector{Float64}, v1) # sends data to GPU v2_d = adapt(CuVector{Float64}, v2) out1_d = adapt(CuVector{Float64}, out1) @@ -2049,7 +2028,7 @@ const EURcu = CuSnpArray{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale= └ @ Core :-1 - 18.361 ms (253 allocations: 7.69 KiB) + 18.204 ms (253 allocations: 7.69 KiB) @@ -2057,7 +2036,7 @@ const EURcu = CuSnpArray{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale= @btime mul!($out2_d, transpose($EURcu), $v1_d); ``` - 626.435 μs (465 allocations: 15.00 KiB) + 544.722 μs (465 allocations: 15.00 KiB) The operations are parallelized along the output dimension, hence the GPU was not fully utilized in the first case. With 100-time larger data, 30 to 50-fold speedup were observed for both cases with Nvidia Titan V. See linear algebra page for more information. @@ -2072,7 +2051,7 @@ norm(collect(EURcu' * v1_d) - EURbm' * v1) - 4.38122146476498e-11 + 2.2306544292754367e-11 @@ -2600,9 +2579,9 @@ We can merge the splitted dictionary back into one SnpData using `merge_plink`. merged = SnpArrays.merge_plink("tmp.merged", splitted) # write_plink is included here ``` - 0.066388 seconds (117.18 k allocations: 7.904 MiB) - 0.049178 seconds (125.97 k allocations: 11.520 MiB) - 0.051638 seconds (134.18 k allocations: 8.737 MiB) + 0.067004 seconds (117.20 k allocations: 7.905 MiB) + 0.102031 seconds (125.97 k allocations: 11.520 MiB, 51.01% gc time) + 0.050454 seconds (134.18 k allocations: 8.737 MiB) @@ -2645,10 +2624,10 @@ You can also merge the plink formatted files based on their common prefix. 
merged_from_splitted_files = merge_plink("tmp.split.chr"; des = "tmp.merged.2") ``` - 0.143816 seconds (783.53 k allocations: 36.258 MiB) - 0.575327 seconds (1.19 M allocations: 61.140 MiB) - 0.003704 seconds (8 allocations: 4.897 MiB) - 0.000827 seconds (8 allocations: 1.650 MiB) + 0.146038 seconds (783.53 k allocations: 36.252 MiB) + 0.568753 seconds (1.19 M allocations: 61.158 MiB) + 0.003368 seconds (8 allocations: 4.897 MiB) + 0.001259 seconds (8 allocations: 1.650 MiB) @@ -2750,12 +2729,12 @@ mouse_toreorder │ Row │ fid │ iid │ father │ mother │ sex │ phenotype │ │ │ Abstrac… │ AbstractS… │ AbstractStr… │ AbstractStr… │ Abstrac… │ Abstract… │ ├─────┼──────────┼────────────┼──────────────┼──────────────┼──────────┼───────────┤ - │ 1 │ 1_5 │ A053629514 │ F5.3:E2.2(6) │ F5.3:A3.2(5) │ 1 │ -9 │ - │ 2 │ 1_18 │ A067077564 │ G1.5:B5.4(5) │ G1.5:F5.4(5) │ 1 │ -9 │ - │ 3 │ 1_2 │ A048029086 │ G1.3:B1.2(3) │ G1.3:F3.2(3) │ 2 │ -9 │ - │ 4 │ 1_1 │ A048097274 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │ - │ 5 │ 1_15 │ A063796366 │ H2.5:C3.4(3) │ H2.5:G4.4(2) │ 2 │ -9 │ - │ 6 │ 1_45 │ A084280094 │ 6.6:F1.5(6) │ 6.6:G1.5(1) │ 1 │ -9 │ + │ 1 │ 1_3 │ A067289112 │ E2.4:H3.3(2) │ E2.4:D4.3(2) │ 1 │ -9 │ + │ 2 │ 1_1 │ A048031769 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │ + │ 3 │ 1_2 │ A084286354 │ H4.4:C2.3(5) │ H4.4:G3.3(7) │ 1 │ -9 │ + │ 4 │ 1_3 │ A048072266 │ G5.2:B5.1(4) │ G5.2:F5.1(2) │ 2 │ -9 │ + │ 5 │ 1_3 │ A063311623 │ A5.4:D4.3(5) │ A5.4:H1.3(6) │ 1 │ -9 │ + │ 6 │ 1_54 │ A063011268 │ B3.5:E4.4(2) │ B3.5:A3.4(4) │ 1 │ -9 │ …, srcbed: mouse_reorder.bed srcbim: mouse_reorder.bim diff --git a/docs/src/linalg.md b/docs/src/linalg.md index 3282f509..fc2457da 100644 --- a/docs/src/linalg.md +++ b/docs/src/linalg.md @@ -37,25 +37,25 @@ const EUR = SnpArray(SnpArrays.datadir("EUR_subset.bed")); Let's try with EUR data repeated 100 and 101 times: 37900 by 54051 and 38279 by 54051, respectively. 
- ```julia EUR_10 = [EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR] EUR_100 = [EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10]; EUR_101 = [EUR_100; EUR]; ``` - We create instnaces of SnpLinAlg, SnpBitmatrix and CuSnpArray: ```julia EUR_100_bm = SnpBitMatrix{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false) EUR_100_sla = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false) -EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=true, scale=true); +EUR_100_sla_ = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false) +EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=false, scale=false); EUR_101_bm = SnpBitMatrix{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false) EUR_101_sla = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false) -EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=true, scale=true); +EUR_101_sla_ = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false, impute=false) +EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=false, scale=false); ``` @@ -63,6 +63,7 @@ EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=tru ENV["JULIA_CUDA_USE_BINARYBUILDER"] = "false" using CUDA EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false); +EUR_100_cu_ = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false); ``` ┌ Warning: `haskey(::TargetIterator, name::String)` is deprecated, use `Target(; name = name) !== nothing` instead. 
@@ -81,6 +82,7 @@ using BenchmarkTools ```julia v1 = randn(size(EUR_100, 1)) +v1_ = randn(size(EUR_100, 1)) v2 = randn(size(EUR_100, 2)); ``` @@ -99,12 +101,12 @@ BLAS.set_num_threads(8) memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 321.207 ms (0.00% GC) - median time: 347.112 ms (0.00% GC) - mean time: 375.497 ms (0.00% GC) - maximum time: 769.390 ms (0.00% GC) + minimum time: 383.853 ms (0.00% GC) + median time: 417.986 ms (0.00% GC) + mean time: 431.205 ms (0.00% GC) + maximum time: 530.847 ms (0.00% GC) -------------- - samples: 15 + samples: 12 evals/sample: 1 @@ -124,17 +126,17 @@ BLAS.set_num_threads(1) memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 1.745 s (0.00% GC) - median time: 1.905 s (0.00% GC) - mean time: 2.144 s (0.00% GC) - maximum time: 2.782 s (0.00% GC) + minimum time: 1.849 s (0.00% GC) + median time: 2.064 s (0.00% GC) + mean time: 2.047 s (0.00% GC) + maximum time: 2.229 s (0.00% GC) -------------- samples: 3 evals/sample: 1 -Direct linear algebra on a SnpArray: +Direct linear algebra on a SnpArray, with mean imputation: ```julia @@ -145,20 +147,46 @@ Direct linear algebra on a SnpArray: BenchmarkTools.Trial: - memory estimate: 8.23 KiB - allocs estimate: 158 + memory estimate: 38.33 KiB + allocs estimate: 1616 + -------------- + minimum time: 1.241 s (0.00% GC) + median time: 1.355 s (0.00% GC) + mean time: 1.352 s (0.00% GC) + maximum time: 1.457 s (0.00% GC) + -------------- + samples: 4 + evals/sample: 1 + + + +With zero imputation: + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_100_sla_, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 38.33 KiB + allocs estimate: 1616 -------------- - minimum time: 1.014 s (0.00% GC) - median time: 1.021 s (0.00% GC) - mean time: 1.020 s (0.00% GC) - maximum time: 1.027 s (0.00% GC) + minimum time: 1.017 s (0.00% GC) + median time: 1.037 s (0.00% GC) + mean time: 1.034 s (0.00% GC) + maximum time: 1.045 s (0.00% GC) 
-------------- samples: 5 evals/sample: 1 -The below is the benchmark for SnpBitMatrix: +Indeed, we are paying some price for mean imputation. + +The below is the benchmark for SnpBitMatrix (always zero-imputed): ```julia @@ -172,16 +200,90 @@ The below is the benchmark for SnpBitMatrix: memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 1.041 s (0.00% GC) - median time: 1.056 s (0.00% GC) - mean time: 1.058 s (0.00% GC) - maximum time: 1.076 s (0.00% GC) + minimum time: 1.045 s (0.00% GC) + median time: 1.051 s (0.00% GC) + mean time: 1.051 s (0.00% GC) + maximum time: 1.058 s (0.00% GC) + -------------- + samples: 5 + evals/sample: 1 + + + +At first glance, the result from SnpBitMatrix might look better than SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8. + + +```julia +v1 = randn(size(EUR_101, 1)) +v2 = randn(size(EUR_101, 2)); +``` + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 44.13 KiB + allocs estimate: 1722 + -------------- + minimum time: 1.259 s (0.00% GC) + median time: 1.268 s (0.00% GC) + mean time: 1.268 s (0.00% GC) + maximum time: 1.277 s (0.00% GC) + -------------- + samples: 4 + evals/sample: 1 + + + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla_, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 44.13 KiB + allocs estimate: 1722 + -------------- + minimum time: 1.038 s (0.00% GC) + median time: 1.043 s (0.00% GC) + mean time: 1.043 s (0.00% GC) + maximum time: 1.051 s (0.00% GC) -------------- samples: 5 evals/sample: 1 + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 0 bytes + allocs estimate: 0 + -------------- + minimum time: 1.544 s (0.00% GC) + median time: 1.561 s (0.00% GC) + mean time: 1.561 s (0.00% GC) + maximum time: 1.577 s (0.00% GC) + -------------- + samples: 4 + 
evals/sample: 1 + + + Now let's try CUDA. The device is Nvidia Titan V. @@ -193,7 +295,11 @@ Moving data to GPU: ```julia +v1 = randn(size(EUR_100, 1)) +v1_ = randn(size(EUR_100, 1)) +v2 = randn(size(EUR_100, 2)); v1_d = adapt(CuArray{Float64}, v1) +v1_d_ = similar(v1_d) v2_d = adapt(CuArray{Float64}, v2); ``` @@ -215,27 +321,37 @@ using BenchmarkTools memory estimate: 3.28 KiB allocs estimate: 130 -------------- - minimum time: 22.138 ms (0.00% GC) - median time: 22.356 ms (0.00% GC) - mean time: 22.352 ms (0.00% GC) - maximum time: 23.825 ms (0.00% GC) + minimum time: 22.319 ms (0.00% GC) + median time: 22.429 ms (0.00% GC) + mean time: 22.722 ms (0.00% GC) + maximum time: 26.938 ms (0.00% GC) -------------- - samples: 224 + samples: 220 evals/sample: 1 -The speedup is obvious. Let's check correctness: +For CuSnpArray, the additional cost for mean imputation is negligible. ```julia -isapprox(collect(v1_d), v1) +@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d_, $EUR_100_cu_, $v2_d) ``` - true + BenchmarkTools.Trial: + memory estimate: 3.28 KiB + allocs estimate: 130 + -------------- + minimum time: 22.243 ms (0.00% GC) + median time: 22.431 ms (0.00% GC) + mean time: 22.805 ms (0.00% GC) + maximum time: 57.050 ms (0.00% GC) + -------------- + samples: 220 + evals/sample: 1 @@ -256,67 +372,27 @@ EUR_100_mat_d = adapt(CuArray, EUR_100_mat); memory estimate: 2.58 KiB allocs estimate: 85 -------------- - minimum time: 78.002 ms (0.00% GC) - median time: 80.142 ms (0.00% GC) - mean time: 80.055 ms (0.00% GC) - maximum time: 83.111 ms (0.00% GC) + minimum time: 75.951 ms (0.00% GC) + median time: 76.516 ms (0.00% GC) + mean time: 77.909 ms (0.00% GC) + maximum time: 82.096 ms (0.00% GC) -------------- - samples: 63 + samples: 65 evals/sample: 1 -Using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU. - -At first glance, the result from SnpBitMatrix might look similar to SnpLinAlg. 
However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8. +The speedup is obvious, CuSnpArrays is 30-50x faster than on CPU, and using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU. ```julia -v1 = randn(size(EUR_101, 1)) -v2 = randn(size(EUR_101, 2)); -``` - - -```julia -@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2) +isapprox(v1_d, v1_d_) ``` - BenchmarkTools.Trial: - memory estimate: 14.03 KiB - allocs estimate: 264 - -------------- - minimum time: 1.022 s (0.00% GC) - median time: 1.024 s (0.00% GC) - mean time: 1.024 s (0.00% GC) - maximum time: 1.028 s (0.00% GC) - -------------- - samples: 5 - evals/sample: 1 - - - - -```julia -@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2) -``` - - - - - BenchmarkTools.Trial: - memory estimate: 0 bytes - allocs estimate: 0 - -------------- - minimum time: 1.187 s (0.00% GC) - median time: 1.188 s (0.00% GC) - mean time: 1.191 s (0.00% GC) - maximum time: 1.197 s (0.00% GC) - -------------- - samples: 5 - evals/sample: 1 + true @@ -342,12 +418,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 662.131 ms (0.00% GC) - median time: 678.294 ms (0.00% GC) - mean time: 701.543 ms (0.00% GC) - maximum time: 790.350 ms (0.00% GC) + minimum time: 842.187 ms (0.00% GC) + median time: 882.157 ms (0.00% GC) + mean time: 913.111 ms (0.00% GC) + maximum time: 1.049 s (0.00% GC) -------------- - samples: 8 + samples: 6 evals/sample: 1 @@ -364,12 +440,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 605.351 ms (0.00% GC) - median time: 618.706 ms (0.00% GC) - mean time: 620.553 ms (0.00% GC) - maximum time: 658.692 ms (0.00% GC) + minimum time: 618.434 ms (0.00% GC) + median time: 630.293 ms (0.00% GC) + mean time: 689.063 ms (0.00% GC) + maximum time: 881.782 ms (0.00% GC) -------------- 
- samples: 9 + samples: 8 evals/sample: 1 @@ -386,12 +462,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 3.08 KiB allocs estimate: 118 -------------- - minimum time: 26.926 ms (0.00% GC) - median time: 27.330 ms (0.00% GC) - mean time: 27.494 ms (0.00% GC) - maximum time: 31.492 ms (0.00% GC) + minimum time: 26.710 ms (0.00% GC) + median time: 26.930 ms (0.00% GC) + mean time: 27.350 ms (0.00% GC) + maximum time: 32.306 ms (0.00% GC) -------------- - samples: 182 + samples: 183 evals/sample: 1 @@ -426,10 +502,32 @@ v2 = randn(size(EUR_101, 2)); memory estimate: 128 bytes allocs estimate: 3 -------------- - minimum time: 667.369 ms (0.00% GC) - median time: 681.264 ms (0.00% GC) - mean time: 697.866 ms (0.00% GC) - maximum time: 789.988 ms (0.00% GC) + minimum time: 850.050 ms (0.00% GC) + median time: 862.818 ms (0.00% GC) + mean time: 953.739 ms (0.00% GC) + maximum time: 1.391 s (0.00% GC) + -------------- + samples: 6 + evals/sample: 1 + + + + +```julia +@benchmark LinearAlgebra.mul!($v2, transpose($EUR_101_sla_), $v1) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 128 bytes + allocs estimate: 3 + -------------- + minimum time: 673.269 ms (0.00% GC) + median time: 693.418 ms (0.00% GC) + mean time: 700.729 ms (0.00% GC) + maximum time: 784.130 ms (0.00% GC) -------------- samples: 8 evals/sample: 1 @@ -448,10 +546,10 @@ v2 = randn(size(EUR_101, 2)); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 612.441 ms (0.00% GC) - median time: 624.015 ms (0.00% GC) - mean time: 641.973 ms (0.00% GC) - maximum time: 770.845 ms (0.00% GC) + minimum time: 620.830 ms (0.00% GC) + median time: 637.698 ms (0.00% GC) + mean time: 656.708 ms (0.00% GC) + maximum time: 798.543 ms (0.00% GC) -------------- samples: 8 evals/sample: 1 diff --git a/src/cuda.jl b/src/cuda.jl index fa2ab4a0..973425b8 100644 --- a/src/cuda.jl +++ b/src/cuda.jl @@ -6,16 +6,29 @@ struct CuSnpArray{T} <: AbstractMatrix{UInt8} model::Union{Val{1}, Val{2}, 
Val{3}} center::Bool scale::Bool + impute::Bool μ::CuVector{T} σinv::CuVector{T} storagev1::CuVector{T} storagev2::CuVector{T} end +""" + CuSnpArray{T}(s; model=ADDITIVE_MODEL, center=false, scale=false, impute=true) + +Copy a `SnpArray` to a CUDA GPU to perform linear algebra operations. + +# Arguments +- s: a `SnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). +- impute: whether to impute missing value with column mean (default: true). +""" function CuSnpArray{T}(s::SnpArray; model = ADDITIVE_MODEL, center::Bool = false, - scale::Bool = false) where T <: AbstractFloat + scale::Bool = false, impute::Bool = false) where T <: AbstractFloat data = adapt(CuMatrix{UInt8}, s.data) if model == ADDITIVE_MODEL @@ -73,7 +86,7 @@ function CuSnpArray{T}(s::SnpArray; σinv = adapt(CuVector{T}, σinv) storagev1 = CuVector{T}(undef, size(s, 1)) storagev2 = CuVector{T}(undef, size(s, 2)) - CuSnpArray{T}(data, s.m, model, center, scale, μ, σinv, storagev1, storagev2) + CuSnpArray{T}(data, s.m, model, center, scale, impute, μ, σinv, storagev1, storagev2) end Base.size(s::CuSnpArray) = s.m, size(s.data, 2) @@ -175,6 +188,107 @@ function _snparray_cuda_atx_recessive!(out, s, v) end end +function _snparray_cuda_ax_additive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij >= 3) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_ax_dominant_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = 
blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_ax_recessive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 3) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_additive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij >= 3) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_dominant_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_recessive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = 
zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 3) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +""" + LinearAlgebra.mul!(out::CuVector{T}, s::CuSnpArray{T}, v::CuVector{T}) + +In-place matrix-vector multiplication on a GPU. +""" function mul!( out::CuVector{T}, s::CuSnpArray{T}, @@ -205,6 +319,11 @@ function mul!( end end +""" + LinearAlgebra.mul!(out::CuVector{T}, s::Union{Transpose{T, CuSnpArray{T}}, Adjoint{T, CuSnpArray{T}}}, v::CuVector{T}) + +In-place matrix-vector multiplication on a GPU, with transposed CuSnpArray. +""" function mul!( out::CuVector{T}, st::Union{Transpose{T, CuSnpArray{T}}, Adjoint{T, CuSnpArray{T}}}, @@ -214,12 +333,22 @@ function mul!( s = st.parent numblocks = ceil(Int, length(out)/256) CUDA.@sync begin - if s.model == ADDITIVE_MODEL - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive!(out, s.data, v) - elseif s.model == DOMINANT_MODEL - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant!(out, s.data, v) + if !s.impute + if s.model == ADDITIVE_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive!(out, s.data, v) + elseif s.model == DOMINANT_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant!(out, s.data, v) + else + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive!(out, s.data, v) + end else - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive!(out, s.data, v) + if s.model == ADDITIVE_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive_meanimpute!(out, s.data, v, s.μ) + elseif s.model == DOMINANT_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant_meanimpute!(out, s.data, v, s.μ) + else + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive_meanimpute!(out, s.data, v, s.μ) + end end end diff --git a/src/linalg_bitmatrix.jl b/src/linalg_bitmatrix.jl index 
6c4ca4e6..e83794ae 100644 --- a/src/linalg_bitmatrix.jl +++ b/src/linalg_bitmatrix.jl @@ -10,6 +10,17 @@ struct SnpBitMatrix{T} <: AbstractMatrix{T} storagev2::Vector{T} end +""" + SnpBitMatrix{T}(s; model=ADDITIVE_MODEL, center=false, scale=false) + +Store an `AbstractSnpArray` `s` as two `BitMatrix`es to simplify linear algebra. + +# Arguments +- s: an `AbstractSnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). +""" function SnpBitMatrix{T}( s::AbstractSnpArray; model = ADDITIVE_MODEL, @@ -144,6 +155,11 @@ function mul!(c::Vector{T}, A::Union{Transpose{Bool,BitArray{2}}, Adjoint{Bool, end end +""" + LinearAlgebra.mul!(out, s::SnpBitMatrix, v) + +In-place matrix-vector multiplication. +""" function mul!( out::AbstractVector{T}, s::SnpBitMatrix{T}, @@ -169,6 +185,11 @@ function mul!( end end +""" + LinearAlgebra.mul!(out, s::Union{Transpose{T, SnpBitMatrix{T}}, Adjoint{T, SnpBitMatrix{T}}}, v) + +In-place matrix-vector multiplication, with transposed `SnpBitMatrix`. +""" function mul!( out::AbstractVector{T}, st::Union{Transpose{T, SnpBitMatrix{T}}, Adjoint{T, SnpBitMatrix{T}}}, diff --git a/src/linalg_direct.jl b/src/linalg_direct.jl index 228691c6..25e46876 100644 --- a/src/linalg_direct.jl +++ b/src/linalg_direct.jl @@ -3,17 +3,31 @@ struct SnpLinAlg{T} <: AbstractMatrix{T} model::Union{Val{1}, Val{2}, Val{3}} center::Bool scale::Bool + impute::Bool μ::Vector{T} σinv::Vector{T} storagev1::Vector{T} storagev2::Vector{T} end +""" + SnpLinAlg{T}(s; model=ADDITIVE_MODEL, center=false, scale=false, impute=true) + +Pad a `SnpArray` with some parameters for linear algebra operation. + +# Arguments +- s: a `SnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). 
+- impute: whether to impute missing value with column mean (default: true). +""" function SnpLinAlg{T}( s::AbstractSnpArray; model = ADDITIVE_MODEL, center::Bool = false, - scale::Bool = false) where T <: AbstractFloat + scale::Bool = false, + impute::Bool = true) where T <: AbstractFloat if model == ADDITIVE_MODEL if center || scale μ = Vector{T}(undef, size(s, 2)) @@ -67,7 +81,7 @@ function SnpLinAlg{T}( end storagev1 = Vector{T}(undef, size(s, 1)) storagev2 = Vector{T}(undef, size(s, 2)) - SnpLinAlg{T}(s, model, center, scale, μ, σinv, storagev1, storagev2) + SnpLinAlg{T}(s, model, center, scale, impute, μ, σinv, storagev1, storagev2) end Base.size(sla::SnpLinAlg) = size(sla.s) @@ -86,7 +100,8 @@ function _snparray_ax_additive_rem!(out, s, v) end end end -function _snparray_ax_additive!(out, s, v, rows, cols) + +function _snparray_ax_additive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -108,10 +123,6 @@ function _snparray_ax_additive!(out, s, v, rows, cols) out end -function _snparray_ax_additive!(out, s, v) - _snparray_ax_additive!(out, s, v, length(out), length(v)) -end - function _snparray_ax_dominant_rem!(out, s, v) maxp = length(out) @avx for j in eachindex(v) @@ -122,7 +133,8 @@ function _snparray_ax_dominant_rem!(out, s, v) end end end -function _snparray_ax_dominant!(out, s, v, rows, cols) + +function _snparray_ax_dominant!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -144,10 +156,6 @@ function _snparray_ax_dominant!(out, s, v, rows, cols) out end -function _snparray_ax_dominant!(out, s, v) - _snparray_ax_dominant!(out, s, v, length(out), length(v)) -end - function _snparray_ax_recessive_rem!(out, s, v) maxp = length(out) @avx for j in eachindex(v) @@ -158,7 +166,8 @@ function _snparray_ax_recessive_rem!(out, s, v) end end end -function _snparray_ax_recessive!(out, s, v, rows, cols) + +function _snparray_ax_recessive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k 
= rows >> 2 rem = rows & 3 @@ -180,22 +189,127 @@ function _snparray_ax_recessive!(out, s, v, rows, cols) out end -function _snparray_ax_recessive!(out, s, v) - _snparray_ax_recessive!(out, s, v, length(out), length(v)) +function _snparray_ax_additive_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_additive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_additive_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out +end + +function _snparray_ax_dominant_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_dominant_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_dominant_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out +end + +function _snparray_ax_recessive_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_recessive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows 
>> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_recessive_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out end -function _snparray_ax_tile!(c, A, b, model) +function _snparray_ax_tile!(c, A, b, model, μ, impute) vstep = 1024 hstep = 1024 vstep_log2 = 10 hstep_log2 = 10 - if model == ADDITIVE_MODEL - _ftn! = _snparray_ax_additive! - elseif model == DOMINANT_MODEL - _ftn! = _snparray_ax_dominant! + if !impute + if model == ADDITIVE_MODEL + _ftn! = _snparray_ax_additive! + elseif model == DOMINANT_MODEL + _ftn! = _snparray_ax_dominant! + else + _ftn! = _snparray_ax_recessive! + end else - _ftn! = _snparray_ax_recessive! + if model == ADDITIVE_MODEL + _ftn! = _snparray_ax_additive_meanimpute! + elseif model == DOMINANT_MODEL + _ftn! = _snparray_ax_dominant_meanimpute! + else + _ftn! = _snparray_ax_recessive_meanimpute! 
+ end end fill!(c, zero(UInt8)) @@ -213,7 +327,7 @@ function _snparray_ax_tile!(c, A, b, model) gesp(stridedpointer(A), (vstep*m, hstep*n)), gesp(stridedpointer(b), (hstep*n,)), vstep << 2, - hstep + hstep, μ ) end if Mrem != 0 @@ -221,61 +335,67 @@ function _snparray_ax_tile!(c, A, b, model) @view(A[vstep*(Miter)+1:end, hstep*n+1:hstep*(n+1)]), @view(b[hstep*n+1:hstep*(n+1)]), length(c) - 4*vstep*Miter, - hstep + hstep, μ ) end end if Nrem != 0 _ftn!(c, @view(A[:, (hstep*Niter+1):end]), @view(b[hstep*Niter+1:end]), - length(c), Nrem + length(c), Nrem, μ ) end end -function mul!( - out::AbstractVector{T}, - sla::SnpLinAlg{T}, - v::AbstractVector{T}) where T <: AbstractFloat - @assert length(out) == size(sla, 1) && length(v) == size(sla, 2) - if sla.scale - sla.storagev2 .= sla.σinv .* v - w = sla.storagev2 - else - w = v +function _snparray_atx_additive_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 3)) * v[p] + end end - fill!(out, zero(eltype(out))) - s = sla.s +end - _snparray_ax_tile!(out, s.data, w, sla.model) - # if sla.model == ADDITIVE_MODEL - # _snparray_ax_additive!(out, s.data, w) - # elseif sla.model == DOMINANT_MODEL - # _snparray_ax_dominant!(out, s.data, w) - # else - # _snparray_ax_recessive!(out, s.data, w) - # end +function _snparray_atx_additive!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 3)) * v[4*(l-1)+p] + end - if sla.center - return out .-= dot(sla.μ, w) - else - return out + end + end + if rem != 0 + _snparray_atx_additive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end + out end +function _snparray_atx_additive!(out, s, v, μ) + _snparray_atx_additive!(out, s, v, length(v), length(out), μ) +end -function 
_snparray_atx_additive_rem!(out, s, v) +function _snparray_atx_dominant_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += ((Aij >= 2) + (Aij == 3)) * v[p] + out[i] += (Aij >= 2) * v[p] end end end -function _snparray_atx_additive!(out, s, v, rows, cols) +function _snparray_atx_dominant!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -286,33 +406,33 @@ function _snparray_atx_additive!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += ((Aij >= 2) + (Aij == 3)) * v[4*(l-1)+p] + out[i] += (Aij >= 2) * v[4*(l-1)+p] end end end if rem != 0 - _snparray_atx_additive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_dominant_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function _snparray_atx_additive!(out, s, v) - _snparray_atx_additive!(out, s, v, length(v), length(out)) +function _snparray_atx_dominant!(out, s, v, μ) + _snparray_atx_dominant!(out, s, v, length(v), length(out), μ) end -function _snparray_atx_dominant_rem!(out, s, v) +function _snparray_atx_recessive_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += (Aij >= 2) * v[p] + out[i] += (Aij == 3) * v[p] end end end -function _snparray_atx_dominant!(out, s, v, rows, cols) +function _snparray_atx_recessive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -323,33 +443,35 @@ function _snparray_atx_dominant!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += (Aij >= 2) * v[4*(l-1)+p] + out[i] += (Aij == 3) * v[4*(l-1)+p] end end + end if rem != 0 - _snparray_atx_dominant_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_recessive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function _snparray_atx_dominant!(out, s, v) - 
_snparray_atx_dominant!(out, s, v, length(v), length(out)) +function _snparray_atx_recessive!(out, s, v, μ) + _snparray_atx_recessive!(out, s, v, length(v), length(out), μ) end -function _snparray_atx_recessive_rem!(out, s, v) + +function _snparray_atx_additive_meanimpute_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += (Aij == 3) * v[p] + out[i] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[i]) * v[p] end end end -function _snparray_atx_recessive!(out, s, v, rows, cols) +function _snparray_atx_additive_meanimpute!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -360,22 +482,129 @@ function _snparray_atx_recessive!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += (Aij == 3) * v[4*(l-1)+p] + out[i] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] end end + end + if rem != 0 + _snparray_atx_additive_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) + end + out +end + +function _snparray_atx_additive_meanimpute!(out, s, v, μ) + _snparray_atx_additive_meanimpute!(out, s, v, length(v), length(out), μ) +end + +function _snparray_atx_dominant_meanimpute_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[p] + end + end +end +function _snparray_atx_dominant_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] + end + + end end if rem != 0 - _snparray_atx_recessive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_dominant_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function 
_snparray_atx_recessive!(out, s, v) - _snparray_atx_recessive!(out, s, v, length(v), length(out)) +function _snparray_atx_dominant_meanimpute!(out, s, v, μ) + _snparray_atx_dominant_meanimpute!(out, s, v, length(v), length(out), μ) end +function _snparray_atx_recessive_meanimpute_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij == 3) + (Aij == 1) * μ[i]) * v[p] + end + end +end + +function _snparray_atx_recessive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij == 3) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] + end + + end + + end + if rem != 0 + _snparray_atx_recessive_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) + end + out +end + +function _snparray_atx_recessive_meanimpute!(out, s, v, μ) + _snparray_atx_recessive_meanimpute!(out, s, v, length(v), length(out), μ) +end + +""" + LinearAlgebra.mul!(out, s::SnpLinAlg, v) + +In-place matrix-vector multiplication. +""" +function mul!( + out::AbstractVector{T}, + sla::SnpLinAlg{T}, + v::AbstractVector{T}) where T <: AbstractFloat + @assert length(out) == size(sla, 1) && length(v) == size(sla, 2) + if sla.scale + sla.storagev2 .= sla.σinv .* v + w = sla.storagev2 + else + w = v + end + fill!(out, zero(eltype(out))) + s = sla.s + + _snparray_ax_tile!(out, s.data, w, sla.model, sla.μ, sla.impute) + + if sla.center + return out .-= dot(sla.μ, w) + else + return out + end +end + +""" + LinearAlgebra.mul!(out, s::Union{Transpose{T, SnpLinAlg{T}}, Adjoint{T, SnpLinAlg{T}}}, v) + +In-place matrix-vector multiplication, with transposed `SnpLinAlg`. 
+""" function mul!( out::AbstractVector{T}, st::Union{Transpose{T, SnpLinAlg{T}}, Adjoint{T, SnpLinAlg{T}}}, @@ -384,13 +613,25 @@ function mul!( sla = st.parent s = sla.s fill!(out, zero(eltype(out))) - if sla.model == ADDITIVE_MODEL - _snparray_atx_additive!(out, s.data, v) - elseif sla.model == DOMINANT_MODEL - _snparray_atx_dominant!(out, s.data, v) + if !sla.impute + if sla.model == ADDITIVE_MODEL + _ftn! = _snparray_atx_additive! + elseif sla.model == DOMINANT_MODEL + _ftn! = _snparray_atx_dominant! + else + _ftn! = _snparray_atx_recessive! + end else - _snparray_atx_recessive!(out, s.data, v) - end + if sla.model == ADDITIVE_MODEL + _ftn! = _snparray_atx_additive_meanimpute! + elseif sla.model == DOMINANT_MODEL + _ftn! = _snparray_atx_dominant_meanimpute! + else + _ftn! = _snparray_atx_recessive_meanimpute! + end + end + + _ftn!(out, s.data, v, sla.μ) if sla.center out .-= sum(v) .* sla.μ end