diff --git a/docs/SnpArrays.ipynb b/docs/SnpArrays.ipynb index e80e9435..5cb07a4d 100644 --- a/docs/SnpArrays.ipynb +++ b/docs/SnpArrays.ipynb @@ -266,7 +266,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 87.555 μs (64 allocations: 390.03 KiB)\n" + " 94.032 μs (64 allocations: 390.03 KiB)\n" ] } ], @@ -647,11 +647,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x00 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 19, @@ -679,12 +679,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x00 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 20, @@ -705,11 +704,11 @@ "data": { "text/plain": [ "5×3 SnpArray:\n", - " 0x02 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00\n", - " 0x00 0x00 0x00" + " 0x02 0x01 0x01\n", + " 0x00 0x03 0x01\n", + " 0x01 0x03 0x02\n", + " 0x02 0x01 0x00\n", + " 0x00 0x00 0x03" ] }, "execution_count": 21, @@ -736,11 +735,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## `convert` and `copyto!`\n", + "### `convert` and `copyto!`\n", "\n", "Most common usage of SnpArray is to convert genotypes to numeric values for statistical analysis. Conversion rule depends on genetic models (additive, dominant, or recessive), centering, scaling, or imputation.\n", "\n", - "### `convert`\n", + "#### `convert`\n", "\n", "`convert` function has 4 keyword arguments: `model`, `center`, `scale`, and `impute`.\n", "\n", @@ -1068,7 +1067,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### `copyto!`\n", + "#### `copyto!`\n", "\n", "`copyto!` is the in-place version of `convert`. 
It takes the same keyword arguments (`model`, `center`, `scale`, `impute`) as `convert`." ] @@ -1136,7 +1135,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 3.719 μs (0 allocations: 0 bytes)\n" + " 3.655 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1207,7 +1206,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 7.138 μs (0 allocations: 0 bytes)\n" + " 7.205 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1278,7 +1277,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 6.860 μs (0 allocations: 0 bytes)\n" + " 6.856 μs (0 allocations: 0 bytes)\n" ] } ], @@ -1303,7 +1302,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 50.433 ms (10150 allocations: 475.78 KiB)\n" + " 50.329 ms (10150 allocations: 475.78 KiB)\n" ] } ], @@ -1333,7 +1332,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 64.308 ms (0 allocations: 0 bytes)\n" + " 64.344 ms (0 allocations: 0 bytes)\n" ] } ], @@ -1346,9 +1345,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Summaries\n", + "### Summaries\n", "\n", - "### Counts\n", + "#### Counts\n", "\n", "Counts of each the four possible values for each column are returned by `counts`.`" ] @@ -1396,7 +1395,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 6.047 ns (0 allocations: 0 bytes)\n" + " 6.043 ns (0 allocations: 0 bytes)\n" ] } ], @@ -1408,7 +1407,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Minor allele frequencies\n", + "#### Minor allele frequencies\n", "\n", "Minor allele frequencies (MAF) for each SNP." ] @@ -1516,7 +1515,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### `mean` and `var`\n", + "#### `mean` and `var`\n", "\n", "The package provides methods for the generics `mean` and `var` from the `Statistics` package." 
] @@ -1600,7 +1599,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 17.328 μs (2 allocations: 79.39 KiB)\n" + " 17.263 μs (2 allocations: 79.39 KiB)\n" ] } ], @@ -1665,7 +1664,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Missing rate\n", + "#### Missing rate\n", "\n", "Proportion of missing genotypes" ] @@ -1766,7 +1765,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Location of the missing values\n", + "#### Location of the missing values\n", "\n", "The positions of the missing data are evaluated by" ] @@ -1826,7 +1825,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 43.661 ms (19274 allocations: 1.80 MiB)\n" + " 43.843 ms (19274 allocations: 1.80 MiB)\n" ] } ], @@ -1894,7 +1893,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Genetic relationship matrix\n", + "### Genetic relationship matrix\n", "\n", "`grm` function computes the empirical kinship matrix using either the classical genetic relationship matrix, `grm(A, model=:GRM)`, or the method of moment method, `grm(A, model=:MoM)`, or the robust method, `grm(A, model=:Robust)`. \n", "\n", @@ -1958,7 +1957,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 502.287 ms (25 allocations: 28.95 MiB)\n" + " 505.403 ms (25 allocations: 28.95 MiB)\n" ] } ], @@ -2028,7 +2027,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 315.064 ms (26 allocations: 14.60 MiB)\n" + " 312.411 ms (26 allocations: 14.60 MiB)\n" ] } ], @@ -2148,7 +2147,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Filtering\n", + "### Filtering\n", "\n", "Before GWAS, we often need to filter SNPs and/or samples according to genotyping success rates, minor allele frequencies, and Hardy-Weinberg Equilibrium test. 
This can be achieved by the `filter` function.\n", "\n", @@ -2213,7 +2212,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 150.800 ms (11460 allocations: 171.28 MiB)\n" + " 148.931 ms (11460 allocations: 171.28 MiB)\n" ] } ], @@ -2235,7 +2234,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Filter Plink files\n", + "#### Filter Plink files\n", "\n", "Filter a set of Plink files according to row indices and column indices. By result, filtered Plink files are saved as `srcname.filtered.bed`, `srcname.filtered.fam`, and `srcname.filtered.bim`, where `srcname` is the source Plink file name. You can also specify destimation file name using keyword `des`." ] @@ -2507,15 +2506,10 @@ { "data": { "text/plain": [ - "7-element Array{String,1}:\n", + "3-element Array{String,1}:\n", " \"./tmp_hcat_arr_1.bed\"\n", " \"./tmp_hvcat_arr_1.bed\"\n", - " \"./tmp_vcat_arr_1.bed\"\n", - " \"./tmp_vcat_arr_2.bed\"\n", - " \"./tmp_vcat_arr_3.bed\"\n", - " \"./tmp_vcat_arr_4.bed\"\n", - " \"./tmp_vcat_arr_5.bed\"" - + " \"./tmp_vcat_arr_1.bed\"" ] }, "execution_count": 71, @@ -2629,12 +2623,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SnpBitMatrix\n", + "## Linear Algebra\n", "\n", - "In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved by the `SnpBitMatrix` type. The implementation assumes:\n", + "In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved in three different `struct`s:\n", "\n", - "1. the SnpArray does not have missing genotypes, and\n", - "2. the matrix corresponding to SnpArray is the matrix of A2 allele counts.\n", + "1. Direct operations on a plink-formatted `SnpArray`: `SnpLinAlg`\n", + "2. Operations on transformed `BitMatrix`es: `SnpBitMatrix`\n", + "3. 
Direct operations on a plink-formatted data on an Nvidia GPU: `CuSnpArray`.\n", + "\n", + "`SnpLinAlg` and `SnpBitMatrix` use Chris Elrod's [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) internally. It is much faster on machines with AVX support. `CuSnpArray` uses [CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/) internally.\n", + "\n", + "The implementation assumes that the matrix corresponding to SnpArray is the matrix of the A2 allele counts. `SnpLinAlg` and `CuSnpArray` impute any missing genotype with its column mean by default. They can also configured to impute missing genotypes with zero. `SnpBitMatrix` can only impute missing values with zero. \n", "\n", "### Constructor\n", "\n", @@ -2691,7 +2690,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To instantiate a SnpBitMatrix based on SnpArray," + "To instantiate a SnpLinAlg based on SnpArray," ] }, { @@ -2700,6 +2699,8 @@ "metadata": {}, "outputs": [], "source": [ + "const EURsla = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true);\n", + "const EURsla_ = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true, impute=false);\n", "const EURbm = SnpBitMatrix{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true);" ] }, @@ -2707,9 +2708,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpBitMatrix acts like a Float64 matrix.\n", + "The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpLinAlg acts like a Float64 matrix.\n", + "SnpLinAlg directly uses the SnpArray for computation. 
\n", "\n", - "The memory usage of the SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`." + "On the other hand, memory usage of SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`." ] }, { @@ -2720,7 +2722,7 @@ { "data": { "text/plain": [ - "(6876757, 6421960)" + "(6876757, 8177221, 6421960)" ] }, "execution_count": 78, @@ -2729,16 +2731,16 @@ } ], "source": [ - "Base.summarysize(EUR), Base.summarysize(EURbm)" + "Base.summarysize(EUR), Base.summarysize(EURsla), Base.summarysize(EURbm)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Linear algebra\n", + "### `mul!()`\n", "\n", - "A SnpBitMatrix acts similar to a regular matrix and responds to `size`, `eltype`, and SnpBitMatrix-vector multiplications." + "SnpLinAlg and SnpBitMatrix act similar to a regular matrix and responds to `size`, `eltype`, and matrix-vector multiplications." ] }, { @@ -2750,6 +2752,9 @@ "name": "stdout", "output_type": "stream", "text": [ + "size(EURsla) = (379, 54051)\n", + "eltype(EURsla) = Float64\n", + "typeof(EURsla) <: AbstractMatrix = true\n", "size(EURbm) = (379, 54051)\n", "eltype(EURbm) = Float64\n", "typeof(EURbm) <: AbstractMatrix = true\n" @@ -2757,6 +2762,10 @@ } ], "source": [ + "@show size(EURsla)\n", + "@show eltype(EURsla)\n", + "@show typeof(EURsla) <: AbstractMatrix;\n", + "\n", "@show size(EURbm)\n", "@show eltype(EURbm)\n", "@show typeof(EURbm) <: AbstractMatrix;" @@ -2766,31 +2775,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "SnpBitMatrix-vector multiplication is mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray." 
+ "Matrix-vector multiplications with SnpLinAlg and SnpBitMatrix are mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray." ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2.837775572500693e-11" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "using LinearAlgebra\n", "v1 = randn(size(EUR, 1))\n", "v2 = randn(size(EUR, 2))\n", - "A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true)\n", - "norm(EURbm * v2 - A * v2)" + "A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true);" ] }, { @@ -2801,7 +2798,7 @@ { "data": { "text/plain": [ - "5.399567080480323e-12" + "3.255157000354014e-11" ] }, "execution_count": 81, @@ -2810,14 +2807,7 @@ } ], "source": [ - "norm(EURbm' * v1 - A' * v1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this example, the Float64 matrix fits into memory so the SnpBitMatrix-vector multiplication is much slower than Matrix{Float64}-vector multiplication (highly optimized BLAS)." 
+ "norm(EURsla * v2 - A * v2)" ] }, { @@ -2826,17 +2816,18 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 8.495 ms (0 allocations: 0 bytes)\n" - ] + "data": { + "text/plain": [ + "5.398163705112499e-12" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "out1 = Vector{Float64}(undef, size(EUR, 1))\n", - "out2 = Vector{Float64}(undef, size(EUR, 2))\n", - "@btime(mul!($out1, $EURbm, $v2));" + "norm(EURsla' * v1 - A' *v1)" ] }, { @@ -2845,15 +2836,18 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 7.844 ms (0 allocations: 0 bytes)\n" - ] + "data": { + "text/plain": [ + "3.590044545281484e-11" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "@btime(mul!($out1, $A, $v2));" + "norm(EURbm * v2 - A * v2)" ] }, { @@ -2862,39 +2856,39 @@ "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - " 6.612 ms (1 allocation: 16 bytes)\n" - ] + "data": { + "text/plain": [ + "8.54121862883797e-12" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "@btime(mul!($out2, $transpose($EURbm), $v1));" + "norm(EURbm' * v1 - A' * v1)" ] }, { - "cell_type": "code", - "execution_count": 85, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 3.011 ms (0 allocations: 0 bytes)\n" - ] - } - ], "source": [ - "@btime(mul!($out2, $transpose($A), $v1));" + "See Linear Algebra page for performance comparison among BLAS, SnpLinAlg, and SnpBitMatrix. In general, SnpLinAlg and SnpBitMatrix operations are at least twice faster than the corresponding Matrix{Float64}-vector multiplication. 
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In another test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 folder faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory." + "In general, computing $Ax$ is more effective in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory, and can impute missing values with column means. See Linear Algebra page for more information." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In a test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 fold faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory." ] }, { @@ -2906,7 +2900,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -2916,7 +2910,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -2925,7 +2919,7 @@ "2600" ] }, - "execution_count": 87, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } @@ -2936,7 +2930,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 87, "metadata": {}, "outputs": [ { @@ -2957,16 +2951,16 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "8.987658251177042e-14" + "5.05574647736198e-14" ] }, - "execution_count": 89, + "execution_count": 88, "metadata": {}, "output_type": "execute_result" } @@ -2981,16 +2975,16 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1.6136185547316357e-14" + "1.4647267077134765e-14" ] }, - "execution_count": 90, + "execution_count": 89, "metadata": {}, 
"output_type": "execute_result" } @@ -3003,77 +2997,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Two alternatives: SnpLinAlg and CuSnpArray\n", - "\n", - "Basically, creating SnpBitMatrix doubles the memory usage. If it becomes an issue, direct linear algebra operations from a SnpArray is possible. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### SnpLinAlg" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [], - "source": [ - "const EURla = SnpLinAlg{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale=true);" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 6.880 ms (264 allocations: 14.03 KiB)\n" - ] - } - ], - "source": [ - "v1 = randn(size(EUR, 1))\n", - "v2 = randn(size(EUR, 2))\n", - "out1 = Vector{Float64}(undef, size(EUR, 1))\n", - "out2 = Vector{Float64}(undef, size(EUR, 2))\n", - "@btime mul!($out1, $EURla, $v2);" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 9.760 ms (3 allocations: 128 bytes)\n" - ] - } - ], - "source": [ - "@btime mul!($out2, transpose($EURla), $v1);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory. See Linear Algebra page for more information." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### CuSnpArray" + "### GPU support: CuSnpArray" ] }, { @@ -3085,7 +3009,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 90, "metadata": {}, "outputs": [ { @@ -3101,6 +3025,10 @@ "source": [ "ENV[\"JULIA_CUDA_USE_BINARYBUILDER\"] = \"false\" # will use local CUDA installation\n", "using CUDA, Adapt\n", + "out1 = randn(size(EUR, 1))\n", + "out2 = randn(size(EUR, 2))\n", + "v1 = randn(size(EUR, 1))\n", + "v2 = randn(size(EUR, 2))\n", "v1_d = adapt(CuVector{Float64}, v1) # sends data to GPU\n", "v2_d = adapt(CuVector{Float64}, v2)\n", "out1_d = adapt(CuVector{Float64}, out1)\n", @@ -3111,7 +3039,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 91, "metadata": {}, "outputs": [ { @@ -3127,7 +3055,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " 18.361 ms (253 allocations: 7.69 KiB)\n" + " 18.204 ms (253 allocations: 7.69 KiB)\n" ] } ], @@ -3137,14 +3065,14 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 626.435 μs (465 allocations: 15.00 KiB)\n" + " 544.722 μs (465 allocations: 15.00 KiB)\n" ] } ], @@ -3168,16 +3096,16 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4.38122146476498e-11" + "2.2306544292754367e-11" ] }, - "execution_count": 97, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -3204,7 +3132,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 94, "metadata": {}, "outputs": [ { @@ -3239,7 +3167,7 @@ ")" ] }, - "execution_count": 98, + "execution_count": 94, "metadata": {}, "output_type": "execute_result" } @@ -3259,7 +3187,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 95, "metadata": {}, "outputs": [ { @@ -3294,7 +3222,7 @@ 
")" ] }, - "execution_count": 99, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -3305,7 +3233,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 96, "metadata": {}, "outputs": [ { @@ -3340,7 +3268,7 @@ ")" ] }, - "execution_count": 100, + "execution_count": 96, "metadata": {}, "output_type": "execute_result" } @@ -3351,7 +3279,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 97, "metadata": {}, "outputs": [ { @@ -3386,7 +3314,7 @@ ")" ] }, - "execution_count": 101, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -3404,7 +3332,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 98, "metadata": {}, "outputs": [ { @@ -3439,7 +3367,7 @@ ")" ] }, - "execution_count": 102, + "execution_count": 98, "metadata": {}, "output_type": "execute_result" } @@ -3459,7 +3387,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -3474,7 +3402,7 @@ " \"18\" => SnpData(people: 379, snps: 12242,…" ] }, - "execution_count": 103, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -3492,7 +3420,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 100, "metadata": {}, "outputs": [ { @@ -3527,7 +3455,7 @@ ")" ] }, - "execution_count": 104, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -3538,7 +3466,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -3547,7 +3475,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 102, "metadata": {}, "outputs": [ { @@ -3558,7 +3486,7 @@ " \"2\" => SnpData(people: 201, snps: 54051,…" ] }, - "execution_count": 106, + "execution_count": 102, "metadata": {}, "output_type": "execute_result" } @@ -3578,7 +3506,7 @@ }, { "cell_type": "code", - "execution_count": 107, + 
"execution_count": 103, "metadata": {}, "outputs": [ { @@ -3613,7 +3541,7 @@ ")" ] }, - "execution_count": 107, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -3624,7 +3552,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -3659,7 +3587,7 @@ ")" ] }, - "execution_count": 108, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -3670,7 +3598,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 105, "metadata": {}, "outputs": [ { @@ -3705,7 +3633,7 @@ ")" ] }, - "execution_count": 109, + "execution_count": 105, "metadata": {}, "output_type": "execute_result" } @@ -3716,7 +3644,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 106, "metadata": {}, "outputs": [ { @@ -3751,7 +3679,7 @@ ")" ] }, - "execution_count": 110, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } @@ -3762,7 +3690,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 107, "metadata": {}, "outputs": [ { @@ -3797,7 +3725,7 @@ ")" ] }, - "execution_count": 111, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } @@ -3808,7 +3736,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 108, "metadata": {}, "outputs": [ { @@ -3843,7 +3771,7 @@ ")" ] }, - "execution_count": 112, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } @@ -3863,16 +3791,16 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 0.066388 seconds (117.18 k allocations: 7.904 MiB)\n", - " 0.049178 seconds (125.97 k allocations: 11.520 MiB)\n", - " 0.051638 seconds (134.18 k allocations: 8.737 MiB)\n" + " 0.067004 seconds (117.20 k allocations: 7.905 MiB)\n", + " 0.102031 seconds (125.97 k allocations: 11.520 MiB, 51.01% gc 
time)\n", + " 0.050454 seconds (134.18 k allocations: 8.737 MiB)\n" ] }, { @@ -3907,7 +3835,7 @@ ")" ] }, - "execution_count": 113, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } @@ -3925,17 +3853,17 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 0.143816 seconds (783.53 k allocations: 36.258 MiB)\n", - " 0.575327 seconds (1.19 M allocations: 61.140 MiB)\n", - " 0.003704 seconds (8 allocations: 4.897 MiB)\n", - " 0.000827 seconds (8 allocations: 1.650 MiB)\n" + " 0.146038 seconds (783.53 k allocations: 36.252 MiB)\n", + " 0.568753 seconds (1.19 M allocations: 61.158 MiB)\n", + " 0.003368 seconds (8 allocations: 4.897 MiB)\n", + " 0.001259 seconds (8 allocations: 1.650 MiB)\n" ] }, { @@ -3970,7 +3898,7 @@ ")" ] }, - "execution_count": 114, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -3990,7 +3918,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 111, "metadata": {}, "outputs": [ { @@ -3999,7 +3927,7 @@ "Process(`\u001b[4mcp\u001b[24m \u001b[4m/home/kose/.julia/dev/SnpArrays/src/../data/mouse.fam\u001b[24m \u001b[4mmouse_reorder.fam\u001b[24m`, ProcessExited(0))" ] }, - "execution_count": 115, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -4013,7 +3941,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 112, "metadata": {}, "outputs": [ { @@ -4022,7 +3950,7 @@ "(1940, 10150)" ] }, - "execution_count": 116, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -4042,7 +3970,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -4053,7 +3981,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 114, "metadata": {}, "outputs": [ { @@ -4075,12 +4003,12 @@ "│ Row │ fid │ iid │ 
father │ mother │ sex │ phenotype │\n", "│ │ Abstrac… │ AbstractS… │ AbstractStr… │ AbstractStr… │ Abstrac… │ Abstract… │\n", "├─────┼──────────┼────────────┼──────────────┼──────────────┼──────────┼───────────┤\n", - "│ 1 │ 1_5 │ A053629514 │ F5.3:E2.2(6) │ F5.3:A3.2(5) │ 1 │ -9 │\n", - "│ 2 │ 1_18 │ A067077564 │ G1.5:B5.4(5) │ G1.5:F5.4(5) │ 1 │ -9 │\n", - "│ 3 │ 1_2 │ A048029086 │ G1.3:B1.2(3) │ G1.3:F3.2(3) │ 2 │ -9 │\n", - "│ 4 │ 1_1 │ A048097274 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │\n", - "│ 5 │ 1_15 │ A063796366 │ H2.5:C3.4(3) │ H2.5:G4.4(2) │ 2 │ -9 │\n", - "│ 6 │ 1_45 │ A084280094 │ 6.6:F1.5(6) │ 6.6:G1.5(1) │ 1 │ -9 │\n", + "│ 1 │ 1_3 │ A067289112 │ E2.4:H3.3(2) │ E2.4:D4.3(2) │ 1 │ -9 │\n", + "│ 2 │ 1_1 │ A048031769 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │\n", + "│ 3 │ 1_2 │ A084286354 │ H4.4:C2.3(5) │ H4.4:G3.3(7) │ 1 │ -9 │\n", + "│ 4 │ 1_3 │ A048072266 │ G5.2:B5.1(4) │ G5.2:F5.1(2) │ 2 │ -9 │\n", + "│ 5 │ 1_3 │ A063311623 │ A5.4:D4.3(5) │ A5.4:H1.3(6) │ 1 │ -9 │\n", + "│ 6 │ 1_54 │ A063011268 │ B3.5:E4.4(2) │ B3.5:A3.4(4) │ 1 │ -9 │\n", "…,\n", "srcbed: mouse_reorder.bed\n", "srcbim: mouse_reorder.bim\n", @@ -4088,7 +4016,7 @@ ")" ] }, - "execution_count": 118, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } @@ -4106,7 +4034,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ diff --git a/docs/SnpLinAlg.ipynb b/docs/SnpLinAlg.ipynb index 0b448f90..60563dda 100644 --- a/docs/SnpLinAlg.ipynb +++ b/docs/SnpLinAlg.ipynb @@ -96,11 +96,13 @@ "source": [ "EUR_100_bm = SnpBitMatrix{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false)\n", "EUR_100_sla = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false)\n", - "EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=true, scale=true);\n", + "EUR_100_sla_ = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, 
impute=false)\n", + "EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=false, scale=false);\n", "\n", "EUR_101_bm = SnpBitMatrix{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false)\n", "EUR_101_sla = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false)\n", - "EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=true, scale=true);" + "EUR_101_sla_ = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false, impute=false)\n", + "EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=false, scale=false);" ] }, { @@ -121,7 +123,8 @@ "source": [ "ENV[\"JULIA_CUDA_USE_BINARYBUILDER\"] = \"false\"\n", "using CUDA\n", - "EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false);" + "EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false);\n", + "EUR_100_cu_ = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false);" ] }, { @@ -148,6 +151,7 @@ "outputs": [], "source": [ "v1 = randn(size(EUR_100, 1))\n", + "v1_ = randn(size(EUR_100, 1))\n", "v2 = randn(size(EUR_100, 2));" ] }, @@ -170,12 +174,12 @@ " memory estimate: 0 bytes\n", " allocs estimate: 0\n", " --------------\n", - " minimum time: 321.207 ms (0.00% GC)\n", - " median time: 347.112 ms (0.00% GC)\n", - " mean time: 375.497 ms (0.00% GC)\n", - " maximum time: 769.390 ms (0.00% GC)\n", + " minimum time: 383.853 ms (0.00% GC)\n", + " median time: 417.986 ms (0.00% GC)\n", + " mean time: 431.205 ms (0.00% GC)\n", + " maximum time: 530.847 ms (0.00% GC)\n", " --------------\n", - " samples: 15\n", + " samples: 12\n", " evals/sample: 1" ] }, @@ -208,10 +212,10 @@ " memory estimate: 0 bytes\n", " allocs estimate: 0\n", " --------------\n", - " minimum time: 1.745 s (0.00% GC)\n", - " median time: 1.905 s (0.00% GC)\n", - " mean time: 2.144 s (0.00% GC)\n", - " maximum time: 2.782 s (0.00% 
GC)\n", + " minimum time: 1.849 s (0.00% GC)\n", + " median time: 2.064 s (0.00% GC)\n", + " mean time: 2.047 s (0.00% GC)\n", + " maximum time: 2.229 s (0.00% GC)\n", " --------------\n", " samples: 3\n", " evals/sample: 1" @@ -231,7 +235,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Direct linear algebra on a SnpArray: " + "Direct linear algebra on a SnpArray, with mean imputation: " ] }, { @@ -243,15 +247,15 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 8.23 KiB\n", - " allocs estimate: 158\n", + " memory estimate: 38.33 KiB\n", + " allocs estimate: 1616\n", " --------------\n", - " minimum time: 1.014 s (0.00% GC)\n", - " median time: 1.021 s (0.00% GC)\n", - " mean time: 1.020 s (0.00% GC)\n", - " maximum time: 1.027 s (0.00% GC)\n", + " minimum time: 1.241 s (0.00% GC)\n", + " median time: 1.355 s (0.00% GC)\n", + " mean time: 1.352 s (0.00% GC)\n", + " maximum time: 1.457 s (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 4\n", " evals/sample: 1" ] }, @@ -268,7 +272,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The below is the benchmark for SnpBitMatrix:" + "With zero imputation:" ] }, { @@ -280,13 +284,13 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 0 bytes\n", - " allocs estimate: 0\n", + " memory estimate: 38.33 KiB\n", + " allocs estimate: 1616\n", " --------------\n", - " minimum time: 1.041 s (0.00% GC)\n", - " median time: 1.056 s (0.00% GC)\n", - " mean time: 1.058 s (0.00% GC)\n", - " maximum time: 1.076 s (0.00% GC)\n", + " minimum time: 1.017 s (0.00% GC)\n", + " median time: 1.037 s (0.00% GC)\n", + " mean time: 1.034 s (0.00% GC)\n", + " maximum time: 1.045 s (0.00% GC)\n", " --------------\n", " samples: 5\n", " evals/sample: 1" @@ -298,30 +302,58 @@ } ], "source": [ - "@benchmark (LinearAlgebra.mul!($v1, $EUR_100_bm, $v2))" + "@benchmark LinearAlgebra.mul!($v1, $EUR_100_sla_, $v2)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ - "Now let's try CUDA. The device is Nvidia Titan V." + "Indeed, we are paying some price for mean imputation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The below is the benchmark for SnpBitMatrix (always zero-imputed):" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "BenchmarkTools.Trial: \n", + " memory estimate: 0 bytes\n", + " allocs estimate: 0\n", + " --------------\n", + " minimum time: 1.045 s (0.00% GC)\n", + " median time: 1.051 s (0.00% GC)\n", + " mean time: 1.051 s (0.00% GC)\n", + " maximum time: 1.058 s (0.00% GC)\n", + " --------------\n", + " samples: 5\n", + " evals/sample: 1" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "using Adapt" + "@benchmark (LinearAlgebra.mul!($v1, $EUR_100_bm, $v2))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Moving data to GPU: " + "At first glance, the result from SnpBitMatrix might look better than SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8." 
] }, { @@ -330,8 +362,8 @@ "metadata": {}, "outputs": [], "source": [ - "v1_d = adapt(CuArray{Float64}, v1)\n", - "v2_d = adapt(CuArray{Float64}, v2);" + "v1 = randn(size(EUR_101, 1))\n", + "v2 = randn(size(EUR_101, 2));" ] }, { @@ -339,28 +371,19 @@ "execution_count": 15, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "┌ Warning: `Target(triple::String)` is deprecated, use `Target(; triple = triple)` instead.\n", - "│ caller = ip:0x0\n", - "└ @ Core :-1\n" - ] - }, { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 3.28 KiB\n", - " allocs estimate: 130\n", + " memory estimate: 44.13 KiB\n", + " allocs estimate: 1722\n", " --------------\n", - " minimum time: 22.138 ms (0.00% GC)\n", - " median time: 22.356 ms (0.00% GC)\n", - " mean time: 22.352 ms (0.00% GC)\n", - " maximum time: 23.825 ms (0.00% GC)\n", + " minimum time: 1.259 s (0.00% GC)\n", + " median time: 1.268 s (0.00% GC)\n", + " mean time: 1.268 s (0.00% GC)\n", + " maximum time: 1.277 s (0.00% GC)\n", " --------------\n", - " samples: 224\n", + " samples: 4\n", " evals/sample: 1" ] }, @@ -370,15 +393,7 @@ } ], "source": [ - "using BenchmarkTools\n", - "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_cu, $v2_d)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The speedup is obvious. 
Let's check correctness:" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2)" ] }, { @@ -389,7 +404,17 @@ { "data": { "text/plain": [ - "true" + "BenchmarkTools.Trial: \n", + " memory estimate: 44.13 KiB\n", + " allocs estimate: 1722\n", + " --------------\n", + " minimum time: 1.038 s (0.00% GC)\n", + " median time: 1.043 s (0.00% GC)\n", + " mean time: 1.043 s (0.00% GC)\n", + " maximum time: 1.051 s (0.00% GC)\n", + " --------------\n", + " samples: 5\n", + " evals/sample: 1" ] }, "execution_count": 16, @@ -398,60 +423,62 @@ } ], "source": [ - "isapprox(collect(v1_d), v1)" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla_, $v2)" ] }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "EUR_100_mat_d = adapt(CuArray, EUR_100_mat);" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 2.58 KiB\n", - " allocs estimate: 85\n", + " memory estimate: 0 bytes\n", + " allocs estimate: 0\n", " --------------\n", - " minimum time: 78.002 ms (0.00% GC)\n", - " median time: 80.142 ms (0.00% GC)\n", - " mean time: 80.055 ms (0.00% GC)\n", - " maximum time: 83.111 ms (0.00% GC)\n", + " minimum time: 1.544 s (0.00% GC)\n", + " median time: 1.561 s (0.00% GC)\n", + " mean time: 1.561 s (0.00% GC)\n", + " maximum time: 1.577 s (0.00% GC)\n", " --------------\n", - " samples: 63\n", + " samples: 4\n", " evals/sample: 1" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_mat_d, $v2_d)" + "@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU." + "Now let's try CUDA. 
The device is Nvidia Titan V." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "using Adapt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "At first glance, the result from SnpBitMatrix might look similar to SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8." + "Moving data to GPU: " ] }, { @@ -460,8 +487,12 @@ "metadata": {}, "outputs": [], "source": [ - "v1 = randn(size(EUR_101, 1))\n", - "v2 = randn(size(EUR_101, 2));" + "v1 = randn(size(EUR_100, 1))\n", + "v1_ = randn(size(EUR_100, 1))\n", + "v2 = randn(size(EUR_100, 2));\n", + "v1_d = adapt(CuArray{Float64}, v1)\n", + "v1_d_ = similar(v1_d)\n", + "v2_d = adapt(CuArray{Float64}, v2);" ] }, { @@ -469,19 +500,28 @@ "execution_count": 20, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "┌ Warning: `Target(triple::String)` is deprecated, use `Target(; triple = triple)` instead.\n", + "│ caller = ip:0x0\n", + "└ @ Core :-1\n" + ] + }, { "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 14.03 KiB\n", - " allocs estimate: 264\n", + " memory estimate: 3.28 KiB\n", + " allocs estimate: 130\n", " --------------\n", - " minimum time: 1.022 s (0.00% GC)\n", - " median time: 1.024 s (0.00% GC)\n", - " mean time: 1.024 s (0.00% GC)\n", - " maximum time: 1.028 s (0.00% GC)\n", + " minimum time: 22.319 ms (0.00% GC)\n", + " median time: 22.429 ms (0.00% GC)\n", + " mean time: 22.722 ms (0.00% GC)\n", + " maximum time: 26.938 ms (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 220\n", " evals/sample: 1" ] }, @@ -491,7 +531,15 @@ } ], "source": [ - "@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2)" + "using BenchmarkTools\n", + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_cu, $v2_d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For CuSnpArray, the 
additional cost for mean imputation is negligible." ] }, { @@ -503,15 +551,15 @@ "data": { "text/plain": [ "BenchmarkTools.Trial: \n", - " memory estimate: 0 bytes\n", - " allocs estimate: 0\n", + " memory estimate: 3.28 KiB\n", + " allocs estimate: 130\n", " --------------\n", - " minimum time: 1.187 s (0.00% GC)\n", - " median time: 1.188 s (0.00% GC)\n", - " mean time: 1.191 s (0.00% GC)\n", - " maximum time: 1.197 s (0.00% GC)\n", + " minimum time: 22.243 ms (0.00% GC)\n", + " median time: 22.431 ms (0.00% GC)\n", + " mean time: 22.805 ms (0.00% GC)\n", + " maximum time: 57.050 ms (0.00% GC)\n", " --------------\n", - " samples: 5\n", + " samples: 220\n", " evals/sample: 1" ] }, @@ -521,7 +569,73 @@ } ], "source": [ - "@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2)" + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d_, $EUR_100_cu_, $v2_d)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "EUR_100_mat_d = adapt(CuArray, EUR_100_mat);" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BenchmarkTools.Trial: \n", + " memory estimate: 2.58 KiB\n", + " allocs estimate: 85\n", + " --------------\n", + " minimum time: 75.951 ms (0.00% GC)\n", + " median time: 76.516 ms (0.00% GC)\n", + " mean time: 77.909 ms (0.00% GC)\n", + " maximum time: 82.096 ms (0.00% GC)\n", + " --------------\n", + " samples: 65\n", + " evals/sample: 1" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d, $EUR_100_mat_d, $v2_d)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The speedup is obvious, CuSnpArrays is 30-50x faster than on CPU, and using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "true" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isapprox(v1_d, v1_d_)" ] }, { @@ -533,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -545,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -555,16 +669,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 662.131 ms (0.00% GC)\n", - " median time: 678.294 ms (0.00% GC)\n", - " mean time: 701.543 ms (0.00% GC)\n", - " maximum time: 790.350 ms (0.00% GC)\n", + " minimum time: 842.187 ms (0.00% GC)\n", + " median time: 882.157 ms (0.00% GC)\n", + " mean time: 913.111 ms (0.00% GC)\n", + " maximum time: 1.049 s (0.00% GC)\n", " --------------\n", - " samples: 8\n", + " samples: 6\n", " evals/sample: 1" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -575,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -585,16 +699,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 605.351 ms (0.00% GC)\n", - " median time: 618.706 ms (0.00% GC)\n", - " mean time: 620.553 ms (0.00% GC)\n", - " maximum time: 658.692 ms (0.00% GC)\n", + " minimum time: 618.434 ms (0.00% GC)\n", + " median time: 630.293 ms (0.00% GC)\n", + " mean time: 689.063 ms (0.00% GC)\n", + " maximum time: 881.782 ms (0.00% GC)\n", " --------------\n", - " samples: 9\n", + " samples: 8\n", " evals/sample: 1" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -605,7 +719,7 @@ }, { "cell_type": "code", - "execution_count": 25, + 
"execution_count": 28, "metadata": {}, "outputs": [ { @@ -615,16 +729,16 @@ " memory estimate: 3.08 KiB\n", " allocs estimate: 118\n", " --------------\n", - " minimum time: 26.926 ms (0.00% GC)\n", - " median time: 27.330 ms (0.00% GC)\n", - " mean time: 27.494 ms (0.00% GC)\n", - " maximum time: 31.492 ms (0.00% GC)\n", + " minimum time: 26.710 ms (0.00% GC)\n", + " median time: 26.930 ms (0.00% GC)\n", + " mean time: 27.350 ms (0.00% GC)\n", + " maximum time: 32.306 ms (0.00% GC)\n", " --------------\n", - " samples: 182\n", + " samples: 183\n", " evals/sample: 1" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -635,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -644,7 +758,7 @@ "true" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -655,7 +769,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -665,7 +779,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -675,16 +789,16 @@ " memory estimate: 128 bytes\n", " allocs estimate: 3\n", " --------------\n", - " minimum time: 667.369 ms (0.00% GC)\n", - " median time: 681.264 ms (0.00% GC)\n", - " mean time: 697.866 ms (0.00% GC)\n", - " maximum time: 789.988 ms (0.00% GC)\n", + " minimum time: 850.050 ms (0.00% GC)\n", + " median time: 862.818 ms (0.00% GC)\n", + " mean time: 953.739 ms (0.00% GC)\n", + " maximum time: 1.391 s (0.00% GC)\n", " --------------\n", - " samples: 8\n", + " samples: 6\n", " evals/sample: 1" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -695,7 +809,37 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"BenchmarkTools.Trial: \n", + " memory estimate: 128 bytes\n", + " allocs estimate: 3\n", + " --------------\n", + " minimum time: 673.269 ms (0.00% GC)\n", + " median time: 693.418 ms (0.00% GC)\n", + " mean time: 700.729 ms (0.00% GC)\n", + " maximum time: 784.130 ms (0.00% GC)\n", + " --------------\n", + " samples: 8\n", + " evals/sample: 1" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@benchmark LinearAlgebra.mul!($v2, transpose($EUR_101_sla_), $v1)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -705,16 +849,16 @@ " memory estimate: 16 bytes\n", " allocs estimate: 1\n", " --------------\n", - " minimum time: 612.441 ms (0.00% GC)\n", - " median time: 624.015 ms (0.00% GC)\n", - " mean time: 641.973 ms (0.00% GC)\n", - " maximum time: 770.845 ms (0.00% GC)\n", + " minimum time: 620.830 ms (0.00% GC)\n", + " median time: 637.698 ms (0.00% GC)\n", + " mean time: 656.708 ms (0.00% GC)\n", + " maximum time: 798.543 ms (0.00% GC)\n", " --------------\n", " samples: 8\n", " evals/sample: 1" ] }, - "execution_count": 29, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } diff --git a/docs/src/index.md b/docs/src/index.md index e82efafb..7f1127c3 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -167,7 +167,7 @@ Because the file is memory-mapped opening the file and accessing the data is fas @btime(SnpArray(SnpArrays.datadir("mouse.bed"))); ``` - 87.555 μs (64 allocations: 390.03 KiB) + 94.032 μs (64 allocations: 390.03 KiB) By default, the memory-mapped file is read only, changing entries is not allowed. 
@@ -430,11 +430,11 @@ tmpbf = SnpArray(undef, 5, 3) 5×3 SnpArray: - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x00 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -449,11 +449,11 @@ tmpbf = SnpArray("tmp.bed", tmpbf) 5×3 SnpArray: - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x00 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -467,11 +467,11 @@ tmpbf 5×3 SnpArray: - 0x02 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 - 0x00 0x00 0x00 + 0x02 0x01 0x01 + 0x00 0x03 0x01 + 0x01 0x03 0x02 + 0x02 0x01 0x00 + 0x00 0x00 0x03 @@ -481,11 +481,11 @@ tmpbf rm("tmp.bed", force=true) ``` -## `convert` and `copyto!` +### `convert` and `copyto!` Most common usage of SnpArray is to convert genotypes to numeric values for statistical analysis. Conversion rule depends on genetic models (additive, dominant, or recessive), centering, scaling, or imputation. -### `convert` +#### `convert` `convert` function has 4 keyword arguments: `model`, `center`, `scale`, and `impute`. @@ -730,7 +730,7 @@ convert(Vector{Float64}, @view(mouse[:, end]), center=true, scale=true, impute=t -### `copyto!` +#### `copyto!` `copyto!` is the in-place version of `convert`. It takes the same keyword arguments (`model`, `center`, `scale`, `impute`) as `convert`. 
@@ -780,7 +780,7 @@ copyto!(v, @view(mouse[:, 1])) @btime(copyto!($v, $@view(mouse[:, 1]))); ``` - 3.719 μs (0 allocations: 0 bytes) + 3.655 μs (0 allocations: 0 bytes) Copy columns using defaults @@ -830,7 +830,7 @@ copyto!(v2, @view(mouse[:, 1:2])) @btime(copyto!($v2, $@view(mouse[:, 1:2]))); ``` - 7.138 μs (0 allocations: 0 bytes) + 7.205 μs (0 allocations: 0 bytes) Center and scale @@ -879,7 +879,7 @@ copyto!(v, @view(mouse[:, 1]), center=true, scale=true) @btime(copyto!($v, $(@view(mouse[:, 1])), center=true, scale=true)); ``` - 6.860 μs (0 allocations: 0 bytes) + 6.856 μs (0 allocations: 0 bytes) Looping over all columns @@ -895,7 +895,7 @@ end @btime(loop_test($v, $mouse)) ``` - 50.433 ms (10150 allocations: 475.78 KiB) + 50.329 ms (10150 allocations: 475.78 KiB) Copy whole SnpArray @@ -906,12 +906,12 @@ M = similar(mouse, Float64) @btime(copyto!($M, $mouse)); ``` - 64.308 ms (0 allocations: 0 bytes) + 64.344 ms (0 allocations: 0 bytes) -## Summaries +### Summaries -### Counts +#### Counts Counts of each the four possible values for each column are returned by `counts`.` @@ -941,10 +941,10 @@ The counts by column and by row are cached in the `SnpArray` object. Accesses af @btime(counts($mouse, dims=1)); ``` - 6.047 ns (0 allocations: 0 bytes) + 6.043 ns (0 allocations: 0 bytes) -### Minor allele frequencies +#### Minor allele frequencies Minor allele frequencies (MAF) for each SNP. @@ -1026,7 +1026,7 @@ minorallele(mouse) -### `mean` and `var` +#### `mean` and `var` The package provides methods for the generics `mean` and `var` from the `Statistics` package. @@ -1076,7 +1076,7 @@ These methods make use of the cached column or row counts and thus are very fast @btime(mean($mouse, dims=1)); ``` - 17.328 μs (2 allocations: 79.39 KiB) + 17.263 μs (2 allocations: 79.39 KiB) The column-wise or row-wise standard deviations are returned by `std`. 
@@ -1119,7 +1119,7 @@ std(mouse, dims=2) -### Missing rate +#### Missing rate Proportion of missing genotypes @@ -1199,7 +1199,7 @@ missingrate(mouse, 2) -### Location of the missing values +#### Location of the missing values The positions of the missing data are evaluated by @@ -1246,7 +1246,7 @@ mp = missingpos(mouse) @btime(missingpos($mouse)); ``` - 43.661 ms (19274 allocations: 1.80 MiB) + 43.843 ms (19274 allocations: 1.80 MiB) So, for example, the number of missing data values in each column can be evaluated as @@ -1279,7 +1279,7 @@ view(counts(mouse, dims=1), 2:2, :) -## Genetic relationship matrix +### Genetic relationship matrix `grm` function computes the empirical kinship matrix using either the classical genetic relationship matrix, `grm(A, model=:GRM)`, or the method of moment method, `grm(A, model=:MoM)`, or the robust method, `grm(A, model=:Robust)`. @@ -1330,7 +1330,7 @@ grm(mouse, method=:GRM) @btime(grm($mouse, method=:GRM)); ``` - 502.287 ms (25 allocations: 28.95 MiB) + 505.403 ms (25 allocations: 28.95 MiB) Using Float32 (single precision) potentially saves memory usage and computation time. @@ -1378,7 +1378,7 @@ grm(mouse, method=:GRM, t=Float32) @btime(grm($mouse, method=:GRM, t=Float32)); ``` - 315.064 ms (26 allocations: 14.60 MiB) + 312.411 ms (26 allocations: 14.60 MiB) By default, `grm` exlcude SNPs with minor allele frequency below 0.01. This can be changed by the keyword argument `minmaf`. @@ -1463,7 +1463,7 @@ grm(mouse, cinds=1:2:size(mouse, 2)) -## Filtering +### Filtering Before GWAS, we often need to filter SNPs and/or samples according to genotyping success rates, minor allele frequencies, and Hardy-Weinberg Equilibrium test. This can be achieved by the `filter` function. 
@@ -1502,7 +1502,7 @@ count(rowmask), count(colmask) @btime(SnpArrays.filter($mouse, min_success_rate_per_row=0.999, min_success_rate_per_col=0.999)); ``` - 150.800 ms (11460 allocations: 171.28 MiB) + 148.931 ms (11460 allocations: 171.28 MiB) One may use the `rowmask` and `colmask` to filter and save filtering result as Plink files. @@ -1510,7 +1510,7 @@ One may use the `rowmask` and `colmask` to filter and save filtering result as P SnpArrays.filter(SnpArrays.datadir("mouse"), rowmask, colmask) ``` -### Filter Plink files +#### Filter Plink files Filter a set of Plink files according to row indices and column indices. By result, filtered Plink files are saved as `srcname.filtered.bed`, `srcname.filtered.fam`, and `srcname.filtered.bim`, where `srcname` is the source Plink file name. You can also specify destimation file name using keyword `des`. @@ -1694,14 +1694,10 @@ readdir(glob"tmp_*", ".") - 7-element Array{String,1}: + 3-element Array{String,1}: "./tmp_hcat_arr_1.bed" "./tmp_hvcat_arr_1.bed" "./tmp_vcat_arr_1.bed" - "./tmp_vcat_arr_2.bed" - "./tmp_vcat_arr_3.bed" - "./tmp_vcat_arr_4.bed" - "./tmp_vcat_arr_5.bed" @@ -1770,12 +1766,17 @@ rm(SnpArrays.datadir("mouse.test.vcat.bed"), force=true) rm(SnpArrays.datadir("mouse.test.hvcat.bed"), force=true) ``` -## SnpBitMatrix +## Linear Algebra + +In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved in three different `struct`s: -In some applications we want to perform linear algebra using SnpArray directly without expanding it to numeric matrix. This is achieved by the `SnpBitMatrix` type. The implementation assumes: +1. Direct operations on a plink-formatted `SnpArray`: `SnpLinAlg` +2. Operations on transformed `BitMatrix`es: `SnpBitMatrix` +3. Direct operations on a plink-formatted data on an Nvidia GPU: `CuSnpArray`. -1. the SnpArray does not have missing genotypes, and -2. 
the matrix corresponding to SnpArray is the matrix of A2 allele counts. +`SnpLinAlg` and `SnpBitMatrix` use Chris Elrod's [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) internally. It is much faster on machines with AVX support. `CuSnpArray` uses [CUDA.jl](https://juliagpu.gitlab.io/CUDA.jl/) internally. + +The implementation assumes that the matrix corresponding to SnpArray is the matrix of the A2 allele counts. `SnpLinAlg` and `CuSnpArray` impute any missing genotype with its column mean by default. They can also be configured to impute missing genotypes with zero. `SnpBitMatrix` can only impute missing values with zero. ### Constructor @@ -1819,112 +1820,118 @@ const EUR = SnpArray(SnpArrays.datadir("EUR_subset.bed")) -To instantiate a SnpBitMatrix based on SnpArray, +To instantiate a SnpLinAlg based on SnpArray, ```julia +const EURsla = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true); +const EURsla_ = SnpLinAlg{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true, impute=false); const EURbm = SnpBitMatrix{Float64}(EUR, model=ADDITIVE_MODEL, center=true, scale=true); ``` -The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpBitMatrix acts like a Float64 matrix. +The constructor shares the same keyword arguments as the `convert` or `copyto!` functions. The type parameter, `Float64` in this example, indicates the SnpLinAlg acts like a Float64 matrix. +SnpLinAlg directly uses the SnpArray for computation. -The memory usage of the SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`. 
+On the other hand, memory usage of SnpBitMatrix should be similar to the SnpArray, or equivalently bed file size, if `model=ADDITIVE_MODEL`, or half of that of SnpArray if `model=DOMINANT_MODEL` or `model=RECESSIVE_MODEL`. ```julia -Base.summarysize(EUR), Base.summarysize(EURbm) +Base.summarysize(EUR), Base.summarysize(EURsla), Base.summarysize(EURbm) ``` - (6876757, 6421960) + (6876757, 8177221, 6421960) -### Linear algebra +### `mul!()` -A SnpBitMatrix acts similar to a regular matrix and responds to `size`, `eltype`, and SnpBitMatrix-vector multiplications. +SnpLinAlg and SnpBitMatrix act similar to a regular matrix and responds to `size`, `eltype`, and matrix-vector multiplications. ```julia +@show size(EURsla) +@show eltype(EURsla) +@show typeof(EURsla) <: AbstractMatrix; + @show size(EURbm) @show eltype(EURbm) @show typeof(EURbm) <: AbstractMatrix; ``` + size(EURsla) = (379, 54051) + eltype(EURsla) = Float64 + typeof(EURsla) <: AbstractMatrix = true size(EURbm) = (379, 54051) eltype(EURbm) = Float64 typeof(EURbm) <: AbstractMatrix = true -SnpBitMatrix-vector multiplication is mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray. +Matrix-vector multiplications with SnpLinAlg and SnpBitMatrix are mathematically equivalent to the corresponding Float matrix contained from `convert` or `copyto!` a SnpArray. 
```julia using LinearAlgebra v1 = randn(size(EUR, 1)) v2 = randn(size(EUR, 2)) -A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true) -norm(EURbm * v2 - A * v2) +A = convert(Matrix{Float64}, EUR, model=ADDITIVE_MODEL, center=true, scale=true); +``` + + +```julia +norm(EURsla * v2 - A * v2) ``` - 2.837775572500693e-11 + 3.255157000354014e-11 ```julia -norm(EURbm' * v1 - A' * v1) +norm(EURsla' * v1 - A' *v1) ``` - 5.399567080480323e-12 - + 5.398163705112499e-12 -In this example, the Float64 matrix fits into memory so the SnpBitMatrix-vector multiplication is much slower than Matrix{Float64}-vector multiplication (highly optimized BLAS). ```julia -out1 = Vector{Float64}(undef, size(EUR, 1)) -out2 = Vector{Float64}(undef, size(EUR, 2)) -@btime(mul!($out1, $EURbm, $v2)); +norm(EURbm * v2 - A * v2) ``` - 8.495 ms (0 allocations: 0 bytes) -```julia -@btime(mul!($out1, $A, $v2)); -``` + 3.590044545281484e-11 - 7.844 ms (0 allocations: 0 bytes) ```julia -@btime(mul!($out2, $transpose($EURbm), $v1)); +norm(EURbm' * v1 - A' * v1) ``` - 6.612 ms (1 allocation: 16 bytes) -```julia -@btime(mul!($out2, $transpose($A), $v1)); -``` + 8.54121862883797e-12 + - 3.011 ms (0 allocations: 0 bytes) +See Linear Algebra page for performance comparison among BLAS, SnpLinAlg, and SnpBitMatrix. In general, SnpLinAlg and SnpBitMatrix operations are at least twice as fast as the corresponding Matrix{Float64}-vector multiplication. -In another test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 folder faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory. +In general, computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory, and can impute missing values with column means. See Linear Algebra page for more information. 
+ +In a test example with ~1GB bed file, SnpBitMatrix-vector multiplication is about 3-5 fold faster than the corresponding Matrix{Float64}-vector multiplication, because the Matrix{Float64} matrix cannot fit into the memory. `SnpBitMatrix` can be created from a subarray of SnpArray. @@ -1970,7 +1977,7 @@ norm(EURsubbm * v2 - A * v2) - 8.987658251177042e-14 + 5.05574647736198e-14 @@ -1982,43 +1989,11 @@ norm(EURsubbm' * v1 - A' * v1) - 1.6136185547316357e-14 - - - -### Two alternatives: SnpLinAlg and CuSnpArray - -Basically, creating SnpBitMatrix doubles the memory usage. If it becomes an issue, direct linear algebra operations from a SnpArray is possible. - -#### SnpLinAlg - - -```julia -const EURla = SnpLinAlg{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale=true); -``` - - -```julia -v1 = randn(size(EUR, 1)) -v2 = randn(size(EUR, 2)) -out1 = Vector{Float64}(undef, size(EUR, 1)) -out2 = Vector{Float64}(undef, size(EUR, 2)) -@btime mul!($out1, $EURla, $v2); -``` - - 6.880 ms (264 allocations: 14.03 KiB) - - -```julia -@btime mul!($out2, transpose($EURla), $v1); -``` - - 9.760 ms (3 allocations: 128 bytes) + 1.4647267077134765e-14 -Computing $Ax$ is faster in SnpLinAlg, and computing $A^T x$ is faster in SnpBitMatrix. Note that SnpLinAlg does not allocate additional memory. See Linear Algebra page for more information. -#### CuSnpArray +### GPU support: CuSnpArray On machines with Nvidia GPU, matrix-vector multiplications can be performed on it via CuSnpArray. The input vectors should be CuVectors. 
@@ -2026,6 +2001,10 @@ On machines with Nvidia GPU, matrix-vector multiplications can be performed on i ```julia ENV["JULIA_CUDA_USE_BINARYBUILDER"] = "false" # will use local CUDA installation using CUDA, Adapt +out1 = randn(size(EUR, 1)) +out2 = randn(size(EUR, 2)) +v1 = randn(size(EUR, 1)) +v2 = randn(size(EUR, 2)) v1_d = adapt(CuVector{Float64}, v1) # sends data to GPU v2_d = adapt(CuVector{Float64}, v2) out1_d = adapt(CuVector{Float64}, out1) @@ -2049,7 +2028,7 @@ const EURcu = CuSnpArray{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale= └ @ Core :-1 - 18.361 ms (253 allocations: 7.69 KiB) + 18.204 ms (253 allocations: 7.69 KiB) @@ -2057,7 +2036,7 @@ const EURcu = CuSnpArray{Float64}(EUR; model=ADDITIVE_MODEL, center=true, scale= @btime mul!($out2_d, transpose($EURcu), $v1_d); ``` - 626.435 μs (465 allocations: 15.00 KiB) + 544.722 μs (465 allocations: 15.00 KiB) The operations are parallelized along the output dimension, hence the GPU was not fully utilized in the first case. With 100-time larger data, 30 to 50-fold speedup were observed for both cases with Nvidia Titan V. See linear algebra page for more information. @@ -2072,7 +2051,7 @@ norm(collect(EURcu' * v1_d) - EURbm' * v1) - 4.38122146476498e-11 + 2.2306544292754367e-11 @@ -2600,9 +2579,9 @@ We can merge the splitted dictionary back into one SnpData using `merge_plink`. merged = SnpArrays.merge_plink("tmp.merged", splitted) # write_plink is included here ``` - 0.066388 seconds (117.18 k allocations: 7.904 MiB) - 0.049178 seconds (125.97 k allocations: 11.520 MiB) - 0.051638 seconds (134.18 k allocations: 8.737 MiB) + 0.067004 seconds (117.20 k allocations: 7.905 MiB) + 0.102031 seconds (125.97 k allocations: 11.520 MiB, 51.01% gc time) + 0.050454 seconds (134.18 k allocations: 8.737 MiB) @@ -2645,10 +2624,10 @@ You can also merge the plink formatted files based on their common prefix. 
merged_from_splitted_files = merge_plink("tmp.split.chr"; des = "tmp.merged.2") ``` - 0.143816 seconds (783.53 k allocations: 36.258 MiB) - 0.575327 seconds (1.19 M allocations: 61.140 MiB) - 0.003704 seconds (8 allocations: 4.897 MiB) - 0.000827 seconds (8 allocations: 1.650 MiB) + 0.146038 seconds (783.53 k allocations: 36.252 MiB) + 0.568753 seconds (1.19 M allocations: 61.158 MiB) + 0.003368 seconds (8 allocations: 4.897 MiB) + 0.001259 seconds (8 allocations: 1.650 MiB) @@ -2750,12 +2729,12 @@ mouse_toreorder │ Row │ fid │ iid │ father │ mother │ sex │ phenotype │ │ │ Abstrac… │ AbstractS… │ AbstractStr… │ AbstractStr… │ Abstrac… │ Abstract… │ ├─────┼──────────┼────────────┼──────────────┼──────────────┼──────────┼───────────┤ - │ 1 │ 1_5 │ A053629514 │ F5.3:E2.2(6) │ F5.3:A3.2(5) │ 1 │ -9 │ - │ 2 │ 1_18 │ A067077564 │ G1.5:B5.4(5) │ G1.5:F5.4(5) │ 1 │ -9 │ - │ 3 │ 1_2 │ A048029086 │ G1.3:B1.2(3) │ G1.3:F3.2(3) │ 2 │ -9 │ - │ 4 │ 1_1 │ A048097274 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │ - │ 5 │ 1_15 │ A063796366 │ H2.5:C3.4(3) │ H2.5:G4.4(2) │ 2 │ -9 │ - │ 6 │ 1_45 │ A084280094 │ 6.6:F1.5(6) │ 6.6:G1.5(1) │ 1 │ -9 │ + │ 1 │ 1_3 │ A067289112 │ E2.4:H3.3(2) │ E2.4:D4.3(2) │ 1 │ -9 │ + │ 2 │ 1_1 │ A048031769 │ H4.2:C5.1(4) │ H4.2:G1.1(7) │ 1 │ -9 │ + │ 3 │ 1_2 │ A084286354 │ H4.4:C2.3(5) │ H4.4:G3.3(7) │ 1 │ -9 │ + │ 4 │ 1_3 │ A048072266 │ G5.2:B5.1(4) │ G5.2:F5.1(2) │ 2 │ -9 │ + │ 5 │ 1_3 │ A063311623 │ A5.4:D4.3(5) │ A5.4:H1.3(6) │ 1 │ -9 │ + │ 6 │ 1_54 │ A063011268 │ B3.5:E4.4(2) │ B3.5:A3.4(4) │ 1 │ -9 │ …, srcbed: mouse_reorder.bed srcbim: mouse_reorder.bim diff --git a/docs/src/linalg.md b/docs/src/linalg.md index 3282f509..fc2457da 100644 --- a/docs/src/linalg.md +++ b/docs/src/linalg.md @@ -37,25 +37,25 @@ const EUR = SnpArray(SnpArrays.datadir("EUR_subset.bed")); Let's try with EUR data repeated 100 and 101 times: 37900 by 54051 and 38279 by 54051, respectively. 
- ```julia EUR_10 = [EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR; EUR] EUR_100 = [EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10; EUR_10]; EUR_101 = [EUR_100; EUR]; ``` - We create instnaces of SnpLinAlg, SnpBitmatrix and CuSnpArray: ```julia EUR_100_bm = SnpBitMatrix{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false) EUR_100_sla = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false) -EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=true, scale=true); +EUR_100_sla_ = SnpLinAlg{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false) +EUR_100_mat = convert(Matrix{Float64}, EUR_100, model=ADDITIVE_MODEL, center=false, scale=false); EUR_101_bm = SnpBitMatrix{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false) EUR_101_sla = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false) -EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=true, scale=true); +EUR_101_sla_ = SnpLinAlg{Float64}(EUR_101; model=ADDITIVE_MODEL, center=false, scale=false, impute=false) +EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=false, scale=false); ``` @@ -63,6 +63,7 @@ EUR_101_mat = convert(Matrix{Float64}, EUR_101, model=ADDITIVE_MODEL, center=tru ENV["JULIA_CUDA_USE_BINARYBUILDER"] = "false" using CUDA EUR_100_cu = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false); +EUR_100_cu_ = CuSnpArray{Float64}(EUR_100; model=ADDITIVE_MODEL, center=false, scale=false, impute=false); ``` ┌ Warning: `haskey(::TargetIterator, name::String)` is deprecated, use `Target(; name = name) !== nothing` instead. 
@@ -81,6 +82,7 @@ using BenchmarkTools ```julia v1 = randn(size(EUR_100, 1)) +v1_ = randn(size(EUR_100, 1)) v2 = randn(size(EUR_100, 2)); ``` @@ -99,12 +101,12 @@ BLAS.set_num_threads(8) memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 321.207 ms (0.00% GC) - median time: 347.112 ms (0.00% GC) - mean time: 375.497 ms (0.00% GC) - maximum time: 769.390 ms (0.00% GC) + minimum time: 383.853 ms (0.00% GC) + median time: 417.986 ms (0.00% GC) + mean time: 431.205 ms (0.00% GC) + maximum time: 530.847 ms (0.00% GC) -------------- - samples: 15 + samples: 12 evals/sample: 1 @@ -124,17 +126,17 @@ BLAS.set_num_threads(1) memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 1.745 s (0.00% GC) - median time: 1.905 s (0.00% GC) - mean time: 2.144 s (0.00% GC) - maximum time: 2.782 s (0.00% GC) + minimum time: 1.849 s (0.00% GC) + median time: 2.064 s (0.00% GC) + mean time: 2.047 s (0.00% GC) + maximum time: 2.229 s (0.00% GC) -------------- samples: 3 evals/sample: 1 -Direct linear algebra on a SnpArray: +Direct linear algebra on a SnpArray, with mean imputation: ```julia @@ -145,20 +147,46 @@ Direct linear algebra on a SnpArray: BenchmarkTools.Trial: - memory estimate: 8.23 KiB - allocs estimate: 158 + memory estimate: 38.33 KiB + allocs estimate: 1616 + -------------- + minimum time: 1.241 s (0.00% GC) + median time: 1.355 s (0.00% GC) + mean time: 1.352 s (0.00% GC) + maximum time: 1.457 s (0.00% GC) + -------------- + samples: 4 + evals/sample: 1 + + + +With zero imputation: + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_100_sla_, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 38.33 KiB + allocs estimate: 1616 -------------- - minimum time: 1.014 s (0.00% GC) - median time: 1.021 s (0.00% GC) - mean time: 1.020 s (0.00% GC) - maximum time: 1.027 s (0.00% GC) + minimum time: 1.017 s (0.00% GC) + median time: 1.037 s (0.00% GC) + mean time: 1.034 s (0.00% GC) + maximum time: 1.045 s (0.00% GC) 
-------------- samples: 5 evals/sample: 1 -The below is the benchmark for SnpBitMatrix: +Indeed, we are paying some price for mean imputation. + +The below is the benchmark for SnpBitMatrix (always zero-imputed): ```julia @@ -172,16 +200,90 @@ The below is the benchmark for SnpBitMatrix: memory estimate: 0 bytes allocs estimate: 0 -------------- - minimum time: 1.041 s (0.00% GC) - median time: 1.056 s (0.00% GC) - mean time: 1.058 s (0.00% GC) - maximum time: 1.076 s (0.00% GC) + minimum time: 1.045 s (0.00% GC) + median time: 1.051 s (0.00% GC) + mean time: 1.051 s (0.00% GC) + maximum time: 1.058 s (0.00% GC) + -------------- + samples: 5 + evals/sample: 1 + + + +At first glance, the result from SnpBitMatrix might look better than SnpLinAlg. However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8. + + +```julia +v1 = randn(size(EUR_101, 1)) +v2 = randn(size(EUR_101, 2)); +``` + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 44.13 KiB + allocs estimate: 1722 + -------------- + minimum time: 1.259 s (0.00% GC) + median time: 1.268 s (0.00% GC) + mean time: 1.268 s (0.00% GC) + maximum time: 1.277 s (0.00% GC) + -------------- + samples: 4 + evals/sample: 1 + + + + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla_, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 44.13 KiB + allocs estimate: 1722 + -------------- + minimum time: 1.038 s (0.00% GC) + median time: 1.043 s (0.00% GC) + mean time: 1.043 s (0.00% GC) + maximum time: 1.051 s (0.00% GC) -------------- samples: 5 evals/sample: 1 + +```julia +@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 0 bytes + allocs estimate: 0 + -------------- + minimum time: 1.544 s (0.00% GC) + median time: 1.561 s (0.00% GC) + mean time: 1.561 s (0.00% GC) + maximum time: 1.577 s (0.00% GC) + -------------- + samples: 4 + 
evals/sample: 1 + + + Now let's try CUDA. The device is Nvidia Titan V. @@ -193,7 +295,11 @@ Moving data to GPU: ```julia +v1 = randn(size(EUR_100, 1)) +v1_ = randn(size(EUR_100, 1)) +v2 = randn(size(EUR_100, 2)); v1_d = adapt(CuArray{Float64}, v1) +v1_d_ = similar(v1_d) v2_d = adapt(CuArray{Float64}, v2); ``` @@ -215,27 +321,37 @@ using BenchmarkTools memory estimate: 3.28 KiB allocs estimate: 130 -------------- - minimum time: 22.138 ms (0.00% GC) - median time: 22.356 ms (0.00% GC) - mean time: 22.352 ms (0.00% GC) - maximum time: 23.825 ms (0.00% GC) + minimum time: 22.319 ms (0.00% GC) + median time: 22.429 ms (0.00% GC) + mean time: 22.722 ms (0.00% GC) + maximum time: 26.938 ms (0.00% GC) -------------- - samples: 224 + samples: 220 evals/sample: 1 -The speedup is obvious. Let's check correctness: +For CuSnpArray, the additional cost for mean imputation is negligible. ```julia -isapprox(collect(v1_d), v1) +@benchmark CUDA.@sync LinearAlgebra.mul!($v1_d_, $EUR_100_cu_, $v2_d) ``` - true + BenchmarkTools.Trial: + memory estimate: 3.28 KiB + allocs estimate: 130 + -------------- + minimum time: 22.243 ms (0.00% GC) + median time: 22.431 ms (0.00% GC) + mean time: 22.805 ms (0.00% GC) + maximum time: 57.050 ms (0.00% GC) + -------------- + samples: 220 + evals/sample: 1 @@ -256,67 +372,27 @@ EUR_100_mat_d = adapt(CuArray, EUR_100_mat); memory estimate: 2.58 KiB allocs estimate: 85 -------------- - minimum time: 78.002 ms (0.00% GC) - median time: 80.142 ms (0.00% GC) - mean time: 80.055 ms (0.00% GC) - maximum time: 83.111 ms (0.00% GC) + minimum time: 75.951 ms (0.00% GC) + median time: 76.516 ms (0.00% GC) + mean time: 77.909 ms (0.00% GC) + maximum time: 82.096 ms (0.00% GC) -------------- - samples: 63 + samples: 65 evals/sample: 1 -Using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU. - -At first glance, the result from SnpBitMatrix might look similar to SnpLinAlg. 
However, SnpLinAlg is more stable in performance when the number of samples is not multiple of 4 or 8. +The speedup is obvious, CuSnpArrays is 30-50x faster than on CPU, and using CuSnpArray is both faster and memory-efficient compared to linear algebra with floating point matrix on GPU. ```julia -v1 = randn(size(EUR_101, 1)) -v2 = randn(size(EUR_101, 2)); -``` - - -```julia -@benchmark LinearAlgebra.mul!($v1, $EUR_101_sla, $v2) +isapprox(v1_d, v1_d_) ``` - BenchmarkTools.Trial: - memory estimate: 14.03 KiB - allocs estimate: 264 - -------------- - minimum time: 1.022 s (0.00% GC) - median time: 1.024 s (0.00% GC) - mean time: 1.024 s (0.00% GC) - maximum time: 1.028 s (0.00% GC) - -------------- - samples: 5 - evals/sample: 1 - - - - -```julia -@benchmark LinearAlgebra.mul!($v1, $EUR_101_bm, $v2) -``` - - - - - BenchmarkTools.Trial: - memory estimate: 0 bytes - allocs estimate: 0 - -------------- - minimum time: 1.187 s (0.00% GC) - median time: 1.188 s (0.00% GC) - mean time: 1.191 s (0.00% GC) - maximum time: 1.197 s (0.00% GC) - -------------- - samples: 5 - evals/sample: 1 + true @@ -342,12 +418,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 662.131 ms (0.00% GC) - median time: 678.294 ms (0.00% GC) - mean time: 701.543 ms (0.00% GC) - maximum time: 790.350 ms (0.00% GC) + minimum time: 842.187 ms (0.00% GC) + median time: 882.157 ms (0.00% GC) + mean time: 913.111 ms (0.00% GC) + maximum time: 1.049 s (0.00% GC) -------------- - samples: 8 + samples: 6 evals/sample: 1 @@ -364,12 +440,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 605.351 ms (0.00% GC) - median time: 618.706 ms (0.00% GC) - mean time: 620.553 ms (0.00% GC) - maximum time: 658.692 ms (0.00% GC) + minimum time: 618.434 ms (0.00% GC) + median time: 630.293 ms (0.00% GC) + mean time: 689.063 ms (0.00% GC) + maximum time: 881.782 ms (0.00% GC) -------------- 
- samples: 9 + samples: 8 evals/sample: 1 @@ -386,12 +462,12 @@ v2_d = adapt(CuArray{Float64}, v2); memory estimate: 3.08 KiB allocs estimate: 118 -------------- - minimum time: 26.926 ms (0.00% GC) - median time: 27.330 ms (0.00% GC) - mean time: 27.494 ms (0.00% GC) - maximum time: 31.492 ms (0.00% GC) + minimum time: 26.710 ms (0.00% GC) + median time: 26.930 ms (0.00% GC) + mean time: 27.350 ms (0.00% GC) + maximum time: 32.306 ms (0.00% GC) -------------- - samples: 182 + samples: 183 evals/sample: 1 @@ -426,10 +502,32 @@ v2 = randn(size(EUR_101, 2)); memory estimate: 128 bytes allocs estimate: 3 -------------- - minimum time: 667.369 ms (0.00% GC) - median time: 681.264 ms (0.00% GC) - mean time: 697.866 ms (0.00% GC) - maximum time: 789.988 ms (0.00% GC) + minimum time: 850.050 ms (0.00% GC) + median time: 862.818 ms (0.00% GC) + mean time: 953.739 ms (0.00% GC) + maximum time: 1.391 s (0.00% GC) + -------------- + samples: 6 + evals/sample: 1 + + + + +```julia +@benchmark LinearAlgebra.mul!($v2, transpose($EUR_101_sla_), $v1) +``` + + + + + BenchmarkTools.Trial: + memory estimate: 128 bytes + allocs estimate: 3 + -------------- + minimum time: 673.269 ms (0.00% GC) + median time: 693.418 ms (0.00% GC) + mean time: 700.729 ms (0.00% GC) + maximum time: 784.130 ms (0.00% GC) -------------- samples: 8 evals/sample: 1 @@ -448,10 +546,10 @@ v2 = randn(size(EUR_101, 2)); memory estimate: 16 bytes allocs estimate: 1 -------------- - minimum time: 612.441 ms (0.00% GC) - median time: 624.015 ms (0.00% GC) - mean time: 641.973 ms (0.00% GC) - maximum time: 770.845 ms (0.00% GC) + minimum time: 620.830 ms (0.00% GC) + median time: 637.698 ms (0.00% GC) + mean time: 656.708 ms (0.00% GC) + maximum time: 798.543 ms (0.00% GC) -------------- samples: 8 evals/sample: 1 diff --git a/src/cuda.jl b/src/cuda.jl index fa2ab4a0..973425b8 100644 --- a/src/cuda.jl +++ b/src/cuda.jl @@ -6,16 +6,29 @@ struct CuSnpArray{T} <: AbstractMatrix{UInt8} model::Union{Val{1}, Val{2}, 
Val{3}} center::Bool scale::Bool + impute::Bool μ::CuVector{T} σinv::CuVector{T} storagev1::CuVector{T} storagev2::CuVector{T} end +""" + CuSnpArray{T}(s; model=ADDITIVE_MODEL, center=false, scale=false, impute=true) + +Copy a `SnpArray` to a CUDA GPU to perform linear algebra operations. + +# Arguments +- s: a `SnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). +- impute: whether to impute missing value with column mean (default: true). +""" function CuSnpArray{T}(s::SnpArray; model = ADDITIVE_MODEL, center::Bool = false, - scale::Bool = false) where T <: AbstractFloat + scale::Bool = false, impute::Bool = false) where T <: AbstractFloat data = adapt(CuMatrix{UInt8}, s.data) if model == ADDITIVE_MODEL @@ -73,7 +86,7 @@ function CuSnpArray{T}(s::SnpArray; σinv = adapt(CuVector{T}, σinv) storagev1 = CuVector{T}(undef, size(s, 1)) storagev2 = CuVector{T}(undef, size(s, 2)) - CuSnpArray{T}(data, s.m, model, center, scale, μ, σinv, storagev1, storagev2) + CuSnpArray{T}(data, s.m, model, center, scale, impute, μ, σinv, storagev1, storagev2) end Base.size(s::CuSnpArray) = s.m, size(s.data, 2) @@ -175,6 +188,107 @@ function _snparray_cuda_atx_recessive!(out, s, v) end end +function _snparray_cuda_ax_additive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij >= 3) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_ax_dominant_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = 
blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_ax_recessive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((i-1) & 3) + block = s[(j-1) * packedstride + ((i-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 3) + (Aij == 1) * μ[j]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_additive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij >= 3) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_dominant_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +function _snparray_cuda_atx_recessive_meanimpute!(out, s, v, μ) + packedstride = size(s, 1) + index_x = (blockIdx().x - 1) * blockDim().x + threadIdx().x + stride_x = blockDim().x * gridDim().x + for i = index_x:stride_x:length(out) + outi = 
zero(eltype(out)) + for j = 1:length(v) + k = 2 * ((j-1) & 3) + block = s[(i-1) * packedstride + ((j-1) >> 2) + 1] + Aij = (block >> k) & 3 + @inbounds outi += ((Aij >= 3) + (Aij == 1) * μ[i]) * v[j] + end + out[i] = outi + end +end + +""" + LinearAlgebra.mul!(out::CuVector{T}, s::CuSnpArray{T}, v::CuVector{T}) + +In-place matrix-vector multiplication on a GPU. +""" function mul!( out::CuVector{T}, s::CuSnpArray{T}, @@ -205,6 +319,11 @@ function mul!( end end +""" + LinearAlgebra.mul!(out::CuVector{T}, s::Union{Transpose{T, CuSnpArray{T}}, Adjoint{T, CuSnpArray{T}}}, v::CuVector{T}) + +In-place matrix-vector multiplication on a GPU, with transposed CuSnpArray. +""" function mul!( out::CuVector{T}, st::Union{Transpose{T, CuSnpArray{T}}, Adjoint{T, CuSnpArray{T}}}, @@ -214,12 +333,22 @@ function mul!( s = st.parent numblocks = ceil(Int, length(out)/256) CUDA.@sync begin - if s.model == ADDITIVE_MODEL - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive!(out, s.data, v) - elseif s.model == DOMINANT_MODEL - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant!(out, s.data, v) + if !s.impute + if s.model == ADDITIVE_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive!(out, s.data, v) + elseif s.model == DOMINANT_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant!(out, s.data, v) + else + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive!(out, s.data, v) + end else - @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive!(out, s.data, v) + if s.model == ADDITIVE_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_additive_meanimpute!(out, s.data, v, s.μ) + elseif s.model == DOMINANT_MODEL + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_dominant_meanimpute!(out, s.data, v, s.μ) + else + @cuda threads=256 blocks=numblocks _snparray_cuda_atx_recessive_meanimpute!(out, s.data, v, s.μ) + end end end diff --git a/src/linalg_bitmatrix.jl b/src/linalg_bitmatrix.jl index 
6c4ca4e6..e83794ae 100644 --- a/src/linalg_bitmatrix.jl +++ b/src/linalg_bitmatrix.jl @@ -10,6 +10,17 @@ struct SnpBitMatrix{T} <: AbstractMatrix{T} storagev2::Vector{T} end +""" + SnpBitMatrix{T}(s; model=ADDITIVE_MODEL, center=false, scale=false) + +Store an `AbstractSnpArray` `s` as two `BitMatrix`es to simplify linear algebra. + +# Arguments +- s: an `AbstractSnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). +""" function SnpBitMatrix{T}( s::AbstractSnpArray; model = ADDITIVE_MODEL, @@ -144,6 +155,11 @@ function mul!(c::Vector{T}, A::Union{Transpose{Bool,BitArray{2}}, Adjoint{Bool, end end +""" + LinearAlgebra.mul!(out, s::SnpBitMatrix, v) + +In-place matrix-vector multiplication. +""" function mul!( out::AbstractVector{T}, s::SnpBitMatrix{T}, @@ -169,6 +185,11 @@ function mul!( end end +""" + LinearAlgebra.mul!(out, s::Union{Transpose{T, SnpBitMatrix{T}}, Adjoint{T, SnpBitMatrix{T}}}, v) + +In-place matrix-vector multiplication, with transposed `SnpBitMatrix`. +""" function mul!( out::AbstractVector{T}, st::Union{Transpose{T, SnpBitMatrix{T}}, Adjoint{T, SnpBitMatrix{T}}}, diff --git a/src/linalg_direct.jl b/src/linalg_direct.jl index 228691c6..25e46876 100644 --- a/src/linalg_direct.jl +++ b/src/linalg_direct.jl @@ -3,17 +3,31 @@ struct SnpLinAlg{T} <: AbstractMatrix{T} model::Union{Val{1}, Val{2}, Val{3}} center::Bool scale::Bool + impute::Bool μ::Vector{T} σinv::Vector{T} storagev1::Vector{T} storagev2::Vector{T} end +""" + SnpLinAlg{T}(s; model=ADDITIVE_MODEL, center=false, scale=false, impute=true) + +Pad a `SnpArray` with some parameters for linear algebra operation. + +# Arguments +- s: a `SnpArray`. +- model: one of `ADDITIVE_MODEL`(default), `DOMINANT_MODEL`, `RECESSIVE_MODEL`. +- center: whether to center (default: false). +- scale: whether to scale to standard deviation 1 (default: false). 
+- impute: whether to impute missing value with column mean (default: true). +""" function SnpLinAlg{T}( s::AbstractSnpArray; model = ADDITIVE_MODEL, center::Bool = false, - scale::Bool = false) where T <: AbstractFloat + scale::Bool = false, + impute::Bool = true) where T <: AbstractFloat if model == ADDITIVE_MODEL if center || scale μ = Vector{T}(undef, size(s, 2)) @@ -67,7 +81,7 @@ function SnpLinAlg{T}( end storagev1 = Vector{T}(undef, size(s, 1)) storagev2 = Vector{T}(undef, size(s, 2)) - SnpLinAlg{T}(s, model, center, scale, μ, σinv, storagev1, storagev2) + SnpLinAlg{T}(s, model, center, scale, impute, μ, σinv, storagev1, storagev2) end Base.size(sla::SnpLinAlg) = size(sla.s) @@ -86,7 +100,8 @@ function _snparray_ax_additive_rem!(out, s, v) end end end -function _snparray_ax_additive!(out, s, v, rows, cols) + +function _snparray_ax_additive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -108,10 +123,6 @@ function _snparray_ax_additive!(out, s, v, rows, cols) out end -function _snparray_ax_additive!(out, s, v) - _snparray_ax_additive!(out, s, v, length(out), length(v)) -end - function _snparray_ax_dominant_rem!(out, s, v) maxp = length(out) @avx for j in eachindex(v) @@ -122,7 +133,8 @@ function _snparray_ax_dominant_rem!(out, s, v) end end end -function _snparray_ax_dominant!(out, s, v, rows, cols) + +function _snparray_ax_dominant!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -144,10 +156,6 @@ function _snparray_ax_dominant!(out, s, v, rows, cols) out end -function _snparray_ax_dominant!(out, s, v) - _snparray_ax_dominant!(out, s, v, length(out), length(v)) -end - function _snparray_ax_recessive_rem!(out, s, v) maxp = length(out) @avx for j in eachindex(v) @@ -158,7 +166,8 @@ function _snparray_ax_recessive_rem!(out, s, v) end end end -function _snparray_ax_recessive!(out, s, v, rows, cols) + +function _snparray_ax_recessive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k 
= rows >> 2 rem = rows & 3 @@ -180,22 +189,127 @@ function _snparray_ax_recessive!(out, s, v, rows, cols) out end -function _snparray_ax_recessive!(out, s, v) - _snparray_ax_recessive!(out, s, v, length(out), length(v)) +function _snparray_ax_additive_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_additive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_additive_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out +end + +function _snparray_ax_dominant_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_dominant_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij >= 2) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_dominant_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out +end + +function _snparray_ax_recessive_meanimpute_rem!(out, s, v, μ) + maxp = length(out) + @avx for j in eachindex(v) + block = s[1, j] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[p] += ((Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + end +end + +function _snparray_ax_recessive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows 
>> 2 + rem = rows & 3 + + @avx for j ∈ 1:cols + for l in 1:k + block = s[l, j] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[4*(l-1)+p] += ((Aij == 3) + (Aij == 1) * μ[j]) * v[j] + end + + end + end + if rem != 0 + _snparray_ax_recessive_meanimpute_rem!(@view(out[4k+1:end]), @view(s[k+1:k+1, :]), v, μ) + end + out end -function _snparray_ax_tile!(c, A, b, model) +function _snparray_ax_tile!(c, A, b, model, μ, impute) vstep = 1024 hstep = 1024 vstep_log2 = 10 hstep_log2 = 10 - if model == ADDITIVE_MODEL - _ftn! = _snparray_ax_additive! - elseif model == DOMINANT_MODEL - _ftn! = _snparray_ax_dominant! + if !impute + if model == ADDITIVE_MODEL + _ftn! = _snparray_ax_additive! + elseif model == DOMINANT_MODEL + _ftn! = _snparray_ax_dominant! + else + _ftn! = _snparray_ax_recessive! + end else - _ftn! = _snparray_ax_recessive! + if model == ADDITIVE_MODEL + _ftn! = _snparray_ax_additive_meanimpute! + elseif model == DOMINANT_MODEL + _ftn! = _snparray_ax_dominant_meanimpute! + else + _ftn! = _snparray_ax_recessive_meanimpute! 
+ end end fill!(c, zero(UInt8)) @@ -213,7 +327,7 @@ function _snparray_ax_tile!(c, A, b, model) gesp(stridedpointer(A), (vstep*m, hstep*n)), gesp(stridedpointer(b), (hstep*n,)), vstep << 2, - hstep + hstep, μ ) end if Mrem != 0 @@ -221,61 +335,67 @@ function _snparray_ax_tile!(c, A, b, model) @view(A[vstep*(Miter)+1:end, hstep*n+1:hstep*(n+1)]), @view(b[hstep*n+1:hstep*(n+1)]), length(c) - 4*vstep*Miter, - hstep + hstep, μ ) end end if Nrem != 0 _ftn!(c, @view(A[:, (hstep*Niter+1):end]), @view(b[hstep*Niter+1:end]), - length(c), Nrem + length(c), Nrem, μ ) end end -function mul!( - out::AbstractVector{T}, - sla::SnpLinAlg{T}, - v::AbstractVector{T}) where T <: AbstractFloat - @assert length(out) == size(sla, 1) && length(v) == size(sla, 2) - if sla.scale - sla.storagev2 .= sla.σinv .* v - w = sla.storagev2 - else - w = v +function _snparray_atx_additive_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 3)) * v[p] + end end - fill!(out, zero(eltype(out))) - s = sla.s +end - _snparray_ax_tile!(out, s.data, w, sla.model) - # if sla.model == ADDITIVE_MODEL - # _snparray_ax_additive!(out, s.data, w) - # elseif sla.model == DOMINANT_MODEL - # _snparray_ax_dominant!(out, s.data, w) - # else - # _snparray_ax_recessive!(out, s.data, w) - # end +function _snparray_atx_additive!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 3)) * v[4*(l-1)+p] + end - if sla.center - return out .-= dot(sla.μ, w) - else - return out + end + end + if rem != 0 + _snparray_atx_additive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end + out end +function _snparray_atx_additive!(out, s, v, μ) + _snparray_atx_additive!(out, s, v, length(v), length(out), μ) +end -function 
_snparray_atx_additive_rem!(out, s, v) +function _snparray_atx_dominant_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += ((Aij >= 2) + (Aij == 3)) * v[p] + out[i] += (Aij >= 2) * v[p] end end end -function _snparray_atx_additive!(out, s, v, rows, cols) +function _snparray_atx_dominant!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -286,33 +406,33 @@ function _snparray_atx_additive!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += ((Aij >= 2) + (Aij == 3)) * v[4*(l-1)+p] + out[i] += (Aij >= 2) * v[4*(l-1)+p] end end end if rem != 0 - _snparray_atx_additive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_dominant_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function _snparray_atx_additive!(out, s, v) - _snparray_atx_additive!(out, s, v, length(v), length(out)) +function _snparray_atx_dominant!(out, s, v, μ) + _snparray_atx_dominant!(out, s, v, length(v), length(out), μ) end -function _snparray_atx_dominant_rem!(out, s, v) +function _snparray_atx_recessive_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += (Aij >= 2) * v[p] + out[i] += (Aij == 3) * v[p] end end end -function _snparray_atx_dominant!(out, s, v, rows, cols) +function _snparray_atx_recessive!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -323,33 +443,35 @@ function _snparray_atx_dominant!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += (Aij >= 2) * v[4*(l-1)+p] + out[i] += (Aij == 3) * v[4*(l-1)+p] end end + end if rem != 0 - _snparray_atx_dominant_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_recessive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function _snparray_atx_dominant!(out, s, v) - 
_snparray_atx_dominant!(out, s, v, length(v), length(out)) +function _snparray_atx_recessive!(out, s, v, μ) + _snparray_atx_recessive!(out, s, v, length(v), length(out), μ) end -function _snparray_atx_recessive_rem!(out, s, v) + +function _snparray_atx_additive_meanimpute_rem!(out, s, v, μ) maxp = length(v) @avx for i in eachindex(out) block = s[1, i] for p in 1:maxp Aij = (block >> (2*(p-1))) & 3 - out[i] += (Aij == 3) * v[p] + out[i] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[i]) * v[p] end end end -function _snparray_atx_recessive!(out, s, v, rows, cols) +function _snparray_atx_additive_meanimpute!(out, s, v, rows, cols, μ) #fill!(out, zero(Float64)) k = rows >> 2 rem = rows & 3 @@ -360,22 +482,129 @@ function _snparray_atx_recessive!(out, s, v, rows, cols) for p in 1:4 Aij = (block >> (2 *(p-1))) & 3 - out[i] += (Aij == 3) * v[4*(l-1)+p] + out[i] += ((Aij >= 2) + (Aij == 3) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] end end + end + if rem != 0 + _snparray_atx_additive_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) + end + out +end + +function _snparray_atx_additive_meanimpute!(out, s, v, μ) + _snparray_atx_additive_meanimpute!(out, s, v, length(v), length(out), μ) +end + +function _snparray_atx_dominant_meanimpute_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[p] + end + end +end +function _snparray_atx_dominant_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij >= 2) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] + end + + end end if rem != 0 - _snparray_atx_recessive_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end])) + _snparray_atx_dominant_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) end out end -function 
_snparray_atx_recessive!(out, s, v) - _snparray_atx_recessive!(out, s, v, length(v), length(out)) +function _snparray_atx_dominant_meanimpute!(out, s, v, μ) + _snparray_atx_dominant_meanimpute!(out, s, v, length(v), length(out), μ) end +function _snparray_atx_recessive_meanimpute_rem!(out, s, v, μ) + maxp = length(v) + @avx for i in eachindex(out) + block = s[1, i] + for p in 1:maxp + Aij = (block >> (2*(p-1))) & 3 + out[i] += ((Aij == 3) + (Aij == 1) * μ[i]) * v[p] + end + end +end + +function _snparray_atx_recessive_meanimpute!(out, s, v, rows, cols, μ) + #fill!(out, zero(Float64)) + k = rows >> 2 + rem = rows & 3 + + @avx for i ∈ 1:cols + for l in 1:k + block = s[l, i] + + for p in 1:4 + Aij = (block >> (2 *(p-1))) & 3 + out[i] += ((Aij == 3) + (Aij == 1) * μ[i]) * v[4*(l-1)+p] + end + + end + + end + if rem != 0 + _snparray_atx_recessive_meanimpute_rem!(out, @view(s[k+1:k+1, :]), @view(v[4k+1:end]), μ) + end + out +end + +function _snparray_atx_recessive_meanimpute!(out, s, v, μ) + _snparray_atx_recessive_meanimpute!(out, s, v, length(v), length(out), μ) +end + +""" + LinearAlgebra.mul!(out, s::SnpLinAlg, v) + +In-place matrix-vector multiplication. +""" +function mul!( + out::AbstractVector{T}, + sla::SnpLinAlg{T}, + v::AbstractVector{T}) where T <: AbstractFloat + @assert length(out) == size(sla, 1) && length(v) == size(sla, 2) + if sla.scale + sla.storagev2 .= sla.σinv .* v + w = sla.storagev2 + else + w = v + end + fill!(out, zero(eltype(out))) + s = sla.s + + _snparray_ax_tile!(out, s.data, w, sla.model, sla.μ, sla.impute) + + if sla.center + return out .-= dot(sla.μ, w) + else + return out + end +end + +""" + LinearAlgebra.mul!(out, s::Union{Transpose{T, SnpLinAlg{T}}, Adjoint{T, SnpLinAlg{T}}}, v) + +In-place matrix-vector multiplication, with transposed `SnpLinAlg`. 
+""" function mul!( out::AbstractVector{T}, st::Union{Transpose{T, SnpLinAlg{T}}, Adjoint{T, SnpLinAlg{T}}}, @@ -384,13 +613,25 @@ function mul!( sla = st.parent s = sla.s fill!(out, zero(eltype(out))) - if sla.model == ADDITIVE_MODEL - _snparray_atx_additive!(out, s.data, v) - elseif sla.model == DOMINANT_MODEL - _snparray_atx_dominant!(out, s.data, v) + if !sla.impute + if sla.model == ADDITIVE_MODEL + _ftn! = _snparray_atx_additive! + elseif sla.model == DOMINANT_MODEL + _ftn! = _snparray_atx_dominant! + else + _ftn! = _snparray_atx_recessive! + end else - _snparray_atx_recessive!(out, s.data, v) - end + if sla.model == ADDITIVE_MODEL + _ftn! = _snparray_atx_additive_meanimpute! + elseif sla.model == DOMINANT_MODEL + _ftn! = _snparray_atx_dominant_meanimpute! + else + _ftn! = _snparray_atx_recessive_meanimpute! + end + end + + _ftn!(out, s.data, v, sla.μ) if sla.center out .-= sum(v) .* sla.μ end