Skip to content

Commit

Permalink
bug fix by @juhkim111: in REAP method, when inferred minor allele fre…
Browse files Browse the repository at this point in the history
…quency is <=0.01, the genotype is treated as missing; after this fix, ACCORD kinship estimates are much more reasonable
  • Loading branch information
Hua-Zhou committed Aug 18, 2020
1 parent 4c2328b commit 18ddcd9
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 14 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Expand Down
4 changes: 2 additions & 2 deletions src/SnpArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ __precompile__()
module SnpArrays

using CodecZlib, CodecXz, CodecBzip2, CodecZstd, TranscodingStreams
using Glob, LinearAlgebra, Missings, Mmap, SparseArrays, Statistics, StatsBase, LoopVectorization
using Requires, Adapt
using Adapt, Glob, LinearAlgebra, LoopVectorization, Missings, Mmap, Printf
using Requires, SparseArrays, Statistics, StatsBase
import Base: IndexStyle, convert, copyto!, eltype, getindex, setindex!, length, size, wait
import DataFrames: DataFrame, rename!, eachrow
import DelimitedFiles: readdlm, writedlm
Expand Down
38 changes: 26 additions & 12 deletions src/admixture.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ end
end

"""
Base.copyto!(v, s, P, Q; model=ADDITIVE_MODEL, center=false, scale=false)
Base.copyto!(v, s, P, Q; model=ADDITIVE_MODEL, center=false, scale=false)
Copy SnpArray `s` to numeric vector or matrix `v`. Impute missing genotypes
Copy SnpArray `s` to a numeric vector or matrix `v`. Impute missing genotypes
according to ADMIXTURE estimates `P` (population A2 allele frequencies of SNPs)
and `Q` (population fractions of individuals). Columns of `P` should match columns of
`s`. Columns of `Q` should match rows of `s`.
`s`. Columns of `Q` should match rows of `s`. Note if the inferred minor allele
frequency from `P` and `Q` is ≤0.01, that genotype will be imputed according to
the inferred minor allele frequency.
# Positional arguments
- `v`: output vector or array.
Expand Down Expand Up @@ -52,7 +54,7 @@ function Base.copyto!(
μij = SnpArrays.convert(T1, a2freq, model)
σij = model == ADDITIVE_MODEL ? sqrt(μij * (1 - μij / 2)) : sqrt(μij * (1 - μij))
end
v[i, j] = isnan(vij) ? μij : vij
v[i, j] = (isnan(vij) || a2freq 0.01 || a2freq 0.99) ? μij : vij
center && (v[i, j] -= μij)
scale && (v[i, j] /= σij)
end
Expand All @@ -77,17 +79,29 @@ function grm_admixture(
::Type{T} = Float64
) where T <: AbstractFloat
m, n = size(s)
G = Mmap.mmap(Matrix{T}, m, n) # slightly faster than G = Matrix{T}(undef, m, n)
# convert genotype
tic = time()
G = Mmap.mmap(Matrix{T}, m, n) # slightly faster than G = Matrix{T}(undef, m, n)
Base.copyto!(G, s, P, Q, model=ADDITIVE_MODEL, center=true, scale=true)
Φ = G * transpose(G) # m x m
# M = sparse matrix of missing positions
M = missingpos(s)
Mrs = sum(M, dims=2)
@printf("convert genotype: %.2f seconds\n", time() - tic)
# GG'
tic = time()
Φ = G * transpose(G) # m x m
@printf("Φ = GG': %.2f seconds\n", time() - tic)
# convert G to {0,1} matrix
tic = time()
@inbounds for (idx, g) in enumerate(G)
G[idx] = ifelse(iszero(g), T(0), T(1))
# iszero(g) || (G[idx] = T(1))
end
@printf("convert G to {0,1} matrix: %.2f seconds\n", time() - tic)
# Sij = # SNPs observed in both individuals i and j
# S = (1m * 1n^T - M)(1n * 1m^T - M^T) = n 1m1m^T - (M 1n) 1m^T - 1m (M 1n)^T + MM^T
S = M * transpose(M)
tic = time()
S = G * transpose(G)
@printf("S = GG': %.2f seconds\n", time() - tic)
# Φ /= 2S
@inbounds for j in 1:m, i in 1:j
Φ[i, j] /= 2(S[i, j] + n - Mrs[i] - Mrs[j])
Φ[i, j] /= 2S[i, j]
end
copytri!(Φ, 'U')
end # function grm_admixture

0 comments on commit 18ddcd9

Please sign in to comment.