# Test and Benchmark

In [1]:
# Record the Julia build and machine that produced the timings in this notebook.
versioninfo()

Julia Version 0.6.0
Commit 903644385b (2017-06-19 13:05 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin13.4.0)
  CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, skylake)


## Haplotyping on complete genotype data

In [2]:
using NullableArrays, BenchmarkTools, MendelImpute

# Simulated complete-data problem, seeded for reproducibility:
# H is d-by-p with 0/1 entries, X is n-by-p with 0/1/2 entries.
srand(123)
n, p, d = 5000, 500, 500
H = Float32.(rand(0:1, d, p))
X = Float32.(rand(0:2, n, p))

# Work arrays handed to the in-place routines further down.
uniqhap  = zeros(Bool, d)
permhap  = zeros(Int, d)
happair  = (zeros(Int, n), zeros(Int, n))

# M starts as H * H', so M[k, k] = ||h_k||^2.  The strict upper triangle is
# then overwritten with 2 h_i'h_j + ||h_i||^2 + ||h_j||^2 = ||h_i + h_j||^2.
# This pass must run before the diagonal is rescaled because it reads the
# unscaled diagonal entries.
M = A_mul_Bt(H, H)
for col in 1:d, row in 1:(col - 1)
    M[row, col] = 2M[row, col] + M[row, row] + M[col, col]
end
# Diagonal becomes 4 ||h_k||^2 = ||h_k + h_k||^2.
for k in 1:d
    M[k, k] = M[k, k] * 4
end

# N = 2 * X * H', i.e. N[s, k] = 2 x_s'h_k.
N = A_mul_Bt(X, H)
N .*= 2

hapscore = zeros(eltype(N), n)
@show typeof(M), typeof(N)
@time haplopair!(happair, hapscore, M, N)

(typeof(M), typeof(N)) = (Array{Float32,2}, Array{Float32,2})
 



In [3]:
# Display the pair of index vectors filled in by the haplopair! call above.
happair

([227, 205, 201, 254, 267, 40, 170, 87, 27, 377  …  257, 81, 194, 257, 258, 106, 203, 100, 243, 208], [358, 454, 245, 442, 344, 138, 218, 262, 278, 449  …  442, 208, 301, 442, 338, 396, 387, 158, 482, 407])

In [4]:
# Display the per-sample scores filled in by the haplopair! call above.
hapscore

5000-element Array{Float32,1}:
 -371.0
 -304.0
 -408.0
 -428.0
 -392.0
 -339.0
 -423.0
 -388.0
 -360.0
 -316.0
 -416.0
 -435.0
 -376.0
    ⋮  
 -347.0
 -366.0
 -367.0
 -369.0
 -386.0
 -357.0
 -443.0
 -343.0
 -377.0
 -344.0
 -352.0
 -376.0

In [5]:
# Non-mutating call on the raw matrices; the result is displayed as a
# ((index vector, index vector), score vector) tuple.
# NOTE(review): the scores shown here differ in sign/offset from `hapscore`
# above — presumably this method adds back a per-sample constant; confirm in
# MendelImpute.
haplopair(X, H)

(([227, 205, 201, 254, 267, 40, 170, 87, 27, 377  …  257, 81, 194, 257, 258, 106, 203, 100, 243, 208], [358, 454, 245, 442, 344, 138, 218, 262, 278, 449  …  442, 208, 301, 442, 338, 396, 387, 158, 482, 407]), Float32[474.0, 486.0, 485.0, 475.0, 453.0, 470.0, 457.0, 460.0, 463.0, 479.0  …  462.0, 452.0, 465.0, 466.0, 458.0, 441.0, 457.0, 471.0, 456.0, 470.0])

In [6]:
# Benchmark the search on the precomputed M and N.
# The arguments are non-constant globals, so they are spliced in with `$`;
# without interpolation the measurement also includes global-variable lookup
# and dynamic dispatch (see the BenchmarkTools manual).
@benchmark haplopair!($happair, $hapscore, $M, $N)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     480.226 ms (0.00% GC)
  median time:      516.202 ms (0.00% GC)
  mean time:        515.118 ms (0.00% GC)
  maximum time:     555.477 ms (0.00% GC)
  --------------
  samples:          10
  evals/sample:     1

In [7]:
# Drop samples left over from any earlier profiling run, profile a single
# call, and print the samples as a flat (non-nested) table.
Profile.clear()
@profile haplopair!(happair, hapscore, M, N)
Profile.print(format=:flat)

 Count File                        Line Function                               
   425 ./<missing>                   -1 anonymous                              
     2 ./abstractarray.jl           572 copy!(::Array{Any,1}, ::Core.Infere... 
     2 ./inference.jl              1882 abstract_call(::Any, ::Array{Any,1}... 
     2 ./inference.jl              1401 abstract_call_gf_by_type(::Any, ::A... 
     4 ./inference.jl              1935 abstract_eval(::Any, ::Array{Any,1}... 
     2 ./inference.jl              1886 abstract_eval_call(::Expr, ::Array{... 
     2 ./inference.jl              1912 abstract_eval_call(::Expr, ::Array{... 
     1 ./inference.jl              3832 inlineable(::Any, ::Any, ::Expr, ::... 
     1 ./inference.jl              4329 inlining_pass!(::Core.Inference.Inf... 
     1 ./inference.jl              4391 inlining_pass(::Expr, ::Core.Infere... 
     1 ./inference.jl              4492 inlining_pass(::Expr, ::Core.Infere... 
     1 ./inference.jl              2955 

In [8]:
#@code_warntype haplopair!(X, H, M, N, uniqhap, permhap, happair, hapscore)

In [9]:
# Time the eight-argument method, which receives the raw X and H together with
# all work arrays.
# NOTE(review): presumably this method rebuilds M and N from X and H itself
# (the profile further down shows A_mul_Bt! calls) — confirm in MendelImpute.
@time haplopair!(X, H, M, N, uniqhap, permhap, happair, hapscore)

  

In [10]:
# Benchmark the eight-argument method.
# The arguments are non-constant globals, so they are spliced in with `$`;
# without interpolation the measurement also includes global-variable lookup
# and dynamic dispatch (see the BenchmarkTools manual).
@benchmark haplopair!($X, $H, $M, $N, $uniqhap, $permhap, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  40.16 KiB
  allocs estimate:  518
  --------------
  minimum time:     844.036 ms (0.00% GC)
  median time:      860.695 ms (0.00% GC)
  mean time:        861.094 ms (0.00% GC)
  maximum time:     879.066 ms (0.00% GC)
  --------------
  samples:          6
  evals/sample:     1

In [11]:
# Drop samples left over from the previous profiling run, profile a single
# call of the eight-argument method, and print the samples as a flat table.
Profile.clear()
@profile haplopair!(X, H, M, N, uniqhap, permhap, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
   622 ./<missing>                   -1 anonymous                              
     1 ./linalg/blas.jl            1027 gemm!(::Char, ::Char, ::Float32, ::... 
     2 ./linalg/matmul.jl           191 A_mul_Bt!                              
     1 ./linalg/matmul.jl           240 copytri!(::Array{Float32,2}, ::Char... 
     1 ./linalg/matmul.jl           367 gemm_wrapper!(::Array{Float32,2}, :... 
     1 ./linalg/matmul.jl           293 syrk_wrapper!(::Array{Float32,2}, :... 
   622 ./loading.jl                 515 include_string(::String, ::String)     
   622 ./profile.jl                  23 macro expansion                        
     1 ./simdloop.jl                 73 macro expansion                        
   424 ./subarray.jl                185 getindex                               
   622 ./task.jl                    335 (::IJulia.##11#14)()                   
   622 ...IJulia/src/eventloop.jl     8 

## Haplotyping on incomplete genotype data



In [12]:
using MendelImpute, NullableArrays, BenchmarkTools

# Simulated incomplete-data problem, seeded for reproducibility:
# H is d-by-p with 0/1 entries, X is n-by-p with 0/1/2 entries.
srand(123)
missingprop = 0.1
n, p, d = 5000, 1000, 500
H = Float32.(rand(0:1, d, p))
X = Float32.(rand(0:2, n, p))

# Zero-initialised work arrays passed to haploimpute!.
M = zeros(Float32, d, d)
N = zeros(Float32, n, d)
uniqhap  = fill(false, d)
permhap  = fill(0, d)
happair  = (zeros(Int, n), zeros(Int, n))
hapscore = zeros(eltype(N), n)

# Flag entries as null with a random dense Bool mask of density `missingprop`.
Xm = NullableArray(X, full(sprand(Bool, n, p, missingprop)))
@time haploimpute!(Xm, H, M, N, uniqhap, permhap, happair, hapscore)

  

In [13]:
# Benchmark imputation on the simulated data.
# The arguments are non-constant globals, so they are spliced in with `$`;
# without interpolation the measurement also includes global-variable lookup
# and dynamic dispatch (see the BenchmarkTools manual).
@benchmark haploimpute!($Xm, $H, $M, $N, $uniqhap, $permhap, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  40.16 KiB
  allocs estimate:  518
  --------------
  minimum time:     916.088 ms (0.00% GC)
  median time:      951.960 ms (0.00% GC)
  mean time:        963.737 ms (0.00% GC)
  maximum time:     1.045 s (0.00% GC)
  --------------
  samples:          6
  evals/sample:     1

In [14]:
# Drop samples left over from the previous profiling run, profile a single
# haploimpute! call on the simulated data, and print the samples as a flat table.
Profile.clear()
@profile haploimpute!(Xm, H, M, N, uniqhap, permhap, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
   717 ./<missing>                   -1 anonymous                              
     1 ./abstractarray.jl          1512 lexcmp(::SubArray{Float32,1,Array{F... 
     1 ./linalg/blas.jl            1027 gemm!(::Char, ::Char, ::Float32, ::... 
     1 ./linalg/matmul.jl           191 A_mul_Bt!                              
     1 ./linalg/matmul.jl           367 gemm_wrapper!(::Array{Float32,2}, :... 
   717 ./loading.jl                 515 include_string(::String, ::String)     
     1 ./ordering.jl                 57 lt                                     
   717 ./profile.jl                  23 macro expansion                        
     1 ./simdloop.jl                 73 macro expansion                        
     1 ./sort.jl                    660 #sortperm!#12(::Base.Sort.QuickSort... 
     1 ./sort.jl                    252 sort!(::Array{Int64,1}, ::Int64, ::... 
     1 ./sort.jl                    323 

## `AFRped.txt` data



In [15]:
#;ls -al

In [16]:
## cut out the first 6 fields in the pedigree file
#;cut -d ',' -f7- AFRped.txt > AFRped_geno.txt

In [17]:
#;ls -al

In [18]:
# Load the comma-separated genotype file as a Float32 matrix.
rawdata = readdlm("AFRped_geno.txt", ',', Float32)

1325×36499 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0 

In [19]:
# The first `people` rows of rawdata are used as the genotype matrix X.
# NOTE(review): 664 is hard-coded (rawdata has 1325 rows) and the last column
# is dropped without explanation — confirm both against the data file.
people = 664
X = rawdata[1:people, 1:(size(rawdata, 2) - 1)]

664×36498 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  

In [20]:
# The rows after the first `people` rows (last column dropped) are halved to
# form H: a right shift by one bit for integer data, a division by 2 otherwise.
H = let haprows = rawdata[(people + 1):end, 1:(end - 1)]
    if eltype(rawdata) <: Integer
        haprows .>> 1
    else
        haprows / 2
    end
end

661×36498 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  

### Phase + impute a fixed window

In [21]:
using BenchmarkTools, MendelImpute, NullableArrays

# Restrict both matrices to the first 1200 columns, then flag a random
# fraction `missingprop` of the genotype entries as null (seeded for
# reproducibility).
srand(123)
missingprop = 0.1
X1 = X[:, 1:1200]
H1 = H[:, 1:1200]
Xm = NullableArray(X1, full(sprand(Bool, size(X1)..., missingprop)))
# True values at the masked positions, kept for the error computation.
missing_true = round.(Int, X1[Xm.isnull])

79240-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [22]:
# Allocate zero-initialised work arrays sized for the window, impute the
# masked entries in place, and report the fraction of masked entries whose
# rounded imputed value differs from the truth.
n, d = size(X1, 1), size(H1, 1)
M = zeros(eltype(H), d, d)
N = zeros(eltype(X), n, d)
uniqhap = zeros(Bool, d)
permhap = zeros(Int, d)
happair  = zeros(Int, n), zeros(Int, n)
hapscore = zeros(eltype(N), n)
@time haploimpute!(Xm, H1, M, N, uniqhap, permhap, happair, hapscore)
missing_impute = round.(Int, Xm.values[Xm.isnull])
# Named `errorrate` rather than `error` so the global does not shadow
# `Base.error` for the rest of the session.
errorrate = countnz(missing_true .≠ missing_impute) / length(missing_true)

  0.240438 seconds (683 allocations: 52.859 KiB)


0.0033064109035840484

In [23]:
# Benchmark imputation on the fixed window.
# The arguments are non-constant globals, so they are spliced in with `$`;
# without interpolation the measurement also includes global-variable lookup
# and dynamic dispatch (see the BenchmarkTools manual).
@benchmark haploimpute!($Xm, $H1, $M, $N, $uniqhap, $permhap, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  52.70 KiB
  allocs estimate:  679
  --------------
  minimum time:     193.186 ms (0.00% GC)
  median time:      199.815 ms (0.00% GC)
  mean time:        202.529 ms (0.00% GC)
  maximum time:     219.272 ms (0.00% GC)
  --------------
  samples:          25
  evals/sample:     1

In [24]:
# Drop samples left over from the previous profiling run, profile a single
# fixed-window haploimpute! call, and print the samples as a flat table.
Profile.clear()
@profile haploimpute!(Xm, H1, M, N, uniqhap, permhap, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
   153 ./<missing>                   -1 anonymous                              
     1 ./abstractarray.jl          1504 isequal(::SubArray{Float32,1,Array{... 
     3 ./abstractarray.jl          1512 lexcmp(::SubArray{Float32,1,Array{F... 
     3 ./abstractarray.jl          1513 lexcmp(::SubArray{Float32,1,Array{F... 
     3 ./abstractarray.jl           759 next                                   
     3 ./iterators.jl               184 next                                   
     1 ./linalg/blas.jl            1207 syrk!(::Char, ::Char, ::Float32, ::... 
     1 ./linalg/matmul.jl           191 A_mul_Bt!                              
     1 ./linalg/matmul.jl           293 syrk_wrapper!(::Array{Float32,2}, :... 
   153 ./loading.jl                 515 include_string(::String, ::String)     
     3 ./operators.jl               267 lexcmp                                 
     6 ./ordering.jl                 57 

## Phase + impute by moving window (using flanking windows)

In [25]:
using MendelImpute, NullableArrays

# Take a fresh copy of the genotype rows from rawdata, flag a random fraction
# `missingprop` of the entries as null (seeded for reproducibility), and keep
# the true values at the masked positions for the error computation.
srand(123)
missingprop = 0.1
X = rawdata[1:people, 1:(end - 1)]
Xm = NullableArray(X, full(sprand(Bool, size(X)..., missingprop)))
missing_true = round.(Int, X[Xm.isnull])

# Impute over the whole matrix with window width 800.
@time haploimpute!(Xm, H, 800)

Imputing SNPs 1:800
Imputing SNPs 801:1600
Imputing SNPs 1601:2400
Imputing SNPs 2401:3200
Imputing SNPs 3201:4000
Imputing SNPs 4001:4800
Imputing SNPs 4801:5600
Imputing SNPs 5601:6400
Imputing SNPs 6401:7200
Imputing SNPs 7201:8000
Imputing SNPs 8001:8800
Imputing SNPs 8801:9600
Imputing SNPs 9601:10400
Imputing SNPs 10401:11200
Imputing SNPs 11201:12000
Imputing SNPs 12001:12800
Imputing SNPs 12801:13600
Imputing SNPs 13601:14400
Imputing SNPs 14401:15200
Imputing SNPs 15201:16000
Imputing SNPs 16001:16800
Imputing SNPs 16801:17600
Imputing SNPs 17601:18400
Imputing SNPs 18401:19200
Imputing SNPs 19201:20000
Imputing SNPs 20001:20800
Imputing SNPs 20801:21600
Imputing SNPs 21601:22400
Imputing SNPs 22401:23200
Imputing SNPs 23201:24000
Imputing SNPs 24001:24800
Imputing SNPs 24801:25600
Imputing SNPs 25601:26400
Imputing SNPs 26401:27200
Imputing SNPs 27201:28000
Imputing SNPs 28001:28800
Imputing SNPs 28801:29600
Imputing SNPs 29601:30400
Imputing SNPs 30401:31200
Imputing SNPs 31

In [26]:
# Fraction of masked entries whose rounded imputed value differs from the truth.
missing_impute = round.(Int, Xm.values[Xm.isnull])
# Named `errorrate` rather than `error` so the global does not shadow
# `Base.error` for the rest of the session.
errorrate = countnz(missing_true .≠ missing_impute) / length(missing_true)

0.0069078088435298385

In [27]:
# Drop samples left over from the previous profiling run, profile one
# moving-window haploimpute! call with width 400, and print the samples as a
# flat table.
Profile.clear()
@profile haploimpute!(Xm, H, 400)
Profile.print(format=:flat)

Imputing SNPs 1:400
Imputing SNPs 401:800
Imputing SNPs 801:1200
Imputing SNPs 1201:1600
Imputing SNPs 1601:2000
Imputing SNPs 2001:2400
Imputing SNPs 2401:2800
Imputing SNPs 2801:3200
Imputing SNPs 3201:3600
Imputing SNPs 3601:4000
Imputing SNPs 4001:4400
Imputing SNPs 4401:4800
Imputing SNPs 4801:5200
Imputing SNPs 5201:5600
Imputing SNPs 5601:6000
Imputing SNPs 6001:6400
Imputing SNPs 6401:6800
Imputing SNPs 6801:7200
Imputing SNPs 7201:7600
Imputing SNPs 7601:8000
Imputing SNPs 8001:8400
Imputing SNPs 8401:8800
Imputing SNPs 8801:9200
Imputing SNPs 9201:9600
Imputing SNPs 9601:10000
Imputing SNPs 10001:10400
Imputing SNPs 10401:10800
Imputing SNPs 10801:11200
Imputing SNPs 11201:11600
Imputing SNPs 11601:12000
Imputing SNPs 12001:12400
Imputing SNPs 12401:12800
Imputing SNPs 12801:13200
Imputing SNPs 13201:13600
Imputing SNPs 13601:14000
Imputing SNPs 14001:14400
Imputing SNPs 14401:14800
Imputing SNPs 14801:15200
Imputing SNPs 15201:15600
Imputing SNPs 15601:16000
Imputing SNPs 16

### Effect of window size on accuracy and efficiency

In [28]:
# Sweep the window width and record, for each width, the run time of
# haploimpute! and the resulting imputation error rate.
width_range = 100:100:1200
timing = zeros(length(width_range))
errate = zeros(length(width_range))
for i in eachindex(width_range)
    # Restore the original genotype values before each run; this reset is
    # deliberately kept outside the timed region so it does not inflate the
    # measurement.
    copy!(Xm.values, rawdata[1:people, 1:(end - 1)])
    # NOTE(review): the trailing arguments (1, 1e-3, false) are passed through
    # unchanged; their meaning is not visible here — confirm against the
    # haploimpute! signature.
    timing[i] = @elapsed haploimpute!(Xm, H, width_range[i], 1, 1e-3, false)
    missing_impute = round.(Int, Xm.values[Xm.isnull])
    # Fraction of masked entries whose rounded imputed value differs from the truth.
    errate[i] = countnz(missing_true .≠ missing_impute) / length(missing_true)
    println("width = $(width_range[i]), time = $(timing[i]), error rate = $(errate[i])")
end

width = 100, time = 22.351066505, error rate = 0.008510803299875053
width = 200, time = 21.350882231, error rate = 0.004886121805303412
width = 300, time = 19.898207695, error rate = 0.004359765416652744
width = 400, time = 18.010435937, error rate = 0.004465366698388301
width = 500, time = 14.574663724, error rate = 0.004874571665113585
width = 600, time = 13.338330273, error rate = 0.00557541767163199
width = 700, time = 11.927481581, error rate = 0.0061009490502690975
width = 800, time = 10.181323544, error rate = 0.0069078088435298385
width = 900, time = 10.225013411, error rate = 0.007566991844363511
width = 1000, time = 8.439951836, error rate = 0.008076848032742997
width = 1100, time = 8.904734978, error rate = 0.00872860594345464


In [29]:
using Plots
# Select the GR plotting backend.
gr()

# Run time of each sweep iteration against the window width.
plot(width_range, timing; xlabel = "Window width", ylabel = "Run time in seconds")

In [30]:
# Imputation error rate of each sweep iteration against the window width.
plot(width_range, errate; xlabel = "Window width", ylabel = "Error rate")