# Test and Benchmark

In [1]:
# Print the Julia version, platform, and linked-library details for this session.
versioninfo()

Julia Version 0.6.0
Commit 903644385b (2017-06-19 13:05 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin13.4.0)
  CPU: Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, skylake)


## Haplotyping on complete genotype data

In [3]:
using NullableArrays, BenchmarkTools, MendelImpute

# Fix the RNG seed so the simulated data are reproducible.
srand(123)
n, p, d = 10000, 1000, 50
# H is d × p with entries drawn from 0:1; X is n × p with entries drawn from 0:2.
H = convert(Matrix{Float32}, rand(0:1, d, p))
X = convert(Matrix{Float32}, rand(0:2, n, p))

# Start from the Gram matrix H * Hᵀ, then overwrite the strict upper triangle:
# entry (i, j) becomes 2hᵢᵀhⱼ + hᵢᵀhᵢ + hⱼᵀhⱼ = ‖hᵢ + hⱼ‖², where hᵢ is row i of H.
# The diagonal must still hold the unscaled hᵢᵀhᵢ during this pass.
# The strict lower triangle is left as H * Hᵀ.
M = A_mul_Bt(H, H)
for j in 1:d, i in 1:(j - 1)
    M[i, j] = 2M[i, j] + M[i, i] + M[j, j]
end
# Only after the off-diagonal pass, scale the diagonal: 4hᵢᵀhᵢ = ‖hᵢ + hᵢ‖².
for k in 1:d
    M[k, k] = 4M[k, k]
end

# N = 2 * X * Hᵀ, with the doubling applied in place.
N = A_mul_Bt(X, H)
N .*= 2

# Output buffers: a pair of index vectors and one score per row of N.
happair = (zeros(Int, n), zeros(Int, n))
hapscore = zeros(eltype(N), n)
@time haplopair!(happair, hapscore, M, N)

  0.012692 seconds (4 allocations: 160 bytes)


In [4]:
# Show the pair of index vectors that haplopair! filled in above.
happair

([1, 17, 8, 37, 5, 10, 1, 14, 3, 3  …  16, 23, 7, 25, 3, 13, 5, 3, 3, 11], [16, 48, 12, 39, 21, 30, 42, 18, 18, 18  …  35, 26, 47, 50, 39, 35, 24, 17, 18, 43])

In [5]:
# Show the scores that haplopair! wrote into hapscore (one per row of N).
hapscore

10000-element Array{Float32,1}:
 -611.0
 -622.0
 -629.0
 -666.0
 -569.0
 -613.0
 -640.0
 -669.0
 -550.0
 -690.0
 -665.0
 -595.0
 -621.0
    ⋮  
 -578.0
 -595.0
 -680.0
 -546.0
 -680.0
 -528.0
 -687.0
 -587.0
 -603.0
 -646.0
 -590.0
 -613.0

In [6]:
# Call the non-mutating method directly on X and H; it returns the index pair together
# with a score vector.
# NOTE(review): the pairs printed below match `happair` above, but these scores are
# positive while `hapscore` above is negative — presumably this method adds a per-row
# term that the M/N-based call omits; confirm against the MendelImpute source.
haplopair(X, H)

(([1, 17, 8, 37, 5, 10, 1, 14, 3, 3  …  16, 23, 7, 25, 3, 13, 5, 3, 3, 11], [16, 48, 12, 39, 21, 30, 42, 18, 18, 18  …  35, 26, 47, 50, 39, 35, 24, 17, 18, 43]), Float32[1043.0, 1056.0, 1025.0, 1020.0, 1055.0, 1059.0, 1070.0, 1043.0, 1039.0, 1029.0  …  1021.0, 1087.0, 1052.0, 1067.0, 1058.0, 1033.0, 1057.0, 1024.0, 1012.0, 1046.0])

In [7]:
# Benchmark the M/N-based kernel. The arguments are non-constant globals, so they are
# interpolated with `$` to keep global-variable lookup and dynamic dispatch out of the
# measurement, as the BenchmarkTools manual recommends.
@benchmark haplopair!($happair, $hapscore, $M, $N)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     9.256 ms (0.00% GC)
  median time:      9.784 ms (0.00% GC)
  mean time:        9.891 ms (0.00% GC)
  maximum time:     13.405 ms (0.00% GC)
  --------------
  samples:          506
  evals/sample:     1

In [8]:
# Discard samples left over from any earlier profiling run, profile a single call, and
# print the samples as a flat per-line table.
Profile.clear()
@profile haplopair!(happair, hapscore, M, N)
Profile.print(format=:flat)

 Count File                        Line Function                               
    10 ./<missing>                   -1 anonymous                              
    11 ./loading.jl                 515 include_string(::String, ::String)     
    10 ./profile.jl                  23 macro expansion                        
    11 ./task.jl                    335 (::IJulia.##11#14)()                   
    11 ...IJulia/src/eventloop.jl     8 eventloop(::ZMQ.Socket)                
    11 .../src/execute_request.jl   160 execute_request(::ZMQ.Socket, ::IJu... 
     1 ...pute/src/haplotyping.jl    30 haplopair!(::Tuple{Array{Int64,1},A... 
     1 ...pute/src/haplotyping.jl    32 haplopair!(::Tuple{Array{Int64,1},A... 
     2 ...pute/src/haplotyping.jl    33 haplopair!(::Tuple{Array{Int64,1},A... 
     4 ...pute/src/haplotyping.jl    34 haplopair!(::Tuple{Array{Int64,1},A... 
     2 ...pute/src/haplotyping.jl    37 haplopair!(::Tuple{Array{Int64,1},A... 


In [9]:
#@code_warntype haplopair!(X, H, M, N, happair, hapscore)

In [9]:
# Time the six-argument method, which takes X and H in addition to the M and N buffers.
# NOTE(review): the profile of this call further below shows A_mul_Bt!/gemm! frames, so
# it presumably recomputes M and N from X and H — confirm against the MendelImpute source.
@time haplopair!(X, H, M, N, happair, hapscore)

  0.032333 seconds (4 allocations: 160 bytes)


In [10]:
# Benchmark the six-argument method. The arguments are non-constant globals, so they are
# interpolated with `$` to keep global-variable lookup and dynamic dispatch out of the
# measurement, as the BenchmarkTools manual recommends.
@benchmark haplopair!($X, $H, $M, $N, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     20.488 ms (0.00% GC)
  median time:      20.981 ms (0.00% GC)
  mean time:        21.224 ms (0.00% GC)
  maximum time:     26.561 ms (0.00% GC)
  --------------
  samples:          236
  evals/sample:     1

In [11]:
# Discard samples left over from the previous profiling run, profile a single call of the
# six-argument method, and print the samples as a flat per-line table.
Profile.clear()
@profile haplopair!(X, H, M, N, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
    28 ./<missing>                   -1 anonymous                              
     4 ./linalg/blas.jl            1027 gemm!(::Char, ::Char, ::Float32, ::... 
     4 ./linalg/matmul.jl           191 A_mul_Bt!                              
     4 ./linalg/matmul.jl           367 gemm_wrapper!(::Array{Float32,2}, :... 
    28 ./loading.jl                 515 include_string(::String, ::String)     
    28 ./profile.jl                  23 macro expansion                        
     8 ./simdloop.jl                 73 macro expansion                        
    28 ./task.jl                    335 (::IJulia.##11#14)()                   
    28 ...IJulia/src/eventloop.jl     8 eventloop(::ZMQ.Socket)                
    28 .../src/execute_request.jl   160 execute_request(::ZMQ.Socket, ::IJu... 
     4 ...pute/src/haplotyping.jl    30 haplopair!(::Tuple{Array{Int64,1},A... 
     2 ...pute/src/haplotyping.jl    32 

## Haplotyping on incomplete genotype data



In [13]:
using MendelImpute, NullableArrays, BenchmarkTools

# Re-seed and redraw H and X with the same seed and draw order as the complete-data
# section, so the simulated matrices are the same.
srand(123)
n, p, d = 10000, 1000, 50
H = convert(Matrix{Float32}, rand(0:1, d, p))
X = convert(Matrix{Float32}, rand(0:2, n, p))
# Zero-initialised work buffers (d × d and n × d) and output buffers (one entry per row of X).
M = zeros(eltype(H), d, d)
N = zeros(eltype(X), n, d)
happair  = zeros(Int, n), zeros(Int, n)
hapscore = zeros(eltype(N), n)
# Wrap X together with a random Boolean mask drawn at density `missingprop`; the mask is
# later read back through `Xm.isnull`.
# NOTE(review): Xm.values may alias X rather than copy it — confirm; if so, haploimpute!
# overwrites X in place.
missingprop = 0.1
Xm = NullableArray(X, full(sprand(Bool, n, p, missingprop)))
@time haploimpute!(Xm, H, M, N, happair, hapscore)

  0.098535 seconds (4 allocations: 160 bytes)


In [14]:
# Benchmark imputation on the simulated data. The arguments are non-constant globals, so
# they are interpolated with `$` to keep global-variable lookup and dynamic dispatch out
# of the measurement, as the BenchmarkTools manual recommends.
# NOTE(review): haploimpute! mutates Xm, so every sample after the first presumably runs
# on already-imputed values — confirm this does not change the amount of work done.
@benchmark haploimpute!($Xm, $H, $M, $N, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     90.592 ms (0.00% GC)
  median time:      90.955 ms (0.00% GC)
  mean time:        91.509 ms (0.00% GC)
  maximum time:     95.049 ms (0.00% GC)
  --------------
  samples:          55
  evals/sample:     1

In [15]:
# Discard samples left over from the previous profiling run, profile a single imputation
# call on the simulated data, and print the samples as a flat per-line table.
Profile.clear()
@profile haploimpute!(Xm, H, M, N, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
    84 ./<missing>                   -1 anonymous                              
     4 ./linalg/blas.jl            1027 gemm!(::Char, ::Char, ::Float32, ::... 
     4 ./linalg/matmul.jl           191 A_mul_Bt!                              
     4 ./linalg/matmul.jl           367 gemm_wrapper!(::Array{Float32,2}, :... 
    84 ./loading.jl                 515 include_string(::String, ::String)     
    84 ./profile.jl                  23 macro expansion                        
     2 ./simdloop.jl                 73 macro expansion                        
    84 ./task.jl                    335 (::IJulia.##11#14)()                   
    84 ...IJulia/src/eventloop.jl     8 eventloop(::ZMQ.Socket)                
    84 .../src/execute_request.jl   160 execute_request(::ZMQ.Socket, ::IJu... 
     6 ...pute/src/haplotyping.jl   145 fillmissing!(::NullableArrays.Nulla... 
     5 ...pute/src/haplotyping.jl   146 

## `AFRped.txt` data



In [17]:
#;ls -al

In [18]:
## cut out the first 6 fields in the pedigree file
#;cut -d ',' -f7- AFRped.txt > AFRped_geno.txt

In [19]:
#;ls -al

In [16]:
# Load the comma-separated genotype file as a Float32 matrix. The commented-out shell cell
# above shows it being derived from AFRped.txt by cutting away the first six fields.
rawdata = readcsv("AFRped_geno.txt", Float32)

1325×36499 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0 

In [17]:
# Take the first `people` rows of rawdata as the matrix X, dropping the last column.
# NOTE(review): why the final column is excluded is not visible here — possibly a
# trailing field in the file; confirm.
people = 664
X = rawdata[1:people, 1:(end - 1)]

664×36498 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  

In [18]:
# Build H from the rows after the first `people`, dropping the last column, and halve
# every entry. Integer data are halved with a right shift; any other element type uses
# ordinary division. rawdata was read as Float32 above, so the division branch is the
# one that runs here.
H = let panel = rawdata[(people + 1):end, 1:(end - 1)]
    eltype(rawdata) <: Integer ? panel .>> 1 : panel / 2
end

661×36498 Array{Float32,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  1.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  

### Phase + impute a fixed window

In [19]:
using BenchmarkTools, MendelImpute, NullableArrays

# Restrict both matrices to the first 1200 columns. Range indexing copies, so X and H
# themselves are not modified through X1 / H1.
X1 = X[:, 1:1200]
H1 = H[:, 1:1200]
missingprop = 0.1
# Wrap X1 with a random Boolean mask drawn at density `missingprop`, then record the
# true values at the masked positions before any imputation runs.
# NOTE(review): unlike the other masking cells, this one does not call srand first, so
# the mask depends on the RNG state left by earlier cells — confirm whether intended.
Xm = NullableArray(X1, full(sprand(Bool, size(X1, 1), size(X1, 2), missingprop)))
missing_true = round.(Int, X1[Xm.isnull])

79900-element Array{Int64,1}:
 1
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [21]:
# Allocate work buffers sized for this window: M is d × d, N is n × d, and
# happair / hapscore hold one entry per row of X1.
n, d = size(X1, 1), size(H1, 1)
M = zeros(eltype(H), d, d)
N = zeros(eltype(X), n, d)
happair = (zeros(Int, n), zeros(Int, n))
hapscore = zeros(eltype(N), n)
@time haploimpute!(Xm, H1, M, N, happair, hapscore)
# Read the imputed values back at the masked positions and compare them with the
# held-out truth recorded in `missing_true`.
missing_impute = round.(Int, Xm.values[Xm.isnull])
# Bound to `error_rate` rather than `error`, which would shadow `Base.error` for the
# rest of the session.
error_rate = countnz(missing_true .≠ missing_impute) / length(missing_true)

  0.145084 seconds (4 allocations: 160 bytes)


0.0029662077596996246

In [22]:
# Benchmark imputation on the fixed 1200-column window. The arguments are non-constant
# globals, so they are interpolated with `$` to keep global-variable lookup and dynamic
# dispatch out of the measurement, as the BenchmarkTools manual recommends.
# NOTE(review): haploimpute! mutates Xm, so every sample after the first presumably runs
# on already-imputed values — confirm this does not change the amount of work done.
@benchmark haploimpute!($Xm, $H1, $M, $N, $happair, $hapscore)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     123.712 ms (0.00% GC)
  median time:      124.748 ms (0.00% GC)
  mean time:        127.212 ms (0.00% GC)
  maximum time:     152.262 ms (0.00% GC)
  --------------
  samples:          40
  evals/sample:     1

In [23]:
# Discard samples left over from the previous profiling run, profile a single imputation
# call on the fixed window, and print the samples as a flat per-line table.
Profile.clear()
@profile haploimpute!(Xm, H1, M, N, happair, hapscore)
Profile.print(format=:flat)

 Count File                        Line Function                               
   102 ./<missing>                   -1 anonymous                              
     1 ./linalg/blas.jl            1027 gemm!(::Char, ::Char, ::Float32, ::... 
     2 ./linalg/matmul.jl           191 A_mul_Bt!                              
     1 ./linalg/matmul.jl           240 copytri!(::Array{Float32,2}, ::Char... 
     1 ./linalg/matmul.jl           367 gemm_wrapper!(::Array{Float32,2}, :... 
     1 ./linalg/matmul.jl           293 syrk_wrapper!(::Array{Float32,2}, :... 
   102 ./loading.jl                 515 include_string(::String, ::String)     
   102 ./profile.jl                  23 macro expansion                        
   102 ./task.jl                    335 (::IJulia.##11#14)()                   
   102 ...IJulia/src/eventloop.jl     8 eventloop(::ZMQ.Socket)                
   102 .../src/execute_request.jl   160 execute_request(::ZMQ.Socket, ::IJu... 
     1 ...pute/src/haplotyping.jl   146 

## Phase + impute by moving window (using flanking windows)

In [24]:
using BenchmarkTools, MendelImpute, NullableArrays

# Seed the RNG, mask the full matrix X at random with density `missingprop`, and record
# the true values at the masked positions before imputing.
srand(123)
missingprop = 0.1
Xm = NullableArray(X, full(sprand(Bool, size(X, 1), size(X, 2), missingprop)))
missing_true = round.(Int, X[Xm.isnull])

# Three-argument method; with 800 as the last argument the log below reports consecutive
# 800-column windows.
@time haploimpute!(Xm, H, 800)

Imputing SNPs 1:800
Imputing SNPs 801:1600
Imputing SNPs 1601:2400
Imputing SNPs 2401:3200
Imputing SNPs 3201:4000
Imputing SNPs 4001:4800
Imputing SNPs 4801:5600
Imputing SNPs 5601:6400
Imputing SNPs 6401:7200
Imputing SNPs 7201:8000
Imputing SNPs 8001:8800
Imputing SNPs 8801:9600
Imputing SNPs 9601:10400
Imputing SNPs 10401:11200
Imputing SNPs 11201:12000
Imputing SNPs 12001:12800
Imputing SNPs 12801:13600
Imputing SNPs 13601:14400
Imputing SNPs 14401:15200
Imputing SNPs 15201:16000
Imputing SNPs 16001:16800
Imputing SNPs 16801:17600
Imputing SNPs 17601:18400
Imputing SNPs 18401:19200
Imputing SNPs 19201:20000
Imputing SNPs 20001:20800
Imputing SNPs 20801:21600
Imputing SNPs 21601:22400
Imputing SNPs 22401:23200
Imputing SNPs 23201:24000
Imputing SNPs 24001:24800
Imputing SNPs 24801:25600
Imputing SNPs 25601:26400
Imputing SNPs 26401:27200
Imputing SNPs 27201:28000
Imputing SNPs 28001:28800
Imputing SNPs 28801:29600
Imputing SNPs 29601:30400
Imputing SNPs 30401:31200
Imputing SNPs 31

In [26]:
# Fraction of masked entries whose imputed value differs from the held-out truth.
missing_impute = round.(Int, Xm.values[Xm.isnull])
# Bound to `error_rate` rather than `error`, which would shadow `Base.error` for the
# rest of the session.
error_rate = countnz(missing_true .≠ missing_impute) / length(missing_true)

0.0069078088435298385

In [27]:
# Discard samples left over from the previous profiling run, profile a moving-window
# imputation with 400 as the last argument, and print the samples as a flat per-line table.
# NOTE(review): Xm was already imputed by the timed run above and is not reset here, so
# this presumably profiles a rerun on filled-in values — confirm that is acceptable.
Profile.clear()
@profile haploimpute!(Xm, H, 400)
Profile.print(format=:flat)

Imputing SNPs 1:400
Imputing SNPs 401:800
Imputing SNPs 801:1200
Imputing SNPs 1201:1600
Imputing SNPs 1601:2000
Imputing SNPs 2001:2400
Imputing SNPs 2401:2800
Imputing SNPs 2801:3200
Imputing SNPs 3201:3600
Imputing SNPs 3601:4000
Imputing SNPs 4001:4400
Imputing SNPs 4401:4800
Imputing SNPs 4801:5200
Imputing SNPs 5201:5600
Imputing SNPs 5601:6000
Imputing SNPs 6001:6400
Imputing SNPs 6401:6800
Imputing SNPs 6801:7200
Imputing SNPs 7201:7600
Imputing SNPs 7601:8000
Imputing SNPs 8001:8400
Imputing SNPs 8401:8800
Imputing SNPs 8801:9200
Imputing SNPs 9201:9600
Imputing SNPs 9601:10000
Imputing SNPs 10001:10400
Imputing SNPs 10401:10800
Imputing SNPs 10801:11200
Imputing SNPs 11201:11600
Imputing SNPs 11601:12000
Imputing SNPs 12001:12400
Imputing SNPs 12401:12800
Imputing SNPs 12801:13200
Imputing SNPs 13201:13600
Imputing SNPs 13601:14000
Imputing SNPs 14001:14400
Imputing SNPs 14401:14800
Imputing SNPs 14801:15200
Imputing SNPs 15201:15600
Imputing SNPs 15601:16000
Imputing SNPs 16

### Effect of window size on accuracy and efficiency

In [28]:
# Sweep the window width and record the run time and imputation error rate for each one.
width_range = 100:100:1200
timing = zeros(length(width_range))
errate = zeros(length(width_range))
for i in eachindex(width_range)
    # Restore the original genotypes before each run, since the previous run left
    # imputed values in Xm.values. This is done outside the timed region so `timing`
    # measures the imputation only, not the slice-and-copy reset.
    copy!(Xm.values, rawdata[1:people, 1:(end - 1)])
    # NOTE(review): the meaning of the trailing positional arguments (1, 1e-3, false) is
    # not visible here; `false` presumably silences the per-window log — confirm.
    timing[i] = @elapsed haploimpute!(Xm, H, width_range[i], 1, 1e-3, false)
    # Error rate: fraction of masked entries imputed differently from the held-out truth.
    missing_impute = round.(Int, Xm.values[Xm.isnull])
    errate[i] = countnz(missing_true .≠ missing_impute) / length(missing_true)
    println("width = $(width_range[i]), time = $(timing[i]), error rate = $(errate[i])")
end

width = 100, time = 45.67825418, error rate = 0.008510803299875053
width = 200, time = 22.400324013, error rate = 0.004886121805303412
width = 300, time = 15.083785878, error rate = 0.004359765416652744
width = 400, time = 11.963894426, error rate = 0.004465366698388301
width = 500, time = 9.600222612, error rate = 0.004874571665113585
width = 600, time = 8.122951738, error rate = 0.00557541767163199
width = 700, time = 7.188097543, error rate = 0.0061009490502690975
width = 800, time = 6.453106615, error rate = 0.0069078088435298385
width = 900, time = 5.949815007, error rate = 0.007566991844363511
width = 1000, time = 5.397322931, error rate = 0.008076848032742997
width = 1100, time = 4.969931858, error rate = 0.00872860594345464
width = 1200, time = 4.633255464, error rate = 0.009398926579471358


In [29]:
using Plots
# Select the GR backend for Plots, then plot the recorded run time against window width.
gr()

plot(width_range, timing; xlabel = "Window width", ylabel = "Run time in seconds")

In [30]:
# Plot the recorded imputation error rate against window width.
plot(width_range, errate; xlabel = "Window width", ylabel = "Error rate")