# Test imputation of untyped SNPs on chromosome 20

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

# Restrict BLAS to a single thread.
# NOTE(review): presumably to avoid oversubscribing cores when Julia itself
# runs multi-threaded — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# MendelImpute error rate (window-window intersection)

In [2]:
# Report how many Julia threads this session was started with.
Threads.nthreads()

8

In [3]:
# Impute chromosome 20 with 8 threads, then measure the error rate against
# the complete genotype file.
# Seed the global RNG so the run is repeatable.
Random.seed!(2020)
# `d` is passed to `phase` as `max_d` and is also interpolated into the
# reference file name below.
d       = 1000
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.maxd$d.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
# `@time` prints the elapsed time and allocations of the whole `phase` call;
# the imputed genotypes are written to `outfile`.
@time ph = phase(tgtfile, reffile, outfile = outfile, max_d = d,
    dynamic_programming = false);

# Import the imputed result and compare it with the true (complete) genotypes.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
n, p = size(X_mendel)
# Error rate = number of differing entries divided by the n * p total entries.
# NOTE(review): assumes both matrices have the same dimensions — confirm.
println("error_rate = ", sum(X_mendel .!= X_complete) / n / p)
# Remove the imputed output file; `force=true` makes a missing file a no-op.
rm(outfile, force=true)

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:01:00[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:36[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:46[39m


Total windows = 3252, averaging ~ 510 unique haplotypes per window.

Timings: 
    Data import                     = 56.8629 seconds
    Computing haplotype pair        = 60.3501 seconds
        BLAS3 mul! to get M and N      = 0.93269 seconds per thread
        haplopair search               = 51.3704 seconds per thread
        initializing missing           = 1.26004 seconds per thread
        allocating and viewing         = 0.0884786 seconds per thread
        index conversion               = 0.0752258 seconds per thread
    Phasing by win-win intersection = 36.1751 seconds
        Window-by-window intersection  = 30.4965 seconds per thread
        Breakpoint search              = 0.7818 seconds per thread
        Recording result               = 2.1571 seconds per thread
    Imputation                      = 51.0597 seconds

212.560535 seconds (496.98 M allocations: 42.256 GiB, 5.13% gc time)
error_rate = 0.0


# Profile code

In [3]:
# Load the sampling profiler and the ProfileView viewer used by `@profview`.
using Profile
using ProfileView
# Configure the profiler with a buffer of n = 10^7 entries and a sampling
# delay of 0.01 seconds.
Profile.init(n = 10^7, delay = 0.01) # 1 backtrace per 0.01 second

In [4]:
# Warm up on a small problem before profiling the real data set.
# NOTE(review): presumably so that compilation time does not pollute the
# profile of the real problem — confirm.
# Switch the working directory; the relative paths below are resolved from it.
cd("/Users/biona001/.julia/dev/MendelImpute/simulation")
Random.seed!(2020)
# `d` is passed to `phase` as `max_d` and is also interpolated into the
# reference file name.
d       = 1000
tgtfile = "./compare2/target.typedOnly.maf0.01.masked.vcf.gz"
reffile = "./compare2/ref.excludeTarget.maxd$d.jlso"
outfile = "./compare2/mendel.imputed.vcf.gz"
# Profile the `phase` call with ProfileView's `@profview`. Unlike the timing
# cell, this cell does not delete `outfile` afterwards.
@profview phase(tgtfile, reffile, outfile = outfile, max_d = d,
    dynamic_programming = false);

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:18[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:00:09[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:17[39m


Total windows = 64, averaging ~ 692 unique haplotypes per window.

Timings: 
    Data import                     = 40.6077 seconds
    Computing haplotype pair        = 20.0551 seconds
        BLAS3 mul! to get M and N      = 0.215178 seconds per thread
        haplopair search               = 2.63116 seconds per thread
        initializing missing           = 0.324441 seconds per thread
        allocating and viewing         = 0.0415098 seconds per thread
        index conversion               = 0.000730224 seconds per thread
    Phasing by win-win intersection = 11.1837 seconds
        Window-by-window intersection  = 0.0532841 seconds per thread
        Breakpoint search              = 0.102382 seconds per thread
        Recording result               = 0.00478878 seconds per thread
    Imputation                      = 18.7042 seconds



In [5]:
# Profile the real problem: the chromosome 20 data in the HRC directory.
# Switch the working directory; the bare file names below are resolved from it.
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")
Random.seed!(2020)
# `d` is passed to `phase` as `max_d` and is also interpolated into the
# reference file name.
d       = 1000
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.maxd$d.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
# Profile the `phase` call with ProfileView's `@profview`; the imputed file
# at `outfile` is left on disk by this cell.
@profview phase(tgtfile, reffile, outfile = outfile, max_d = d,
    dynamic_programming = false);

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:42[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:02:22[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:01:32[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:02:18[39m


Total windows = 3252, averaging ~ 510 unique haplotypes per window.

Timings: 
    Data import                     = 204.957 seconds
    Computing haplotype pair        = 142.399 seconds
        BLAS3 mul! to get M and N      = 2.35351 seconds per thread
        haplopair search               = 126.861 seconds per thread
        initializing missing           = 3.4539 seconds per thread
        allocating and viewing         = 0.17913 seconds per thread
        index conversion               = 0.133923 seconds per thread
    Phasing by win-win intersection = 92.7697 seconds
        Window-by-window intersection  = 74.2154 seconds per thread
        Breakpoint search              = 2.20214 seconds per thread
        Recording result               = 7.43114 seconds per thread
    Imputation                      = 172.38 seconds

/Users/julia/buildbot/worker/package_macos64/build/usr/share/julia/stdlib/v1.5/Serialization/src/Serialization.jl, handle_deserialize: line 824
/Users/julia/buil