# Test imputation on untyped SNPs (chromosome 20)

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra
# using Profile
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably so BLAS threads do not compete with Julia's own threads — confirm
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# MendelImpute error rate (window-window intersection)

In [2]:
# Show how many threads this Julia session has available.
Threads.nthreads()

8

In [3]:
# 1 thread
Random.seed!(2020)

# window width and the file names derived from it
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# timed call to `phase`; the result is passed `outfile` as its output file
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
)

# read the imputed genotypes back in, together with the full (true) genotypes
X_mendel   = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# error rate = fraction of entries where imputed and true genotypes differ
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:23:01[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:02:14[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:44[39m


Total windows = 2789, averaging ~ 874 unique haplotypes per window.

Timings: 
    Data import                     = 58.5177 seconds
    Computing haplotype pair        = 1383.12 seconds
        BLAS3 mul! to get M and N      = 27.2261 seconds per thread
        haplopair search               = 1345.49 seconds per thread
        initializing missing           = 5.69148 seconds per thread
        allocating and viewing         = 0.622084 seconds per thread
        index conversion               = 0.29668 seconds per thread
    Phasing by win-win intersection = 134.576 seconds
        Window-by-window intersection  = 131.214 seconds per thread
        Breakpoint search              = 1.1651 seconds per thread
        Recording result               = 1.51797 seconds per thread
    Imputation                      = 63.5032 seconds

1647.668093 seconds (456.23 M allocations: 40.962 GiB, 0.62% gc time)


0.0012403941355458334

In [3]:
# 8 threads (Threads.@threads)
Random.seed!(2020)

# window width and the file names derived from it
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# timed call to `phase`; the result is passed `outfile` as its output file
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
)

# read the imputed genotypes back in, together with the full (true) genotypes
X_mendel   = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# error rate = fraction of entries where imputed and true genotypes differ
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m
[32mComputing optimal haplotypes... 86%|█████████████████▏  |  ETA: 0:00:44[39m

LoadError: TaskFailedException:
InterruptException:
Stacktrace:
 [1] macro expansion at /Users/biona001/.julia/dev/MendelImpute/src/haplotype_pair.jl:129 [inlined]
 [2] (::MendelImpute.var"#61#threadsfor_fun#77"{Array{Array{Int32,1},1},Array{Array{Int32,1},1},MendelImpute.CompressedHaplotypes,Array{Union{Missing, UInt8},2},Array{Int64,1},Nothing,Nothing,Bool,Int64,Bool,Int64,Int64,Int64,Array{Array{Float64,1},1},Array{Array{Int32,1},1},Array{Array{Int32,1},1},Array{Array{Float32,1},1},Array{Array{Float32,2},1},Array{Array{Float32,2},1},Progress,UnitRange{Int64}})(::Bool) at ./threadingconstructs.jl:81
 [3] (::MendelImpute.var"#61#threadsfor_fun#77"{Array{Array{Int32,1},1},Array{Array{Int32,1},1},MendelImpute.CompressedHaplotypes,Array{Union{Missing, UInt8},2},Array{Int64,1},Nothing,Nothing,Bool,Int64,Bool,Int64,Int64,Int64,Array{Array{Float64,1},1},Array{Array{Int32,1},1},Array{Array{Int32,1},1},Array{Array{Float32,1},1},Array{Array{Float32,2},1},Array{Array{Float32,2},1},Progress,UnitRange{Int64}})() at ./threadingconstructs.jl:48

In [None]:
# 8 threads (ThreadPools.@qthreads)
Random.seed!(2020)

# window width and the file names derived from it
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# timed call to `phase`; the result is passed `outfile` as its output file
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
)

# read the imputed genotypes back in, together with the full (true) genotypes
X_mendel   = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# error rate = fraction of entries where imputed and true genotypes differ
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:25[39m
[32mComputing optimal haplotypes... 79%|███████████████▊    |  ETA: 0:01:23[39m

In [3]:
# 8 threads
Random.seed!(2020)

# window width and the file names derived from it
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# timed call to `phase`; the result is passed `outfile` as its output file
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
)

# read the imputed genotypes back in, together with the full (true) genotypes
X_mendel   = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# error rate = fraction of entries where imputed and true genotypes differ
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:28[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:06:49[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:27[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:47[39m


Total windows = 2789, averaging ~ 874 unique haplotypes per window.

Timings: 
    Data import                     = 60.0293 seconds
    Computing haplotype pair        = 415.187 seconds
        BLAS3 mul! to get M and N      = 5.38322 seconds per thread
        haplopair search               = 266.638 seconds per thread
        initializing missing           = 1.10494 seconds per thread
        allocating and viewing         = 0.610804 seconds per thread
        index conversion               = 0.0724439 seconds per thread
    Phasing by win-win intersection = 28.4098 seconds
        Window-by-window intersection  = 27.0122 seconds per thread
        Breakpoint search              = 0.334786 seconds per thread
        Recording result               = 0.267538 seconds per thread
    Imputation                      = 54.026 seconds

566.117875 seconds (456.20 M allocations: 44.394 GiB, 2.21% gc time)


0.0012403941355458334

# Try stepwise heuristic

In [None]:
# 8 threads
Random.seed!(2020)

# wider windows than the runs above; file names are derived from the width
width   = 512
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# timed call to `phase`, this time also passing `max_haplotypes` and `stepwise`
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    max_haplotypes = 800,
    stepwise = 100,
)

# read the imputed genotypes back in, together with the full (true) genotypes
X_mendel   = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# error rate = fraction of entries where imputed and true genotypes differ
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:26[39m
[32mComputing optimal haplotypes...  2%|▍                   |  ETA: 1:52:30[39m

## Profile `compute_optimal_haplotypes!`

In [2]:
# `using Profile` is commented out in the notebook's first cell, so load the
# stdlib profiler here; otherwise `Profile` is undefined when this cell runs.
using Profile

# Configure the profiler: room for 10^7 instruction pointers and
# 1 backtrace per 0.01 second.
Profile.init(n = 10^7, delay = 0.01)

In [3]:
# Profile `MendelImpute.compute_optimal_haplotypes!` by loading its inputs and
# allocating its output arrays by hand, then calling it directly.

# `using Profile` is commented out in the notebook's first cell; `Profile.clear`
# and `@profile` below need it, so load the stdlib profiler here.
using Profile

# first save intermediate results for quick loading later
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$width.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# load the reference file and pull out its `:compressed_Hunique` entry
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# import target genotypes as UInt8 together with the SNP information;
# `trans=true` is passed, and below dimension 1 of `X` is counted as SNPs and
# dimension 2 as people
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = 
    VCFTools.convert_gt(UInt8, tgtfile, trans=true, 
    save_snp_info=true, msg = "Importing genotype file...")

# problem dimensions
people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
windows = floor(Int, tgt_snps / width)  # number of complete windows of `width` SNPs
num_unique_haps = round(Int, avg_haplotypes_per_window(compressed_Hunique))

# working arrays
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people]
haplotype1 = [zeros(Int32, windows) for i in 1:people]
haplotype2 = [zeros(Int32, windows) for i in 1:people];

# discard any previously collected samples, then profile a single call
# NOTE(review): the meaning of the trailing positional arguments
# (nothing, nothing, false, 1000, false) is not visible here — confirm against
# the signature of `compute_optimal_haplotypes!`
Profile.clear()
@profile haptimers = MendelImpute.compute_optimal_haplotypes!(haplotype1, haplotype2, 
    compressed_Hunique, X, X_pos, nothing, nothing,
    false, 1000, false)

# save result
# JLSO.save("haplotypes.chr20.w64", :haplotype1 => haplotype1, :haplotype2 => haplotype2)

InterruptException: InterruptException: