# Test imputation of untyped SNPs on chromosome 20

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra
# using Profile
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably done so BLAS threads do not compete with the Julia-level
# threads used by the cells below — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


# MendelImpute error rate (window-window intersection)

In [2]:
# Report how many Julia threads this session was started with.
Threads.nthreads()

8

In [None]:
# Phase/impute the masked chr20 target with window width 64 (session run with 8 threads).
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false
);

# Read back the imputed genotypes and the complete (unmasked) genotypes, then
# report the fraction of entries on which the two matrices disagree.
X_mendel   = convert_gt(Float32, outfile);
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz");
n, p = size(X_mendel);
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:14:42[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:24[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:47[39m


Total windows = 2789, averaging ~ 874 unique haplotypes per window.

Timings: 
    Data import                     = 61.7013 seconds
    Computing haplotype pair        = 884.229 seconds
        BLAS3 mul! to get M and N      = 4.87337 seconds per thread
        haplopair search               = 187.35 seconds per thread
        initializing missing           = 0.877334 seconds per thread
        allocating internal matrices   = 115.777 seconds per thread
        index conversion               = 0.0592008 seconds per thread
        creating views                 = 0.000147259 seconds per thread
    Phasing by win-win intersection = 25.1239 seconds
        Window-by-window intersection  = 23.3743 seconds per thread
        Breakpoint search              = 0.215903 seconds per thread
        Recording result               = 0.82844 seconds per thread
    Imputation                      = 51.4231 seconds

1038

In [3]:
# Phase/impute the masked chr20 target with window width 64 (session run with 8 threads).
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false
);

# Read back the imputed genotypes and the complete (unmasked) genotypes, then
# report the fraction of entries on which the two matrices disagree.
X_mendel   = convert_gt(Float32, outfile);
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz");
n, p = size(X_mendel);
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:14:40[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:47[39m


Total windows = 2789, averaging ~ 874 unique haplotypes per window.

Timings: 
    Data import                     = 62.3254 seconds
    Computing haplotype pair        = 881.311 seconds
        BLAS3 mul! to get M and N      = 4.96215 seconds per thread
        haplopair search               = 187.709 seconds per thread
        initializing missing           = 0.87772 seconds per thread
        allocating internal matrices   = 124.385 seconds per thread
        index conversion               = 0.0610335 seconds per thread
    Phasing by win-win intersection = 25.9149 seconds
        Window-by-window intersection  = 23.9672 seconds per thread
        Breakpoint search              = 0.293426 seconds per thread
        Recording result               = 0.820013 seconds per thread
    Imputation                      = 51.3748 seconds

1037.350792 seconds (471.25 M allocations: 69.230 GiB, 1.40% gc time)


0.0012398718991506012

# Try stepwise heuristic

In [None]:
# Phase/impute the masked chr20 target with window width 512, passing
# `max_haplotypes = 800` and `stepwise = 100` (session run with 8 threads).
Random.seed!(2020)
width   = 512
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    max_haplotypes = 800,
    stepwise = 100
);

# Read back the imputed genotypes and the complete (unmasked) genotypes, then
# report the fraction of entries on which the two matrices disagree.
X_mendel   = convert_gt(Float32, outfile);
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz");
n, p = size(X_mendel);
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:26[39m
[32mComputing optimal haplotypes...  2%|▍                   |  ETA: 1:52:30[39m

## Profile `compute_optimal_haplotypes!`

In [2]:
# The import cell leaves `using Profile` commented out, so load the stdlib here;
# otherwise `Profile` is undefined in a fresh session.
using Profile

# Configure the sampling profiler: buffer size n = 10^7, one backtrace per 0.01 second.
Profile.init(n = 10^7, delay = 0.01)

In [3]:
# Build the inputs of `MendelImpute.compute_optimal_haplotypes!` by hand and profile
# that call in isolation.
# `Profile.clear` and `@profile` need the Profile stdlib, which the import cell
# leaves commented out, so load it here.
using Profile

# first save intermediate results for quick loading later
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$width.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# compressed reference haplotype panel stored in the JLSO file
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# target genotypes, read transposed (`trans=true`) together with the SNP information
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = 
    VCFTools.convert_gt(UInt8, tgtfile, trans=true, 
    save_snp_info=true, msg = "Importing genotype file...")

# problem dimensions; the sizes are read off `X` as imported above
people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
windows = floor(Int, tgt_snps / width)
num_unique_haps = round(Int, avg_haplotypes_per_window(compressed_Hunique))

# working arrays
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people]
haplotype1 = [zeros(Int32, windows) for i in 1:people]
haplotype2 = [zeros(Int32, windows) for i in 1:people];

# Drop any previously collected samples, then profile the haplotype search.
# NOTE(review): the meaning of the trailing positional arguments
# (nothing, nothing, false, 1000, false) is not visible in this notebook —
# check them against the method signature in MendelImpute.
Profile.clear()
@profile haptimers = MendelImpute.compute_optimal_haplotypes!(haplotype1, haplotype2, 
    compressed_Hunique, X, X_pos, nothing, nothing,
    false, 1000, false)

# save result
# JLSO.save("haplotypes.chr20.w64", :haplotype1 => haplotype1, :haplotype2 => haplotype2)

InterruptException: InterruptException: