# Test imputation of untyped SNPs on chromosome 20

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

# Restrict BLAS to a single thread.
# NOTE(review): presumably so BLAS does not compete with the Julia threads used below — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


# MendelImpute error rate

In [2]:
# Report how many Julia threads this session was started with
Threads.nthreads()

8

In [5]:
# 8 threads
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$width.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# run `phase` on the target file against the reference file, writing the
# result to `outfile`; the whole call is timed
@time ph = phase(
    tgtfile,
    reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
);

# read the imputed genotypes back in, together with the complete (true) genotypes
X_mendel   = convert_gt(Float32, outfile);
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz");

# error rate = fraction of entries where the imputed genotype differs from the truth
n, p = size(X_mendel);
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:28[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:14:58[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:01:34[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:48[39m


Total windows = 2789, averaging ~ 874 unique haplotypes per window.

Timings: 
    Data import                     = 63.8725 seconds
    Computing haplotype pair        = 900.19 seconds
        BLAS3 mul! to get M and N      = 8.82825 seconds per thread
        haplopair search               = 252.739 seconds per thread
        index conversion               = 0.0596418 seconds per thread
    Phasing by win-win intersection = 94.5099 seconds
        Window-by-window intersection  = 90.9773 seconds per thread
        Breakpoint search              = 0.27636 seconds per thread
        Recording result               = 1.26076 seconds per thread
    Imputation                      = 52.5559 seconds

1111.278428 seconds (415.80 M allocations: 68.620 GiB, 1.32% gc time)


0.0012398718991506012

# Try to optimize phasing

In [None]:
# First save intermediate results for quick loading later: compute the
# per-window haplotype pairs once and write them to disk so that the phasing
# step can be re-run without repeating this search.
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$width.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# reference panel, stored under the :compressed_Hunique key of the JLSO file
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# target genotypes plus per-SNP metadata; with trans=true the sizes below are
# read as (SNPs, samples)
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = 
    VCFTools.convert_gt(UInt8, tgtfile, trans=true, 
    save_snp_info=true, msg = "Importing genotype file...")

people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
# number of complete windows of `width` typed SNPs (integer division, no
# round trip through floating point)
windows = tgt_snps ÷ width
num_unique_haps = round(Int, avg_haplotypes_per_window(compressed_Hunique))

# working arrays: one mosaic pair per sample, and one haplotype index per
# window for each of a sample's two haplotypes
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people]
haplotype1 = [zeros(Int32, windows) for i in 1:people]
haplotype2 = [zeros(Int32, windows) for i in 1:people];

# fills haplotype1 / haplotype2 in place.
# NOTE(review): the trailing positional arguments (nothing, nothing, false,
# 1000, false) are passed through unchanged; their meaning is defined by
# MendelImpute.compute_optimal_haplotypes! — confirm against its signature.
haptimers = MendelImpute.compute_optimal_haplotypes!(haplotype1, haplotype2, 
    compressed_Hunique, X, X_pos, nothing, nothing,
    false, 1000, false)

# derive the file name from `width` (as `reffile` does) so the saved results
# are always labelled with the window width that produced them
JLSO.save("haplotypes.chr20.w$width", :haplotype1 => haplotype1, :haplotype2 => haplotype2)

In [3]:
# load intermediate results
Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$width.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# reference panel, stored under the :compressed_Hunique key of the JLSO file
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# target genotypes plus per-SNP metadata; with trans=true, size(X, 2) is the
# number of samples (see `people` below)
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = 
    VCFTools.convert_gt(UInt8, tgtfile, trans=true, 
    save_snp_info=true, msg = "Importing genotype file...")

# `loaded` is rebound to the saved haplotype pairs from here on; the file name
# is derived from `width`, as `reffile` is, so the two cannot drift apart
loaded = JLSO.load("haplotypes.chr20.w$width")
# Work on per-sample copies so that `loaded` keeps the pristine saved values.
# NOTE(review): this matters only if the phasing routines update these vectors
# in place, which their `!` names suggest — confirm.
haplotype1 = copy.(loaded[:haplotype1])
haplotype2 = copy.(loaded[:haplotype2])
ref_snps = length(compressed_Hunique.pos)
people = size(X, 2)
# one empty mosaic pair per sample, to be filled by the phasing step
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people];

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:29[39m


In [5]:
# Time the phasing step on its own, using the haplotype pairs prepared in an earlier cell.
# NOTE(review): by the `!` convention phase_fast! writes into `ph`; whether it also
# modifies haplotype1 / haplotype2 in place is not visible here — confirm.
@time phasetimer = phase_fast!(ph, X, compressed_Hunique, haplotype1, haplotype2);

[32mIntersecting win-by-win...100%|█████████████████████████| Time: 0:07:21[39m


443.843770 seconds (15.53 M allocations: 1.219 GiB, 0.17% gc time)


In [4]:
# Reset storage. Take fresh per-sample copies instead of rebinding the arrays
# held in `loaded`: a plain assignment aliases them, so any in-place update
# made by phase_sample! would also change `loaded`, and the next "reset" would
# start from already-processed data.
# NOTE(review): assumes phase_sample! mutates its first two arguments, as its
# `!` name suggests — confirm.
people = size(X, 2)   # defined before first use so the cell does not rely on a stale global
haplotype1 = copy.(loaded[:haplotype1])
haplotype2 = copy.(loaded[:haplotype2])
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people];

# preallocate scratch buffers that are shared across all samples
haplotypes = nhaplotypes(compressed_Hunique)
seen = BitSet()
survivors1 = Int32[]
survivors2 = Int32[]
sizehint!(seen, haplotypes)
sizehint!(survivors1, haplotypes)
sizehint!(survivors2, haplotypes)

# computation step: process one sample at a time and time the whole loop
pmeter = Progress(people, 5, "Intersecting win-by-win...")
t = @timed for i in 1:people
    @inbounds phase_sample!(haplotype1[i], haplotype2[i],
        compressed_Hunique, seen, survivors1, survivors2)
    next!(pmeter)
end
# display the @timed result (value, elapsed seconds, bytes allocated, GC time, GC counters)
t

[32mIntersecting win-by-win...100%|█████████████████████████| Time: 0:07:17[39m


(nothing, 437.306330106, 23429145, 0.011047775, Base.GC_Diff(23429145, 0, 0, 449990, 139, 0, 11047775, 1, 0))