# Test imputation of untyped SNPs on chromosome 20

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra
# using Profile
# using ProfileView

# Pin BLAS to a single thread.
# NOTE(review): presumably to avoid oversubscribing cores when Julia itself runs
# multi-threaded (see the `Threads.nthreads()` cell below) — confirm.
LinearAlgebra.BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# MendelImpute error rate (window-window intersection)

In [2]:
# Show how many Julia threads this session was started with.
Base.Threads.nthreads()

8

In [2]:
# Timed end-to-end run of `phase` with 8 threads (Threads.@threads).
# The RNG is seeded so repeated runs are comparable.
Random.seed!(2020)

d = 1000  # forwarded to `phase` as `max_d` and embedded in the reference file name
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.maxd$(d).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# Trailing `;` suppresses the notebook's display of the returned value.
@time ph = phase(
    tgtfile,
    reffile;
    outfile = outfile,
    max_d = d,
    dynamic_programming = false,
);

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:28[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:00:59[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:38[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:46[39m


Total windows = 3252, averaging ~ 510 unique haplotypes per window.

Timings: 
    Data import                     = 61.639 seconds
    Computing haplotype pair        = 60.1232 seconds
        BLAS3 mul! to get M and N      = 0.947839 seconds per thread
        haplopair search               = 50.9561 seconds per thread
        initializing missing           = 1.27711 seconds per thread
        allocating and viewing         = 0.0885657 seconds per thread
        index conversion               = 0.07459 seconds per thread
    Phasing by win-win intersection = 38.349 seconds
        Window-by-window intersection  = 32.5618 seconds per thread
        Breakpoint search              = 0.741265 seconds per thread
        Recording result               = 2.28144 seconds per thread
    Imputation                      = 51.6932 seconds

219.933616 seconds (496.84 M allocations: 42.250 GiB, 5.70% gc time)


## Profile

In [2]:
# `Profile` is only present as a commented-out `using` in the first cell, so it must be
# loaded here; otherwise `Profile.init` raises `UndefVarError` in a fresh session.
using Profile

# Configure the sampling profiler: a buffer of 10^7 instruction pointers and
# one backtrace every 0.01 seconds.
Profile.init(n = 10^7, delay = 0.01)

In [3]:
# Profile `MendelImpute.compute_optimal_haplotypes!` in isolation: load the compressed
# reference panel and the target genotypes, allocate the per-sample working arrays,
# then run the haplotype search under `@profile`.
# (Saving the intermediate result for quick loading later is currently disabled; see
# the commented-out `JLSO.save` at the bottom.)

# `Profile.clear()` and `@profile` need the stdlib `Profile` module, which is only a
# commented-out `using` in the first cell. Loading it here keeps this cell runnable
# on its own; a repeated `using` is harmless.
using Profile

Random.seed!(2020)
width   = 64
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.w$(width).maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"

# compressed reference haplotype panel stored in the JLSO file
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# target genotypes; with `trans=true` the matrix is indexed below as SNPs x samples
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt =
    VCFTools.convert_gt(UInt8, tgtfile; trans = true,
    save_snp_info = true, msg = "Importing genotype file...")

# problem dimensions
people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
windows = fld(tgt_snps, width)  # number of complete windows of `width` typed SNPs
num_unique_haps = round(Int, avg_haplotypes_per_window(compressed_Hunique))

# working arrays: one entry per sample
ph = [HaplotypeMosaicPair(ref_snps) for _ in 1:people]
haplotype1 = [zeros(Int32, windows) for _ in 1:people]
haplotype2 = [zeros(Int32, windows) for _ in 1:people];

# drop samples from any earlier profiling run before collecting new ones
Profile.clear()
# NOTE(review): the trailing positional arguments (nothing, nothing, false, 1000, false)
# are passed exactly as before; their meaning is fixed by the signature of
# `MendelImpute.compute_optimal_haplotypes!`. The 1000 presumably matches the
# `max_d = 1000` used in the timing cell above — TODO confirm.
@profile haptimers = MendelImpute.compute_optimal_haplotypes!(haplotype1, haplotype2,
    compressed_Hunique, X, X_pos, nothing, nothing,
    false, 1000, false)

# save result
# JLSO.save("haplotypes.chr20.w64", :haplotype1 => haplotype1, :haplotype2 => haplotype2)

InterruptException: InterruptException: