# Test imputation on untyped SNPs chrom 22

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

In [4]:
# Report the number of threads available to this Julia session.
Threads.nthreads()

1

In [6]:
# Experiment settings; `chr` and `maf` are interpolated into the file names below,
# `width` is the window width used to split the reference panel.
chr = 22
maf = 0.1
width = 2048

# Input files: target genotypes and the JLSO-serialised reference panel.
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"

# Pull the compressed reference panel out of the JLSO archive.
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# Read the target genotypes together with their per-SNP annotations.
# NOTE(review): with `trans=true` X is presumably SNPs × samples (samples are
# counted along dimension 2 below) — confirm against VCFTools.
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(
    UInt8, tgtfile;
    trans = true, save_snp_info = true, msg = "Importing genotype file..."
)

# Problem dimensions and per-window / per-sample containers.
people = size(X, 2)
ref_snps = compressed_Hunique.snps
windows = fld(ref_snps, width) # whole windows only; any remainder is dropped here
redundant_haplotypes = [[Tuple{Int, Int}[] for _ in 1:windows] for _ in 1:people]
typed_snps = Vector{Vector{Int}}(undef, windows);

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m


In [8]:
# Show the `snps` field of the reference panel (the value used as `ref_snps` when computing `windows`).
compressed_Hunique.snps

523162

In [9]:
# Show the number of windows obtained from `ref_snps` and `width`.
windows

255

In [3]:
# Print the number of typed SNPs in each window, then the average over all windows.
# Side effect: `typed_snps[w]` is filled with the indices (into the window's reference
# positions) of the target SNPs that are present in the reference panel.
# The aligned genotype/haplotype sub-matrices are deliberately NOT built here: they
# are not needed for counting, and copying them for every window is wasted work.
avg_typedsnps = 0
for w in 1:windows
    cur_range = compressed_Hunique.CWrange[w]
    Hw_pos = compressed_Hunique.pos[cur_range]
    XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
    XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
    typed_snps[w] = XtoH_rm_nothing # save typed snps index for current window
    print(length(typed_snps[w]), ", ")
    avg_typedsnps += length(typed_snps[w])
end
println(avg_typedsnps / windows)

40, 116, 189, 297, 236, 347, 265, 240, 307, 269, 285, 200, 303, 219, 182, 255, 280, 98, 140, 227, 200, 212, 284, 225, 320, 182, 215, 228, 188, 206, 268, 176, 226, 140, 173, 282, 513, 377, 212, 284, 410, 268, 266, 231, 164, 222, 296, 262, 268, 324, 289, 74, 123, 70, 130, 207, 335, 191, 288, 326, 206, 148, 139, 91, 262, 233, 152, 244, 339, 205, 344, 305, 322, 356, 253, 230, 311, 301, 303, 250, 120, 67, 70, 84, 125, 187, 115, 248, 157, 201, 264, 223, 161, 144, 137, 220, 278, 192, 248, 201, 223, 161, 124, 126, 54, 104, 104, 209, 343, 249, 292, 254, 317, 220, 285, 202, 151, 186, 183, 212, 309, 168, 248, 289, 422, 196, 155, 179, 260, 176, 343, 216, 189, 279, 190, 160, 68, 238, 356, 164, 259, 196, 193, 244, 281, 277, 354, 282, 160, 200, 170, 94, 155, 256, 169, 109, 222, 178, 232, 215, 322, 261, 152, 196, 112, 136, 167, 223, 118, 207, 241, 122, 185, 202, 185, 144, 235, 216, 222, 180, 223, 219, 290, 361, 232, 245, 256, 241, 245, 185, 234, 301, 389, 287, 320, 270, 367, 316, 226, 271, 315, 227, 2

231.88235294117646

In [6]:
# Print the number of unique haplotypes in each window; the cell's value is the
# average over all windows.
# The haplotype count is the column count of the window's `uniqueH` matrix. Selecting
# rows (typed SNPs) cannot change it, so it is read directly instead of first copying
# the row-subsetted matrix. The unused aligned genotype matrix is not built either.
avg_haps = 0
for w in 1:windows
    cur_range = compressed_Hunique.CWrange[w]
    Hw_pos = compressed_Hunique.pos[cur_range]
    XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
    XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
    typed_snps[w] = XtoH_rm_nothing # save typed snps index for current window
    nhaps = size(compressed_Hunique[w].uniqueH, 2)
    print(nhaps, ", ")
    avg_haps += nhaps
end
avg_haps / windows

43153, 23073, 22231, 41076, 29386, 27008, 46315, 37642, 25765, 21839, 20595, 21527, 16158, 32760, 47892, 18580, 28264, 8729, 9500, 18800, 37802, 17573, 20875, 9675, 26973, 49406, 12181, 37537, 9241, 7059, 35183, 51484, 15874, 7438, 16325, 23100, 25285, 18477, 19886, 26153, 22792, 37703, 26449, 11748, 9639, 27353, 26110, 36999, 13381, 15974, 39117, 20871, 16207, 6714, 11477, 21526, 17455, 21065, 37932, 31302, 18634, 15999, 19634, 17454, 19643, 37004, 31287, 12731, 31721, 13043, 13420, 39254, 32144, 33890, 28011, 35041, 25275, 38534, 28050, 23027, 35030, 6441, 6646, 6048, 6606, 7603, 7407, 24258, 21789, 12665, 9804, 10714, 6218, 5743, 9427, 12611, 11579, 14809, 10717, 13624, 17989, 5594, 8992, 8530, 6352, 7827, 8181, 10707, 20881, 18757, 35873, 31670, 11913, 8145, 29269, 14086, 18221, 17486, 14247, 18401, 12909, 22749, 16742, 10408, 11499, 10458, 9562, 7940, 7660, 20783, 17763, 10630, 18572, 30646, 19162, 17001, 8480, 15818, 34804, 17233, 16029, 27683, 25063, 33654, 24763, 37337, 23868, 

20747.133333333335

In [6]:
# keep all happairs that are within some range of the best pair
# NOTE(review): nothing in this cell selects that strategy; presumably it was set by the
# MendelImpute revision loaded (via Revise) when the cell was run — confirm.
w = 2
# Align target and reference on the SNP positions they share within window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# Time the haplotype-pair search, then the redundant-haplotype bookkeeping for window w.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

multiplication took 5.10690735 seconds
haplopair      took 317.242422826 seconds
last step      took 9.4127e-5 seconds
initXfloat! time = 0.000624971
haplopair!  time = 322.351309052
323.519544 seconds (347.26 k allocations: 2.309 GiB, 0.30% gc time)
228.806131 seconds (44 allocations: 255.656 KiB)


In [7]:
# keep at most 100 happairs that are within some range of the best pair
# NOTE(review): nothing in this cell enforces the 100-pair cap; presumably it was set by the
# MendelImpute revision loaded (via Revise) when the cell was run — confirm.
w = 2
# Align target and reference on the SNP positions they share within window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# Time the haplotype-pair search, then the redundant-haplotype bookkeeping for window w.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

multiplication took 4.806436387 seconds
haplopair      took 335.317340209 seconds
last step      took 9.2642e-5 seconds
initXfloat! time = 0.000614286
haplopair!  time = 340.125338714
341.316505 seconds (313.94 k allocations: 2.096 GiB, 0.09% gc time)
  5.104356 seconds (176 allocations: 1.820 MiB)


In [10]:
# save only the best happair
# NOTE(review): nothing in this cell selects that strategy; presumably it was set by the
# MendelImpute revision loaded (via Revise) when the cell was run — confirm.
w = 2
# Align target and reference on the SNP positions they share within window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# Time the haplotype-pair search, then the redundant-haplotype bookkeeping for window w.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

multiplication took 4.792340524 seconds
haplopair      took 192.100335024 seconds
last step      took 9.2731e-5 seconds
initXfloat! time = 0.000865971
haplopair!  time = 196.892854113
197.797263 seconds (204 allocations: 2.080 GiB, 0.06% gc time)
  0.044044 seconds (26 allocations: 65.063 KiB)


In [2]:
# Time the allocation of zero-filled Float32 matrices of size 20000×20000 and 1000×20000
# (presumably stand-ins for the d×d and n×d work arrays of the haplotype-pair search — confirm).
@time M = zeros(Float32, 20000, 20000);
@time N = zeros(Float32, 1000, 20000);

  0.805913 seconds (6 allocations: 1.490 GiB, 0.84% gc time)
  0.035577 seconds (7 allocations: 76.294 MiB)


In [3]:
# Profile the haplotype-pair search on window 2.
w = 2
# Align target and reference on the SNP positions they share within window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

using ProfileView
using Profile
# Configure the profiler: `n` is the sample-buffer size, `delay` the sampling interval in seconds.
Profile.init(n = 10^7, delay = 0.1)

# Profile a small random 100×100 problem first
# (presumably a warm-up so compilation does not dominate the real profile — confirm).
t = Matrix{Union{UInt8, Missing}}(undef, 100, 100)
t .= rand(UInt8, 100, 100);
@profview happairs, hapscore = haplopair(t, falses(100, 100));
# @profview happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)


# Build the Float32 working copies by hand so that only the in-place kernel is profiled below.
Xwork = zeros(Float32, size(Xw_aligned, 1), size(Xw_aligned, 2))
Hwork = convert(Matrix{Float32}, Hw_aligned)
MendelImpute.initXfloat!(Xw_aligned, Xwork)

# Work arrays: M is d×d, N is n×d, with one (initially empty) list of pairs and one score per column of Xw_aligned.
p, n     = size(Xw_aligned)
d        = size(Hw_aligned, 2)
M        = zeros(Float32, d, d)
N        = zeros(Float32, n, d)
happairs = [Tuple{Int, Int}[] for i in 1:n]
hapscore = zeros(Float32, n)
sizehint!.(happairs, 100) # will not save > 100 unique haplotype pairs to conserve memory
# Profile the in-place search on the real window data.
@profview haplopair!(Xwork, Hwork, M, N, happairs, hapscore)

Gtk.GtkWindowLeaf(name="", parent, width-request=-1, height-request=-1, visible=TRUE, sensitive=TRUE, app-paintable=FALSE, can-focus=FALSE, has-focus=FALSE, is-focus=FALSE, focus-on-click=TRUE, can-default=FALSE, has-default=FALSE, receives-default=FALSE, composite-child=FALSE, style, events=0, no-show-all=FALSE, has-tooltip=FALSE, tooltip-markup=NULL, tooltip-text=NULL, window, opacity=1.000000, double-buffered, halign=GTK_ALIGN_FILL, valign=GTK_ALIGN_FILL, margin-left, margin-right, margin-start=0, margin-end=0, margin-top=0, margin-bottom=0, margin=0, hexpand=FALSE, vexpand=FALSE, hexpand-set=FALSE, vexpand-set=FALSE, expand=FALSE, scale-factor=2, border-width=0, resize-mode, child, type=GTK_WINDOW_TOPLEVEL, title="Profile", role=NULL, resizable=TRUE, modal=FALSE, window-position=GTK_WIN_POS_NONE, default-width=800, default-height=600, destroy-with-parent=FALSE, hide-titlebar-when-maximized=FALSE, icon, icon-name=NULL, screen, type-hint=GDK_WINDOW_TYPE_HINT_NORMAL, skip-taskbar-hint

# Time haplopair

In [2]:
# Settings for the timing experiment; `chr` and `maf` are interpolated into the file names below.
chr = 22
maf = 0.1
width = 2048
Random.seed!(2020) # fixed seed for reproducibility
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"

# Load the target genotypes (with per-SNP annotations) and the compressed reference panel.
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# Align target and reference on the SNP positions they share within window 10.
w = 10
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :];

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m


In [3]:
# Show the dimensions of the aligned genotype and haplotype matrices for this window.
size(Xw_aligned), size(Hw_aligned)

((269, 1000), (269, 21839))

In [4]:
# Global timer that the `@timeit to "..."` sections below record into; reported with `print_timer(to)`.
const to = TimerOutput()

"""
    haplopair_time(X, H)

Instrumented driver for the haplotype-pair search.

Creates `Float32` working copies of `X` and `H`, allocates the work arrays under the
`"allocation"` section of the global timer `to`, and then runs [`haplopair_time!`](@ref).

# Arguments
- `X::AbstractMatrix`: genotype matrix; its second dimension sets the length of the
  result vectors.
- `H::BitMatrix`: haplotype matrix; its second dimension `d` sets the size of the
  `d × d` and `n × d` work arrays.

# Returns
`(happairs, hapscore)`, where `happairs` is a tuple of two `Vector{Int}` of length `n`
and `hapscore` is a `Vector{Float32}` of length `n`, with `n = size(X, 2)`.
"""
function haplopair_time(
    X::AbstractMatrix,
    H::BitMatrix
    )

    # Float32 working copies; creating them is intentionally outside the timed section.
    # NOTE(review): initXfloat! presumably fills Xf from X — confirm in MendelImpute.
    Xf = zeros(Float32, size(X))
    Hf = Matrix{Float32}(H)
    MendelImpute.initXfloat!(X, Xf)

    @timeit to "allocation" begin
        n = size(X, 2)
        d = size(H, 2)
        M = zeros(Float32, d, d)
        N = zeros(Float32, n, d)
        happairs = (ones(Int, n), ones(Int, n))
        hapscore = zeros(Float32, n)
    end

    haplopair_time!(Xf, Hf, M, N, happairs, hapscore)
    return happairs, hapscore
end

"""
    haplopair_time!(X, H, M, N, happairs, hapscore)

In-place haplotype-pair search with every stage recorded in the global timer `to`.

Stages (timer section in quotes):
1. `"matmul"`: `M = HᵀH`, whose strict upper triangle is then replaced by
   `2M[i, j] + M[i, i] + M[j, j]` and whose diagonal is multiplied by 4;
   `N = 2XᵀH`.
2. `"search mat"`: `MendelImpute.haplopair!(happairs, hapscore, M, N)`.
3. `"const term"`: adds `sum(abs2, X[:, j])` to `hapscore[j]` for every column `j`.

`M`, `N`, `happairs` and `hapscore` are overwritten. Returns `nothing`.
"""
function haplopair_time!(
    X::AbstractMatrix,
    H::AbstractMatrix,
    M::AbstractMatrix,
    N::AbstractMatrix,
    happairs::Tuple{AbstractVector, AbstractVector},
    hapscore::AbstractVector
    )

    nrows, ncols = size(X, 1), size(X, 2)
    nhaps = size(H, 2)

    @timeit to "matmul" begin
        # assemble M (upper triangular only)
        mul!(M, Transpose(H), H)
        for col in 1:nhaps
            # off-diagonal entries; these must be updated while the diagonal still
            # holds the unscaled values
            for row in 1:(col - 1)
                M[row, col] = 2M[row, col] + M[row, row] + M[col, col]
            end
        end
        for k in 1:nhaps
            M[k, k] *= 4
        end

        # assemble N
        mul!(N, Transpose(X), H)
        N .*= 2
    end

    # computational routine
    @timeit to "search mat" MendelImpute.haplopair!(happairs, hapscore, M, N)

    # supplement the constant terms in objective
    @timeit to "const term" begin
        @inbounds for c in 1:ncols
            @simd for r in 1:nrows
                hapscore[c] += abs2(X[r, c])
            end
        end
    end

    return nothing
end

# Run the instrumented search once on the aligned window data, then print the timings gathered in `to`.
haplopair_time(Xw_aligned, Hw_aligned)
print_timer(to)

[0m[1m ─────────────────────────────────────────────────────────────────────[22m
[0m[1m                      [22m        Time                   Allocations      
                      ──────────────────────   ───────────────────────
   Tot / % measured:        195s / 99.2%           2.12GiB / 87.6%    

 Section      ncalls     time   %tot     avg     alloc   %tot      avg
 ─────────────────────────────────────────────────────────────────────
 search mat        1     187s  97.0%    187s     0.00B  0.00%    0.00B
 matmul            1    4.54s  2.35%   4.54s     0.00B  0.00%    0.00B
 allocation        1    1.17s  0.61%   1.17s   1.86GiB  100%   1.86GiB
 const term        1    524μs  0.00%   524μs     0.00B  0.00%    0.00B
[0m[1m ─────────────────────────────────────────────────────────────────────[22m

In [16]:
# w = 2, keep top happairs within some range of the best one
# NOTE(review): the setup cell of this section sets w = 10; presumably this timing was
# recorded in a session where the data had been aligned for w = 2 — confirm.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned);

377.271177 seconds (2.02 k allocations: 2.081 GiB, 0.05% gc time)


In [None]:
# searching single bkpts, deleting suboptimal pairs in dp, skip windows with <100 typed snps
# End-to-end run: phase and impute the masked target, then compare with the complete genotypes.
chr = 22
maf = 0.1
width = 2048

Random.seed!(2020) # fixed seed for reproducibility
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# presumably writes the imputed genotypes to `outfile`, which is read back below — confirm
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate = fraction of genotype entries where the imputed matrix differs from the complete one.
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:10[39m


running window 161 / 255
running window 193 / 255
running window 225 / 255
running window 129 / 255
running window 97 / 255
running window 1 / 255
running window 65 / 255
running window 33 / 255
running window 2 / 255
window 129 completed haplopair routine
window 129 completed compute_redundant_haplotypes
running window 130 / 255
window 65 completed haplopair routine
window 193 completed haplopair routine


[32mComputing optimal haplotype pairs...  1%|▏              |  ETA: 4:08:22[39m