# Test imputation of untyped SNPs on chromosome 22

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices

In [2]:
# Show how many threads this Julia session has available.
Threads.nthreads()

8

In [2]:
# Experiment settings; `chr` and `maf` are only interpolated into file names here,
# `width` is the window width used to derive the number of windows.
chr = 22
maf = 0.1
width = 2048

# Input files: masked target genotypes (VCF) and the saved reference panel (JLSO).
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"

# Load the serialized reference data and keep its `:compressed_Hunique` entry.
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# Import the target genotypes together with per-SNP / per-sample metadata.
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(
    UInt8,
    tgtfile;
    trans = true,
    save_snp_info = true,
    msg = "Importing genotype file...",
)

# Window bookkeeping: only whole windows of `width` SNPs are counted.
ref_snps = compressed_Hunique.snps
windows = fld(ref_snps, width)

# Samples are taken to be the columns of `X` (second dimension).
people = size(X, 2)

# One (initially empty) list of haplotype-index pairs per person and window,
# plus an uninitialised slot per window for the typed-SNP indices.
redundant_haplotypes = [[Tuple{Int, Int}[] for _ in 1:windows] for _ in 1:people]
typed_snps = Vector{Vector{Int}}(undef, windows);

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


In [8]:
# Show the `snps` field of the loaded reference data (the value the setup cell
# stores in `ref_snps` and divides by `width` to get `windows`).
compressed_Hunique.snps

523162

In [9]:
# Show the number of windows computed in the setup cell.
windows

255

In [10]:
# Print the number of typed SNPs in each window and record their indices.
#
# For every window, the target SNP positions (`X_pos`) are matched against the
# reference positions of that window; the matching reference indices are stored
# in `typed_snps[w]`.
#
# Only this index bookkeeping is needed here, so the aligned genotype and
# haplotype matrices are deliberately NOT built: slicing `X` and `uniqueH` for
# every window just to throw the copies away was wasted allocation.
for w in 1:windows
    cur_range = compressed_Hunique.CWrange[w]
    Hw_pos = compressed_Hunique.pos[cur_range]
    XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
    XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
    typed_snps[w] = XtoH_rm_nothing # save typed snps index for current window
    print(length(typed_snps[w]), ", ")
end

40, 116, 189, 297, 236, 347, 265, 240, 307, 269, 285, 200, 303, 219, 182, 255, 280, 98, 140, 227, 200, 212, 284, 225, 320, 182, 215, 228, 188, 206, 268, 176, 226, 140, 173, 282, 513, 377, 212, 284, 410, 268, 266, 231, 164, 222, 296, 262, 268, 324, 289, 74, 123, 70, 130, 207, 335, 191, 288, 326, 206, 148, 139, 91, 262, 233, 152, 244, 339, 205, 344, 305, 322, 356, 253, 230, 311, 301, 303, 250, 120, 67, 70, 84, 125, 187, 115, 248, 157, 201, 264, 223, 161, 144, 137, 220, 278, 192, 248, 201, 223, 161, 124, 126, 54, 104, 104, 209, 343, 249, 292, 254, 317, 220, 285, 202, 151, 186, 183, 212, 309, 168, 248, 289, 422, 196, 155, 179, 260, 176, 343, 216, 189, 279, 190, 160, 68, 238, 356, 164, 259, 196, 193, 244, 281, 277, 354, 282, 160, 200, 170, 94, 155, 256, 169, 109, 222, 178, 232, 215, 322, 261, 152, 196, 112, 136, 167, 223, 118, 207, 241, 122, 185, 202, 185, 144, 235, 216, 222, 180, 223, 219, 290, 361, 232, 245, 256, 241, 245, 185, 234, 301, 389, 287, 320, 270, 367, 316, 226, 271, 315, 227, 2

In [11]:
# Align target and reference data on the typed SNPs of one window, then time the
# haplotype-pair search and the redundant-haplotype bookkeeping for that window.
w = 2

# Reference positions that belong to window `w`.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]

# XtoH_idx[i] is the index in Hw_pos matching X_pos[i], or `nothing` if absent.
XtoH_idx = indexin(X_pos, Hw_pos)

# Keep only the matches (drops SNPs that are not in this window of the ref panel).
XtoH_rm_nothing = XtoH_idx[.!isnothing.(XtoH_idx)]
typed_snps[w] = XtoH_rm_nothing

# Rows of X / uniqueH restricted to the matched SNPs, in matching order.
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]

# Note: the first @time of each function in a session also includes compilation.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

331.433073 seconds (3.64 M allocations: 2.250 GiB, 0.01% gc time)
 16.217426 seconds (87.71 k allocations: 7.104 GiB, 15.12% gc time)


In [3]:
# Profile the haplotype-pair search on a single window.
#
# The work arrays below are built from `Xw_aligned` / `Hw_aligned` directly.
# Rebinding the session-global `X` to the window slice (as `X, H = Xw_aligned,
# Hw_aligned` would) must be avoided: `X` is the full genotype matrix that the
# alignment code indexes with `X[findall(...), :]`, so clobbering it makes this
# cell (and every other alignment cell) wrong or failing when re-run.
w = 2
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

using ProfileView
using Profile
# Large sample buffer and a coarse 0.1 s sampling interval for a long-running call.
Profile.init(n = 10^7, delay = 0.1)

# Small throw-away problem profiled first -- presumably to trigger compilation
# before the real run (TODO confirm intent).
t = Matrix{Union{UInt8, Missing}}(undef, 100, 100)
t .= rand(UInt8, 100, 100);
@profview happairs, hapscore = haplopair(t, falses(100, 100));
# @profview happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)

# Set up the Float32 work arrays by hand so that only `haplopair!` is profiled.
Xwork = zeros(Float32, size(Xw_aligned, 1), size(Xw_aligned, 2))
Hwork = convert(Matrix{Float32}, Hw_aligned)
MendelImpute.initXfloat!(Xw_aligned, Xwork)

p, n     = size(Xw_aligned)
d        = size(Hw_aligned, 2)
M        = zeros(Float32, d, d)
N        = zeros(Float32, n, d)
happairs = [Tuple{Int, Int}[] for i in 1:n]
hapscore = zeros(Float32, n)
sizehint!.(happairs, 100) # will not save > 100 unique haplotype pairs to conserve memory
@profview haplopair!(Xwork, Hwork, M, N, happairs, hapscore)

Gtk.GtkWindowLeaf(name="", parent, width-request=-1, height-request=-1, visible=TRUE, sensitive=TRUE, app-paintable=FALSE, can-focus=FALSE, has-focus=FALSE, is-focus=FALSE, focus-on-click=TRUE, can-default=FALSE, has-default=FALSE, receives-default=FALSE, composite-child=FALSE, style, events=0, no-show-all=FALSE, has-tooltip=FALSE, tooltip-markup=NULL, tooltip-text=NULL, window, opacity=1.000000, double-buffered, halign=GTK_ALIGN_FILL, valign=GTK_ALIGN_FILL, margin-left, margin-right, margin-start=0, margin-end=0, margin-top=0, margin-bottom=0, margin=0, hexpand=FALSE, vexpand=FALSE, hexpand-set=FALSE, vexpand-set=FALSE, expand=FALSE, scale-factor=2, border-width=0, resize-mode, child, type=GTK_WINDOW_TOPLEVEL, title="Profile", role=NULL, resizable=TRUE, modal=FALSE, window-position=GTK_WIN_POS_NONE, default-width=800, default-height=600, destroy-with-parent=FALSE, hide-titlebar-when-maximized=FALSE, icon, icon-name=NULL, screen, type-hint=GDK_WINDOW_TYPE_HINT_NORMAL, skip-taskbar-hint

In [16]:
# Time `haplopair` on the aligned target/reference matrices of the current window;
# the trailing `;` suppresses display of the returned values.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned);

377.271177 seconds (2.02 k allocations: 2.081 GiB, 0.05% gc time)


In [None]:
# Variant under test: search single breakpoints, delete suboptimal pairs in dp,
# skip windows with < 100 typed SNPs.
chr = 22
maf = 0.1
width = 2048

# Input target / reference files and the VCF that will receive the imputed genotypes.
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"

# Fix the global RNG seed before running so repeated runs are comparable.
Random.seed!(2020)
@time ph, hs = phase(tgtfile, reffile; outfile = outfile, impute = true, width = width)

# Compare the imputed genotypes against the complete (unmasked) target genotypes:
# the error is the fraction of entries that differ.
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_mendel = convert_gt(UInt8, outfile)
n, p = size(X_complete)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:09[39m
