# Test imputation on untyped SNPs (chromosome 22)

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices

In [2]:
# Display the number of threads this Julia session was started with.
Threads.nthreads()

8

In [3]:
# Data-set parameters; they are interpolated into the file names below.
# NOTE(review): the notebook title and the final cell refer to chromosome 22,
# while this cell uses 21 -- confirm that this is intended.
chr = 21
maf = 0.05
width = 2048

tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"

# Load the JLSO archive and pull out the compressed reference haplotypes.
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# Import the target genotypes together with their per-SNP/per-sample metadata.
# `trans=true` is requested, and `size(X, 2)` is used as the sample count below.
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(
    UInt8,
    tgtfile;
    trans=true,
    save_snp_info=true,
    msg="Importing genotype file...",
)

people = size(X, 2)
ref_snps = compressed_Hunique.snps
# Number of full-width windows (floored integer division).
windows = fld(ref_snps, width)
# One empty list of haplotype-index pairs per (sample, window).
redundant_haplotypes = [[Tuple{Int, Int}[] for _ in 1:windows] for _ in 1:people]
# Filled per window by later cells with the typed-SNP indices.
typed_snps = Vector{Vector{Int}}(undef, windows);

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:10[39m


In [4]:
# Display the `snps` field of the compressed reference panel (the value used as
# `ref_snps` when the number of windows is computed).
compressed_Hunique.snps

530007

In [5]:
# Display the number of windows computed in the setup cell.
windows

258

In [6]:
# Record and print the number of typed SNPs in each window.
#
# For every window `w`, the target positions `X_pos` are matched against the
# reference-panel positions belonging to that window. The matching indices are
# stored in `typed_snps[w]` and their count is printed, comma-separated.
for w in 1:windows
    cur_range = compressed_Hunique.CWrange[w]
    Hw_pos = compressed_Hunique.pos[cur_range]
    XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
    # `indexin` yields `nothing` for target SNPs absent from this window; drop
    # those. Only the indices are needed here, so the aligned genotype and
    # haplotype sub-matrices are deliberately not materialised.
    typed_snps[w] = Base.filter(!isnothing, XtoH_idx)
    print(length(typed_snps[w]), ", ")
end

49, 39, 84, 146, 193, 182, 245, 371, 466, 630, 369, 299, 239, 400, 242, 256, 274, 183, 202, 320, 216, 161, 271, 162, 185, 311, 35, 271, 241, 278, 314, 212, 256, 245, 295, 270, 284, 238, 339, 386, 377, 283, 298, 390, 483, 348, 374, 397, 264, 290, 330, 308, 326, 259, 230, 335, 282, 623, 274, 434, 304, 316, 336, 209, 340, 339, 374, 328, 403, 319, 111, 278, 400, 234, 310, 157, 224, 327, 379, 316, 342, 432, 333, 380, 287, 182, 451, 402, 321, 284, 435, 312, 314, 335, 187, 246, 283, 240, 354, 258, 246, 248, 313, 317, 318, 537, 393, 400, 492, 366, 430, 323, 287, 284, 287, 400, 375, 363, 379, 340, 297, 445, 420, 245, 256, 131, 119, 208, 296, 293, 367, 193, 420, 246, 240, 388, 276, 275, 284, 204, 278, 380, 249, 403, 354, 321, 327, 259, 303, 226, 273, 344, 331, 413, 299, 178, 108, 280, 332, 216, 241, 246, 429, 229, 167, 262, 327, 301, 296, 236, 357, 204, 243, 427, 407, 281, 313, 330, 285, 297, 351, 323, 239, 343, 237, 378, 363, 287, 173, 249, 340, 340, 249, 322, 427, 350, 375, 313, 427, 418, 394,

In [7]:
w = 2 # 39 SNPs
# Reference-panel positions that belong to window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
# XtoH_idx[i] satisfies X_pos[i] == Hw_pos[XtoH_idx[i]], or is `nothing` when
# target SNP i has no match in this window.
XtoH_idx = indexin(X_pos, Hw_pos)
# One mask selects both the matched target rows and their reference indices.
in_window = .!isnothing.(XtoH_idx)
XtoH_rm_nothing = XtoH_idx[in_window]
Xw_aligned = X[in_window, :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# Time the haplotype-pair search, then the redundant-haplotype bookkeeping.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

697.783005 seconds (3.64 M allocations: 3.736 GiB, 0.01% gc time)
  7.041011 seconds (86.94 k allocations: 699.708 MiB)


In [8]:
w = 8 # 371 SNPs
# Reference-panel positions that belong to window w.
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
# XtoH_idx[i] satisfies X_pos[i] == Hw_pos[XtoH_idx[i]], or is `nothing` when
# target SNP i has no match in this window.
XtoH_idx = indexin(X_pos, Hw_pos)
# One mask selects both the matched target rows and their reference indices.
in_window = .!isnothing.(XtoH_idx)
XtoH_rm_nothing = XtoH_idx[in_window]
Xw_aligned = X[in_window, :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# Time the haplotype-pair search, then the redundant-haplotype bookkeeping.
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

486.585927 seconds (2.02 k allocations: 2.628 GiB, 0.11% gc time)
  7.574592 seconds (11.11 k allocations: 712.886 MiB, 12.91% gc time)


In [None]:
# End-to-end run. Author's note on the configuration being tested: searching
# single breakpoints, deleting suboptimal pairs in the dynamic programming step,
# and skipping windows with <100 typed SNPs.
chr = 22
maf = 0.1
width = 2048

# Input and output paths derived from the parameters above.
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"

# Seed the global RNG immediately before the timed phasing/imputation call.
Random.seed!(2020)
@time ph, hs = phase(tgtfile, reffile; outfile=outfile, impute=true, width=width)

# Compare the imputed genotypes against the complete genotypes, entry by entry.
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_mendel = convert_gt(UInt8, outfile)
n, p = size(X_complete)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:09[39m
