# Test imputation of untyped SNPs on chromosome 20

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


# JLSO compression

In [2]:
# Build one compressed reference panel (.jlso file) per (maf, window width) pair.
widths = [64, 128, 256, 512, 1024, 2048]
mafs = [0.01]

# The reference file name is fixed, so it is imported once, ahead of both loops.
reffile = "ref.chr20.excludeTarget.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="importing reference data...",
)

for maf in mafs
    # The target file name depends on maf only, so it is read once per maf value.
    tgtfile = "target.chr20.typedOnly.maf$maf.masked.vcf.gz"
    X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(
        UInt8,
        tgtfile;
        trans=true,
        save_snp_info=true,
        msg="Importing genotype file...",
    )

    for width in widths
        outfile = "ref.chr20.$width.maf$maf.excludeTarget.jlso"
        # `@time` prints elapsed time and allocations for each compression run.
        @time compress_haplotypes(
            H, X, outfile, X_pos, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt, width
        )
    end
end

[32mimporting reference data...100%|████████████████████████| Time: 1:58:27[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:47[39m


1448.312517 seconds (25.33 M allocations: 208.233 GiB, 6.09% gc time)
860.528588 seconds (2.32 M allocations: 111.641 GiB, 3.83% gc time)
744.464060 seconds (2.09 M allocations: 65.391 GiB, 3.37% gc time)
924.790384 seconds (1.95 M allocations: 43.912 GiB, 3.62% gc time)
1253.220605 seconds (1.87 M allocations: 36.084 GiB, 3.08% gc time)
1069.813418 seconds (1.83 M allocations: 36.346 GiB, 3.18% gc time)


In [4]:
;ls -al ref.chr20.excludeTarget.vcf.gz

-rw-r--r--  1 biona001  staff  1510126972 Jun 23 14:01 ref.chr20.excludeTarget.vcf.gz


In [5]:
;ls -al ref.chr20.64.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  681365591 Jun 23 16:36 ref.chr20.64.maf0.01.excludeTarget.jlso


In [6]:
;ls -al ref.chr20.128.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  508560677 Jun 23 16:50 ref.chr20.128.maf0.01.excludeTarget.jlso


In [7]:
;ls -al ref.chr20.256.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  439346754 Jun 23 17:03 ref.chr20.256.maf0.01.excludeTarget.jlso


In [8]:
;ls -al ref.chr20.512.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  501298920 Jun 23 17:18 ref.chr20.512.maf0.01.excludeTarget.jlso


In [9]:
;ls -al ref.chr20.1024.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  712678854 Jun 23 17:39 ref.chr20.1024.maf0.01.excludeTarget.jlso


In [10]:
;ls -al ref.chr20.2048.maf0.01.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  1081480064 Jun 23 17:57 ref.chr20.2048.maf0.01.excludeTarget.jlso


# Average number of unique haplotypes

In [22]:
# For every window width, load the matching compressed reference panel and print the
# mean number of unique haplotypes per typed window.
for width in [64, 128, 256, 512, 1024, 2048]
    @time loaded = JLSO.load("ref.chr20.$width.maf0.01.excludeTarget.jlso")
    @time compressed_Hunique = loaded[:compressed_Hunique]
    windows = length(compressed_Hunique.CW_typed)

    # Sum the column counts of each window's `uniqueH`. `init=0` keeps the accumulator
    # an integer and makes an empty window list yield 0 (so the mean below is NaN).
    total_unique = mapreduce(
        cw -> size(cw.uniqueH, 2), +, compressed_Hunique.CW_typed; init=0
    )

    # A separate variable holds the floating-point mean, so no variable changes type.
    num_unique = total_unique / windows
    println("width = $width has ~ $num_unique unique haplotypes per window\n")
end

 25.850278 seconds (12.59 M allocations: 5.699 GiB, 19.83% gc time)
  0.000001 seconds
width = 64 has ~ 873.78486912872 unique haplotypes per window

 21.706964 seconds (12.51 M allocations: 3.673 GiB, 27.18% gc time)
  0.000001 seconds
width = 128 has ~ 2497.544476327116 unique haplotypes per window

 19.490517 seconds (12.46 M allocations: 3.285 GiB, 28.05% gc time)
  0.000001 seconds
width = 256 has ~ 6985.797704447633 unique haplotypes per window

 23.512566 seconds (12.44 M allocations: 4.301 GiB, 29.49% gc time)
  0.000001 seconds
width = 512 has ~ 17234.44252873563 unique haplotypes per window

 33.005762 seconds (12.43 M allocations: 6.088 GiB, 26.90% gc time)
  0.000001 seconds
width = 1024 has ~ 32428.72988505747 unique haplotypes per window

 68.480450 seconds (12.42 M allocations: 7.605 GiB, 52.07% gc time)
  0.000000 seconds
width = 2048 has ~ 45384.942528735635 unique haplotypes per window



In [2]:
# Load the compressed reference panel and the target genotypes, then allocate the
# per-person, per-window storage used when exploring a single window.
Random.seed!(2020)
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
# NOTE(review): this name has a "w" before the width, while other cells in this
# notebook use names without it -- confirm which file actually exists on disk.
reffile = "ref.chr20.w64.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
width   = 64

loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")

people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
# Number of whole windows of `width` rows; integer division avoids a float round trip.
windows = div(tgt_snps, width)

# One empty vector of (Int32, Int32) pairs for every (person, window) combination.
redundant_haplotypes = [[Tuple{Int32, Int32}[] for i in 1:windows] for j in 1:people]

# Reserve capacity for 1000 pairs in each vector. `sizehint!` only pre-allocates; it
# does not limit the length, so any 1000-pair cap must be enforced by the code that
# fills these vectors. A plain loop avoids building a throwaway array of results.
for person_haps in redundant_haplotypes, haps in person_haps
    sizehint!(haps, 1000)
end
num_unique_haps = 0;

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m


In [6]:
# Memory, in bytes, of all unique objects reachable from the loaded compressed panel.
Base.summarysize(compressed_Hunique)

5096466809

In [8]:
# Memory, in bytes, of all unique objects reachable from the per-person, per-window storage.
Base.summarysize(redundant_haplotypes)

133920040

In [11]:
# Time the haplotype-pair search and the redundancy update on a single window.
w = 10
Hw_aligned = compressed_Hunique.CW_typed[w].uniqueH

# Row range of X for window w; the last window extends to the final entry of X_pos.
Xw_idx_start = width * (w - 1) + 1
if w == windows
    Xw_idx_end = length(X_pos)
else
    Xw_idx_end = w * width
end
Xw_aligned = X[Xw_idx_start:Xw_idx_end, :]

# The value of the second call is the last expression, so it becomes the cell output.
@time happairs, hapscore, t1, t2, t3 = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

  0.108125 seconds (28 allocations: 3.569 MiB)
  0.348203 seconds (38 allocations: 513.578 KiB)


(0.0001159999999999999, 0.34750362300000015, 0.0004779989999999998)

In [4]:
# Benchmark the redundancy update for window `w`. The arguments are non-constant
# globals, so they are interpolated with `$` to keep global-variable lookup out of the
# measurement.
# NOTE(review): the `!` suffix suggests `redundant_haplotypes` is mutated on every
# benchmark sample -- confirm that repeated runs are harmless.
@btime compute_redundant_haplotypes!(
    $redundant_haplotypes, $compressed_Hunique, $happairs, $w
)

  398.734 ms (30 allocations: 513.34 KiB)


In [15]:
# Destructure the three values returned by the call into t1, t2, t3.
# NOTE(review): the names suggest per-stage timings -- confirm against the function.
t1, t2, t3 = compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

(9.732700000000017e-5, 0.3735346199999999, 0.000452377)

In [17]:
# Inspect one person: show that person's entries in the two `happairs` vectors, then
# display the pairs stored for that person in window `w`.
person = 100
# `@show` prints the expression text together with its value.
@show happairs[1][person], happairs[2][person]
redundant_haplotypes[person][w]

((happairs[1])[person], (happairs[2])[person]) = (1, 7)


1000-element Array{Tuple{Int32,Int32},1}:
 (1, 20)   
 (1, 38)   
 (1, 61)   
 (1, 88)   
 (1, 97)   
 (1, 163)  
 (1, 168)  
 (1, 177)  
 (1, 180)  
 (1, 266)  
 (1, 334)  
 (1, 520)  
 (1, 583)  
 ⋮         
 (1, 34041)
 (1, 34050)
 (1, 34066)
 (1, 34116)
 (1, 34151)
 (1, 34172)
 (1, 34180)
 (1, 34212)
 (1, 34222)
 (1, 34242)
 (1, 34248)
 (1, 34350)

In [27]:
# Print the `hapmap` field of typed window 1000 as one line of text.
println(compressed_Hunique.CW_typed[1000].hapmap)

[1, 1, 1, 1, 1, 6, 1, 8, 1, 8, 11, 12, 1, 1, 1, 6, 17, 18, 1, 6, 6, 1, 1, 1, 11, 1, 18, 1, 1, 6, 31, 1, 1, 34, 1, 36, 6, 1, 18, 6, 1, 1, 6, 31, 6, 1, 11, 6, 1, 1, 1, 18, 1, 1, 1, 1, 1, 8, 1, 6, 1, 8, 8, 1, 1, 1, 36, 1, 31, 31, 1, 1, 1, 12, 8, 12, 1, 17, 1, 8, 31, 1, 1, 1, 1, 18, 87, 88, 36, 1, 1, 1, 6, 1, 11, 6, 1, 12, 8, 6, 1, 18, 1, 1, 6, 1, 1, 1, 1, 1, 1, 31, 1, 6, 1, 12, 1, 1, 1, 1, 6, 1, 123, 31, 1, 1, 18, 18, 1, 130, 1, 1, 1, 11, 135, 1, 135, 130, 6, 8, 1, 6, 12, 11, 6, 11, 135, 1, 1, 12, 31, 123, 6, 1, 6, 1, 6, 8, 8, 6, 1, 8, 6, 1, 1, 11, 8, 6, 1, 1, 6, 1, 1, 1, 6, 6, 12, 11, 6, 6, 6, 88, 1, 1, 36, 36, 1, 8, 6, 11, 6, 1, 1, 6, 1, 6, 6, 6, 1, 6, 88, 1, 1, 1, 1, 8, 11, 1, 8, 6, 6, 11, 36, 214, 6, 1, 1, 6, 1, 1, 6, 1, 1, 1, 6, 123, 1, 6, 1, 6, 6, 12, 135, 1, 1, 1, 6, 130, 1, 1, 1, 1, 1, 6, 12, 12, 31, 1, 6, 6, 31, 1, 1, 31, 12, 1, 1, 6, 12, 36, 1, 6, 12, 11, 17, 266, 6, 1, 12, 31, 1, 130, 6, 135, 17, 1, 1, 31, 1, 6, 31, 135, 88, 214, 1, 1, 1, 6, 6, 214, 6, 1, 6, 31, 1, 1, 12, 8, 1,

# MendelImpute error rate

In [2]:
# Number of threads available to this Julia session.
Threads.nthreads()

8

In [2]:
# Impute with window width 64, then score the output against the complete genotypes.
Random.seed!(2020)
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
# NOTE(review): confirm this file name matches a file on disk; other cells in this
# notebook refer to reference panels with a "w" before the width.
reffile = "ref.chr20.64.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
width   = 64
@time ph = phase(tgtfile, reffile; outfile=outfile, width=width);

# Read back the imputed genotypes and the complete genotypes as Float32 data.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")

# Error rate: number of entries that differ, divided by both dimensions of X_mendel.
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


SystemError: SystemError: opening file "ref.chr20.64.maf0.01.excludeTarget.jlso": No such file or directory

In [3]:
;ls

HRC.r1-1.EGA.GRCh37.chr20.haplotypes.vcf.gz
HRC.r1-1.EGA.GRCh37.chr21.haplotypes.vcf.gz
HRC.r1-1.EGA.GRCh37.chr22.haplotypes.vcf.gz
chr20.ipynb
chr21.ipynb
chr22.ipynb
ref.chr20.excludeTarget.vcf.gz
ref.chr20.w512.maf0.01.excludeTarget.jlso
target.chr20.full.vcf.gz
target.chr20.typedOnly.maf0.01.masked.vcf
target.chr20.typedOnly.maf0.01.masked.vcf.gz
target.chr20.typedOnly.maf0.01.vcf.gz
target.chr21.full.vcf.gz
target.chr21.typedOnly.maf0.05.masked.vcf.gz
target.chr21.typedOnly.maf0.05.vcf.gz
target.chr22.full.vcf.gz
target.chr22.typedOnly.maf0.1.masked.vcf.gz
target.chr22.typedOnly.maf0.1.vcf.gz
