# Test imputation on untyped SNPs chrom 18

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


### Memory requirement

**Prephasing step:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000$ (max haplotypes per window) $* 16$ bytes of RAM

## Generate subset of markers for prephasing

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
"""
    filter_and_mask(; chromosomes=[18], n=100,
                    mafs=[0.0005, 0.001, 0.01, 0.1, 0.45], missingprop=0.001)

Create the target and reference VCF files used by the imputation experiments.
All outputs are written relative to the current working directory.

For each chromosome `chr` in `chromosomes`:

1. Records flagged by `find_duplicate_marker` are dropped and the result is saved as
   `chr<chr>.uniqueSNPs.vcf.gz`.
2. `n` samples are picked at random. They are written with every record to
   `target.chr<chr>.full.vcf.gz`; all remaining samples are written to
   `ref.chr<chr>.excludeTarget.vcf.gz`.
3. For each threshold `maf` in `mafs`, the target samples restricted to records whose
   `maf_by_record` value (as returned by `gtstats`) exceeds `maf` are written to
   `target.chr<chr>.typedOnly.maf<maf>.vcf.gz`. That file is then passed through
   `mask_gt` with `unphase=true`, masking each entry independently with probability
   `missingprop`, and saved as `target.chr<chr>.typedOnly.maf<maf>.masked.vcf.gz`.

# Keywords
- `chromosomes`: chromosome labels to process.
- `n`: number of samples moved into the target panel.
- `mafs`: thresholds used to select the typed SNPs.
- `missingprop`: per-entry masking probability (default `0.001`, i.e. 0.1%).

# Throws
- `ArgumentError` if `missingprop` is outside `[0, 1]` or `n` is not between 0 and the
  number of samples in the filtered file.

Sampling and masking draw from the global RNG; call `Random.seed!` beforehand to make
the generated files reproducible.
"""
function filter_and_mask(; chromosomes = [18], n::Integer = 100,
        mafs = [0.0005, 0.001, 0.01, 0.1, 0.45], missingprop::Real = 0.001)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must lie in [0, 1], got $missingprop"))

    for chr in chromosomes
        # filter chromosome data for unique snps
        data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
        uniquefile = "chr$chr.uniqueSNPs.vcf.gz"
        full_record_index = .!find_duplicate_marker(data)
        @time VCFTools.filter(data, full_record_index, 1:nsamples(data),
            des = uniquefile)

        # summarize data
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats(uniquefile)
        0 <= n <= samples ||
            throw(ArgumentError("n must lie in 0:$samples, got $n"))

        # generate target panel with all snps: mark n samples, then shuffle the
        # marks so the chosen samples are random
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter(uniquefile, 1:total_snps,
            sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

        # also generate reference panel without target samples
        @time VCFTools.filter(uniquefile, 1:total_snps,
            .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)

        for maf in mafs
            typedfile = "target.chr$chr.typedOnly.maf$maf.vcf.gz"

            # generate target file with n samples and typed snps with certain maf
            my_maf = findall(x -> x > maf, maf_by_record)
            p = length(my_maf)
            record_idx = falses(total_snps)
            record_idx[my_maf] .= true
            @time VCFTools.filter(uniquefile, record_idx, sample_idx,
                des = typedfile, allow_multiallelic=false)

            # unphase and mask a `missingprop` fraction of entries in target file.
            # One scalar rand() per entry, in column-major order, so a given seed
            # always yields the same mask.
            masks = falses(p, n)
            for j in 1:n, i in 1:p
                rand() < missingprop && (masks[i, j] = true)
            end
            @time mask_gt(typedfile, masks,
                des="target.chr$chr.typedOnly.maf$maf.masked.vcf.gz", unphase=true)
        end
    end
end
Random.seed!(2020)
@time filter_and_mask()

752.076342 seconds (6.51 G allocations: 615.353 GiB, 7.01% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:08:17[39m


549.100648 seconds (6.86 G allocations: 650.723 GiB, 10.18% gc time)
1139.017251 seconds (16.68 G allocations: 1.249 TiB, 10.94% gc time)
599.893049 seconds (6.86 G allocations: 650.911 GiB, 10.61% gc time)
 36.158916 seconds (267.39 M allocations: 27.452 GiB, 6.96% gc time)
608.676819 seconds (6.83 G allocations: 648.780 GiB, 10.30% gc time)
 35.610763 seconds (249.89 M allocations: 25.655 GiB, 6.86% gc time)
560.687780 seconds (6.61 G allocations: 626.620 GiB, 10.27% gc time)
 16.352906 seconds (119.38 M allocations: 12.252 GiB, 6.40% gc time)
520.047088 seconds (6.51 G allocations: 615.447 GiB, 10.21% gc time)
  8.014085 seconds (53.29 M allocations: 5.482 GiB, 5.36% gc time)
516.890704 seconds (6.43 G allocations: 607.361 GiB, 10.60% gc time)
  0.831218 seconds (5.35 M allocations: 563.018 MiB, 5.66% gc time)
6410.931663 seconds (76.92 G allocations: 6.823 TiB, 10.06% gc time)


# Try jlso compression

In [3]:
# Sweep over window widths and typed-SNP MAF thresholds, writing one compressed
# reference file (.jlso) per (maf, width) combination and timing each call.
widths = [64, 128, 256, 512, 1024, 2048]
mafs = [0.1, 0.45]

# The reference panel is imported once and reused for every combination.
reffile = "ref.chr18.excludeTarget.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile, trans=true, save_snp_info=true, msg="importing reference data...")

for maf in mafs
    # The target file depends on maf only, so it is imported once per maf.
    tgtfile = "target.chr18.typedOnly.maf$maf.masked.vcf.gz"
    X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")
    for width in widths
        # Output name encodes both the window width and the maf threshold.
        outfile = "ref.chr18.$width.maf$maf.excludeTarget.jlso"
        @time compress_haplotypes(H, X, outfile, X_pos, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt, width)
    end
end

[32mimporting reference data...100%|████████████████████████| Time: 0:04:02[39m


284.710377 seconds (9.87 M allocations: 174.502 GiB, 7.98% gc time)
178.516175 seconds (2.21 M allocations: 89.193 GiB, 5.91% gc time)
138.308593 seconds (1.99 M allocations: 46.972 GiB, 5.17% gc time)
112.405633 seconds (1.86 M allocations: 25.760 GiB, 3.65% gc time)
103.946622 seconds (1.79 M allocations: 15.040 GiB, 2.40% gc time)
107.580908 seconds (1.76 M allocations: 9.968 GiB, 1.42% gc time)
 96.188280 seconds (1.83 M allocations: 20.970 GiB, 2.68% gc time)
 93.829112 seconds (1.78 M allocations: 12.590 GiB, 2.97% gc time)
 97.163965 seconds (1.75 M allocations: 8.556 GiB, 1.87% gc time)
 96.925367 seconds (1.74 M allocations: 6.573 GiB, 1.51% gc time)
111.183097 seconds (1.73 M allocations: 6.373 GiB, 1.26% gc time)
117.908977 seconds (1.72 M allocations: 5.992 GiB, 1.79% gc time)


In [129]:
# Re-run the compression for a single configuration (width 512, maf 0.1).
# Same code as the full sweep, with one-element lists.
widths = [512]
mafs = [0.1]

# The reference panel is imported once, outside the loops.
reffile = "ref.chr18.excludeTarget.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile, trans=true, save_snp_info=true, msg="importing reference data...")

for maf in mafs
    tgtfile = "target.chr18.typedOnly.maf$maf.masked.vcf.gz"
    X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")
    for width in widths
        # Output name encodes both the window width and the maf threshold.
        outfile = "ref.chr18.$width.maf$maf.excludeTarget.jlso"
        @time compress_haplotypes(H, X, outfile, X_pos, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt, width)
    end
end

[32mimporting reference data...100%|████████████████████████| Time: 0:07:05[39m


147.792776 seconds (15.59 M allocations: 26.415 GiB, 14.38% gc time)


In [11]:
;ls -al ref.chr18.excludeTarget.vcf.gz

-rw-r--r--@ 1 biona001  staff  247072174 May 19 23:43 ref.chr18.excludeTarget.vcf.gz


In [4]:
;ls -al ref.chr18.2048.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  274061616 Jun 19 22:39 ref.chr18.2048.maf0.45.excludeTarget.jlso


In [6]:
;ls -al ref.chr18.1024.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  259888540 Jun 19 22:37 ref.chr18.1024.maf0.45.excludeTarget.jlso


In [7]:
;ls -al ref.chr18.512.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  189521433 Jun 19 22:36 ref.chr18.512.maf0.45.excludeTarget.jlso


In [8]:
;ls -al ref.chr18.256.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  169514463 Jun 19 22:34 ref.chr18.256.maf0.45.excludeTarget.jlso


In [9]:
;ls -al ref.chr18.128.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  143137236 Jun 19 22:32 ref.chr18.128.maf0.45.excludeTarget.jlso


In [10]:
;ls -al ref.chr18.64.maf0.45.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  122370280 Jun 19 22:31 ref.chr18.64.maf0.45.excludeTarget.jlso


In [5]:
;ls -al ref.chr18.2048.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  182197211 Jun 19 22:29 ref.chr18.2048.maf0.1.excludeTarget.jlso


In [12]:
;ls -al ref.chr18.1024.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  147319546 Jun 19 22:27 ref.chr18.1024.maf0.1.excludeTarget.jlso


In [130]:
;ls -al ref.chr18.512.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  124899593 Jun 22 21:01 ref.chr18.512.maf0.1.excludeTarget.jlso


In [14]:
;ls -al ref.chr18.256.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  108411897 Jun 19 22:24 ref.chr18.256.maf0.1.excludeTarget.jlso


In [15]:
;ls -al ref.chr18.128.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  105229905 Jun 19 22:22 ref.chr18.128.maf0.1.excludeTarget.jlso


In [16]:
;ls -al ref.chr18.64.maf0.1.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  116959455 Jun 19 22:19 ref.chr18.64.maf0.1.excludeTarget.jlso


# Haplotype Thinning

In [2]:
Threads.nthreads()

8

In [3]:
# 8 threads, hamming, search top 100 unique haplotypes per window
# NOTE(review): no argument below selects the distance metric or the number of
# haplotypes searched; presumably these were toggled in the MendelImpute source
# (Revise is loaded) — confirm before comparing against other runs.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:20[39m


Each window have ~ 4206 unique haplotypes on average


[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Data import time                    = 10.2285 seconds
Computing haplotype pair time       = 20.9888 seconds
Phasing by dynamic programming time = 0.962151 seconds
Imputing time                       = 7.64746 seconds
 61.483484 seconds (316.41 M allocations: 22.896 GiB, 7.52% gc time)
error overall = 0.015155699845883544 



In [6]:
# 8 threads, hamming, search top 1000 unique haplotypes per window
# NOTE(review): no argument below selects the distance metric or the number of
# haplotypes searched; presumably these were toggled in the MendelImpute source
# (Revise is loaded) — confirm before comparing against other runs.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:56[39m


Each window have ~ 4206 unique haplotypes on average


[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Data import time                    = 8.30653 seconds
Computing haplotype pair time       = 116.482 seconds
Phasing by dynamic programming time = 0.150957 seconds
Imputing time                       = 7.68015 seconds
132.606046 seconds (231.05 M allocations: 20.794 GiB, 4.15% gc time)
error overall = 0.009443362788264631 



In [4]:
# 8 threads, euclidean, search top 1000 unique haplotypes per window
# NOTE(review): no argument below selects the distance metric or the number of
# haplotypes searched; presumably these were toggled in the MendelImpute source
# (Revise is loaded) — confirm before comparing against other runs.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:56[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Total window = 426, each with ~ 4206 unique haplotypes on avg
Data import time                    = 9.03808 seconds
Computing haplotype pair time       = 116.16 seconds
    BLAS3 mul! to get M and N           = 102.169 seconds (on thread 1)
    haplopair search                    = 7.02312 seconds (on thread 1)
    supplying constant terms            = 0.00553182 seconds (on thread 1)
    finding redundant happairs          = 0.0778568 seconds (on thread 1)
Phasing by dynamic programming time = 0.247162 seconds
Imputing time                       = 8.84356 seconds
134.422072 seconds (54.62 M allocations: 12.249 GiB, 3.16% gc time)
error overall = 0.007742639590336406 



In [3]:
# 8 threads, euclidean, search top 100 unique haplotypes per window
# NOTE(review): no argument below selects the distance metric or the number of
# haplotypes searched; presumably these were toggled in the MendelImpute source
# (Revise is loaded) — confirm before comparing against other runs.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:08[39m


Each window have ~ 4206 unique haplotypes on average


[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Data import time                    = 8.4482 seconds
Computing haplotype pair time       = 8.34869 seconds
Phasing by dynamic programming time = 0.16332 seconds
Imputing time                       = 7.12257 seconds
 24.160206 seconds (54.39 M allocations: 10.095 GiB, 14.30% gc time)
error overall = 0.012632811088878516 



# MendelImpute error rate and speed

In [131]:
Threads.nthreads()

8

In [132]:
# 8 threads, keep best happair
# NOTE(review): "keep best happair" is not selected by any argument below;
# presumably it reflects the MendelImpute source at run time — confirm.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 512
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# Uses the width- and maf-specific compressed reference naming scheme; the
# chromosome is hard-coded in the name rather than interpolated from `chr`.
reffile = "ref.chr18.$width.maf$maf.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:51[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Total windows = 331, averaging ~ 4096 unique haplotypes per window.

Timings: 
    Data import                     = 15.1827 seconds
    Computing haplotype pair        = 111.942 seconds
        BLAS3 mul! to get M and N      = 11.4561 seconds (on thread 1)
        haplopair search               = 38.4227 seconds (on thread 1)
        supplying constant terms       = 0.00239734 seconds (on thread 1)
        finding redundant happairs     = 0.0444248 seconds (on thread 1)
    Phasing by dynamic programming  = 0.183461 seconds
    Imputation                      = 124.245 seconds

251.544015 seconds (1.29 G allocations: 52.105 GiB, 20.01% gc time)
error overall = 0.006573465696772938 



In [4]:
# Manual, step-by-step version of the haplotype-pair search, for debugging.
# Relies on `reffile`, `tgtfile` and `width` set by an earlier cell.
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")

# X is indexed [snp, person] below, so columns are people and rows are typed SNPs.
people = size(X, 2)
tgt_snps = size(X, 1)
ref_snps = length(compressed_Hunique.pos)
windows = floor(Int, tgt_snps / width)

# One (initially empty) list of haplotype-index pairs per person per window.
redundant_haplotypes = [[Tuple{Int, Int}[] for i in 1:windows] for j in 1:people]
# don't save >1000 redundant happairs
# (plain loop: a comprehension here would only build a throwaway array)
for j in 1:people, i in 1:windows
    sizehint!(redundant_haplotypes[j][i], 1000)
end
Threads.@threads for w in 1:windows
    Hw_aligned = compressed_Hunique.CW_typed[w].uniqueH
    # The last window absorbs the leftover SNPs beyond windows * width.
    Xw_idx_start = (w - 1) * width + 1
    Xw_idx_end = (w == windows ? length(X_pos) : w * width)
    Xw_aligned = X[Xw_idx_start:Xw_idx_end, :]

    # computational routine
    happairs, hapscore, t1, t2, t3 = haplopair(Xw_aligned, Hw_aligned)
    # happairs, hapscore, t1, t2, t3 = (size(Hw_aligned, 2) < 1000 ? haplopair(Xw_aligned, Hw_aligned) :  
    #     haplopair_thin(Xw_aligned, Hw_aligned, keep=1000))

    # convert happairs (which index off unique haplotypes) to indices of full haplotype pool, and find all matching happairs
    # NOTE(review): every thread mutates `redundant_haplotypes`; presumably each
    # call only touches window w's slots — confirm this is race-free.
    t4 = @elapsed compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)
end

In [105]:
# Manual phasing step, followed by allocation of the full-size genotype matrix.
# Relies on `X`, `X_pos`, `people`, `ref_snps`, `compressed_Hunique` and
# `redundant_haplotypes` from the preceding cell.
ph = [HaplotypeMosaicPair(ref_snps) for i in 1:people]
phase!(ph, X, compressed_Hunique, redundant_haplotypes, X_pos)
H_pos = compressed_Hunique.pos
# For each typed position in X_pos, its index within H_pos
# (`indexin` yields `nothing` for a position absent from H_pos).
XtoH_idx = indexin(X_pos, H_pos) 
# Start fully missing, then copy the typed genotypes into their reference rows.
X_full = Matrix{Union{Missing, UInt8}}(missing, ref_snps, people)
copyto!(@view(X_full[XtoH_idx, :]), X);

# convert phase's starting position from X's index to H's index
update_marker_position!(ph, XtoH_idx)

In [106]:
ph[1].strand1.start

331-element Array{Int64,1}:
      1
   2019
   4127
   6069
   8036
   9846
  11491
  14129
  17110
  18971
  20741
  22658
  24090
      ⋮
 820783
 822695
 824838
 827284
 830200
 833013
 835300
 837797
 840844
 843956
 846872
 849108

In [78]:
# Probe: impute one (snp, person) entry by hand and show every intermediate index.
person = 1
snp = 1321
# Index of the last segment whose start is <= snp, on each strand.
hap1_segment = searchsortedlast(ph[person].strand1.start, snp)
hap2_segment = searchsortedlast(ph[person].strand2.start, snp)
hap1_segment, hap2_segment

# Haplotype label and window recorded for that segment on each strand.
@show h1 = ph[person].strand1.haplotypelabel[hap1_segment]
@show h2 = ph[person].strand2.haplotypelabel[hap2_segment]
@show w1 = ph[person].strand1.window[hap1_segment]
@show w2 = ph[person].strand2.window[hap2_segment]
# Row of the SNP relative to the first index of the window's range.
@show i1 = snp - first(compressed_Hunique.CWrange[w1]) + 1
@show i2 = snp - first(compressed_Hunique.CWrange[w2]) + 1


# The imputed genotype is the sum of the two haplotype entries.
H1 = compressed_Hunique.CW[w1].uniqueH
H2 = compressed_Hunique.CW[w2].uniqueH
X_full[snp, person] = H1[i1, h1] + H2[i2, h2]

h1 = (ph[person]).strand1.haplotypelabel[hap1_segment] = 117
h2 = (ph[person]).strand2.haplotypelabel[hap2_segment] = 854
w1 = (ph[person]).strand1.window[hap1_segment] = 1
w2 = (ph[person]).strand2.window[hap2_segment] = 1
i1 = (snp - first(compressed_Hunique.CWrange[w1])) + 1 = 1321
i2 = (snp - first(compressed_Hunique.CWrange[w2])) + 1 = 1321


0

In [127]:
# Probe: compute the reference-panel index range matching the first target window.
ref_snps = size(H, 1)
tgt_snps = size(X, 1)
windows = floor(Int, tgt_snps / width)
Hw_idx_start = 1
w = 1
# Target-side index range of window w; the last window runs to the end of X_pos.
Xw_idx_start = (w - 1) * width + 1
Xw_idx_end = (w == windows ? length(X_pos) : w * width)
Xw_pos_end = X_pos[Xw_idx_end]
# Position of the first typed SNP of the next window.
Xw_pos_next = X_pos[w * width + 1]
# The reference window ends just before that position's index in H_pos.
# `something` throws if the position is not found in H_pos.
Hw_idx_end = (w == windows ? length(H_pos) : 
    something(findnext(x -> x == Xw_pos_next, H_pos, Hw_idx_start)) - 1)




2018

In [125]:
X_pos[512]

194055

In [119]:
Xw_pos_next

16444

In [121]:
H_pos[78]

16444

In [118]:
something(findnext(x -> x == Xw_pos_next, H_pos, Hw_idx_start))

78

In [114]:
something(findnext(x -> x == Xw_idx_next_start, H_pos, Hw_idx_start))

ArgumentError: ArgumentError: No value arguments present

In [115]:
Xw_idx_next_start

513

In [96]:
ph[1].strand1.start

331-element Array{Int64,1}:
      1
   2019
   4127
   6069
   8036
   9846
  11491
  14129
  17110
  18971
  20741
  22658
  24090
      ⋮
 820783
 822695
 824838
 827284
 830200
 833013
 835300
 837797
 840844
 843956
 846872
 849108

In [94]:
# Debugging pass: fill still-missing entries of X_full from the phased haplotype
# mosaic, and print diagnostics for the first entry where that fails.
# NOTE(review): p is the row count of X, whereas X_full was allocated with
# ref_snps rows in an earlier cell, so this loop visits only rows 1:p of X_full —
# confirm that is intended.
p, n = size(X)

@inbounds for person in 1:n, snp in 1:p
    if ismissing(X_full[snp, person])
        try
            #find which segment the snp is located
            hap1_segment = searchsortedlast(ph[person].strand1.start, snp)
            hap2_segment = searchsortedlast(ph[person].strand2.start, snp)

            #find haplotype pair in corresponding window for this segment
            h1 = ph[person].strand1.haplotypelabel[hap1_segment]
            h2 = ph[person].strand2.haplotypelabel[hap2_segment]
            w1 = ph[person].strand1.window[hap1_segment]
            w2 = ph[person].strand2.window[hap2_segment]
            i1 = snp - first(compressed_Hunique.CWrange[w1]) + 1
            i2 = snp - first(compressed_Hunique.CWrange[w2]) + 1

            # imputation step
            H1 = compressed_Hunique.CW[w1].uniqueH
            H2 = compressed_Hunique.CW[w2].uniqueH
            X_full[snp, person] = H1[i1, h1] + H2[i2, h2]
        catch
            # Failure path: repeat the same lookups so their values can be printed,
            # then abort the whole loop on this first failing entry.
            #find which segment the snp is located
            hap1_segment = searchsortedlast(ph[person].strand1.start, snp)
            hap2_segment = searchsortedlast(ph[person].strand2.start, snp)

            #find haplotype pair in corresponding window for this segment
            h1 = ph[person].strand1.haplotypelabel[hap1_segment]
            h2 = ph[person].strand2.haplotypelabel[hap2_segment]
            w1 = ph[person].strand1.window[hap1_segment]
            w2 = ph[person].strand2.window[hap2_segment]
            i1 = snp - first(compressed_Hunique.CWrange[w1]) + 1
            i2 = snp - first(compressed_Hunique.CWrange[w2]) + 1

            # imputation step
            H1 = compressed_Hunique.CW[w1].uniqueH
            H2 = compressed_Hunique.CW[w2].uniqueH
            
            println("snp = $snp, person = $person")
            println("h1 = $h1, h2 = $h2, w1 = $w1, w2 = $w2, i1 = $i1, i2 = $i2")
            println("size(H1) = $(size(H1))")
            println("size(H2) = $(size(H2))")
            # Placeholder message; the original exception is discarded here.
            error("hi")
        end
    end
end

snp = 2012, person = 1
h1 = 117, h2 = 854, w1 = 1, w2 = 1, i1 = 2012, i2 = 2012
size(H1) = (2011, 4801)
size(H2) = (2011, 4801)


ErrorException: hi

In [95]:
compressed_Hunique.CWrange[1]

1:2011

In [30]:
H = convert_ht(Bool, "ref.chr18.excludeTarget.vcf.gz", trans=true, msg="importing H");

[32mimporting H100%|████████████████████████████████████████| Time: 0:04:08[39m


In [46]:
all(H[1:2011, 2767] .== compressed_Hunique.CW[1].uniqueH[:, 2764])

true

In [45]:
sol_path[id][1][1], h1

(2767, 2764)

In [43]:
i = 100
id = 1
chunk_offset = 0
MendelImpute.connect_happairs!(sol_path[id], nxt_pair[id], tree_err[id], redundant_haplotypes[i], λ = 1.0)

h1 = MendelImpute.complete_idx_to_unique_all_idx(sol_path[id][1][1], 1, compressed_Hunique)
h2 = MendelImpute.complete_idx_to_unique_all_idx(sol_path[id][1][2], 1, compressed_Hunique)


3032

In [24]:
ph[1].strand1.start

331-element Array{Int64,1}:
      1
    513
   1025
   1537
   2049
   2561
   3073
   3585
   4097
   4609
   5121
   5633
   6145
      ⋮
 163329
 163841
 164353
 164865
 165377
 165889
 166401
 166913
 167425
 167937
 168449
 168961

In [26]:
unique(compressed_Hunique.CW_typed[1].hapmap)

4778-element Array{Int64,1}:
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
    ⋮
 4797
 4798
 4799
 4800
 4801
 4802
 4803
 4804
 4805
 4806
 4807
 4808

In [14]:
unique(compressed_Hunique.CW[1].hapmap)

4801-element Array{Int64,1}:
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
    ⋮
 4797
 4798
 4799
 4800
 4801
 4802
 4803
 4804
 4805
 4806
 4807
 4808

In [6]:
# 8 threads, searching single bkpts, deleting suboptimal pairs in dp, skip windows with <50 typed snps
# NOTE(review): none of the options named above is passed to `phase` in this
# cell; presumably they reflect the MendelImpute source at run time — confirm.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:05:22[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:07[39m


Total window = 426, each with ~ 4206 unique haplotypes on avg
Data import time                    = 10.1402 seconds
Computing haplotype pair time       = 322.393 seconds
    BLAS3 mul! to get M and N           = 9.69796 seconds (on thread 1)
    haplopair search                    = 55.0657 seconds (on thread 1)
    supplying constant terms            = 0.00392874 seconds (on thread 1)
    finding redundant happairs          = 0.701392 seconds (on thread 1)
Phasing by dynamic programming time = 0.251607 seconds
Imputing time                       = 10.174 seconds
343.092854 seconds (52.46 M allocations: 38.321 GiB, 6.70% gc time)
error overall = 0.005999012434875828 



In [4]:
# 8 threads, searching single bkpts, deleting suboptimal pairs in dp, skip windows with <50 typed snps
# NOTE(review): none of the options named above is passed to `phase` in this
# cell; presumably they reflect the MendelImpute source at run time — confirm.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:03:03[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Data import time                    = 10.8066 seconds
Computing haplotype pair time       = 183.787 seconds
Phasing by dynamic programming time = 1.65179 seconds
Imputing time                       = 9.13026 seconds
229.475853 seconds (141.20 M allocations: 42.904 GiB, 3.60% gc time)
error overall = 0.005566501134175149 



In [3]:
# 8 threads, save only best happair, searching single bkpts
# deleting suboptimal pairs in dp, skip windows with <50 typed snps
# NOTE(review): none of the options named above is passed to `phase` in this
# cell; presumably they reflect the MendelImpute source at run time — confirm.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# Time a full phase + impute run; the result is read back from `outfile` below.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:42[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Data import time                    = 10.23 seconds
Computing haplotype pair time       = 102.337 seconds
Phasing by dynamic programming time = 1.22405 seconds
Imputing time                       = 8.44946 seconds
144.593542 seconds (140.29 M allocations: 42.697 GiB, 5.00% gc time)
error overall = 0.005999012434875828 



In [2]:
# impute only typed snps
Random.seed!(2020)
chr = 18
maf = 0.45
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# The chromosome is hard-coded in this file name rather than interpolated from `chr`.
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# impute=false here, unlike the other run cells.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=false, width=width)

# The output and the complete target are matched by SNP position before
# comparing, so only rows of the complete data at output positions are scored.
X_complete, _, _, X_complete_pos, _, _, _ = convert_gt(Float64, "target.chr18.full.vcf.gz", trans=true, save_snp_info=true)
X_typedonly, _, _, X_typedonly_pos, _, _, _ = convert_gt(Float64, outfile, trans=true, save_snp_info=true)
# Row of each output position within the complete data
# (`indexin` yields `nothing` for a position that is not found).
typed_to_complete_idx = indexin(X_typedonly_pos, X_complete_pos)
X_complete_typedonly = X_complete[typed_to_complete_idx, :]
p, n = size(X_typedonly)
println("error overall = $(sum(X_typedonly .!= X_complete_typedonly) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:57[39m


Data import time                    = 7.74952 seconds
Computing haplotype pair time       = 57.2797 seconds
Phasing by dynamic programming time = 1.90273 seconds
Imputing time                       = 0.57671 seconds
 89.702625 seconds (105.16 M allocations: 14.344 GiB, 3.63% gc time)
error overall = 0.0007017132128608308 



In [5]:
# searching single bkpts, deleting suboptimal pairs in dp, skip windows with <50 typed snps
Random.seed!(2020)
chr = 18
maf = 0.45
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# Unlike the other run cells, this reference file name does not include the width.
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# min_typed_snps=50 is passed explicitly in this cell.
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width, 
    min_typed_snps=50)

# Error rate: fraction of genotype entries where the imputed file differs from
# the complete target file.
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:55[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 6.68637 seconds
Computing haplotype pair time       = 55.7005 seconds
Phasing by dynamic programming time = 1.49851 seconds
Imputing time                       = 13.3049 seconds
 77.173139 seconds (20.27 M allocations: 10.581 GiB, 3.04% gc time)
error overall = 0.0944998604272568 



In [17]:
# try different widths
# deleting suboptimal pairs in dp, not searching bkpts, skip windows with <5% typed snps
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)
"""
    run()

Phase and impute the chromosome 18, maf 0.1 masked target file once per window
width (1000, 2000, 4000) with `fast_method=false`, timing each run and printing
the fraction of genotype entries that differ from the complete target file.

Files are resolved relative to the current working directory.

Note: this definition shares its name with `Base.run`; other cells in this
notebook call `Base.run` explicitly.
"""
function run()
    chr = 18
    maf = 0.1

    # Ground truth is loaded once and shared by every width.
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)

    # The target file does not depend on the window width.
    tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"

    for width in (1000, 2000, 4000)
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")

        reffile = "ref.$width.chr$chr.excludeTarget.jlso"
        outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(tgtfile, reffile, outfile=outfile, impute=true, width=width,
            fast_method=false)

        # Read the imputed genotypes back and compare them entry by entry.
        X_mendel = convert_gt(UInt8, outfile)
        mismatches = sum(X_mendel .!= X_complete)
        println("error overall = $(mismatches / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 1000, maf = 0.1


[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:13[39m


Data import time                    = 9.53489 seconds
Computing haplotype pair time       = 1919.31 seconds
Phasing by dynamic programming time = 25.534 seconds
Imputing time                       = 18.0856 seconds
1972.573863 seconds (60.84 M allocations: 222.162 GiB, 3.90% gc time)
error overall = 0.005454584905970195 

MendelImpute with dynamic programming, width = 2000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:08:51[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


Data import time                    = 12.9095 seconds
Computing haplotype pair time       = 531.806 seconds
Phasing by dynamic programming time = 2.22416 seconds
Imputing time                       = 14.0015 seconds
560.924354 seconds (53.12 M allocations: 45.984 GiB, 11.69% gc time)
error overall = 0.005866805379297726 

MendelImpute with dynamic programming, width = 4000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:50[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 12.3763 seconds
Computing haplotype pair time       = 170.933 seconds
Phasing by dynamic programming time = 0.038868 seconds
Imputing time                       = 13.1781 seconds
196.524946 seconds (52.65 M allocations: 26.672 GiB, 3.82% gc time)
error overall = 0.008089859043258167 



# Beagle 5.0

In [3]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
"""
    beagle()

Run Beagle 5.0 (`beagle.28Sep18.793.jar`) on the masked chromosome 18 target file
for each maf threshold in 0.0005, 0.001, 0.01, 0.1 and 0.45, against the
reference panel that excludes the target samples.

Each run uses 4 threads and a 15g Java heap limit, and is given the output
prefix `beagle.imputed.maf<maf>`. Files are resolved relative to the current
working directory.
"""
function beagle()
    chr = 18
    # The reference panel is the same for every threshold.
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in (0.0005, 0.001, 0.01, 0.1, 0.45)
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # Qualified call: `run` is also defined as a function in this notebook.
        Base.run(cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 10:09 AM PDT on 08 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.0005.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.0005
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         414,911

Burnin  iteration 1:           48 seconds
Burnin  iteration 2:           1 minute 28 seconds
Burnin  iteration 3:           1 minute 42 seconds
Burnin  iteration 4:           1 minute 46 seconds
Burnin  iteration 5:           1 minute 53 seconds
Burnin  iteration 6:           2 minutes 5 seconds

Phasing iteration 1:           1 minute 59 seconds
Phasing iteration 2:           2 minutes 1 second
Phasing iterati

DimensionMismatch: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 678557 and 863592")

In [5]:
# Score the Beagle output for each listed maf threshold against the complete target.
for maf in [0.0005]
    # beagle error rate    
    chr = 18
    X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
    n, p = size(X_complete)
    # `.!=` broadcasts elementwise, so both matrices must have the same shape.
    println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
end

error overall = 5.453964372064586e-6 



In [1]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run Beagle 5.0 on the masked chr18 target file for each remaining maf value
# (0.001, 0.01, 0.1, 0.45). All runs share the reference panel that excludes the
# target samples and write their output under the prefix `beagle.imputed.maf<maf>`.
function beagle()
    chr = 18
    # Identical for every maf value, so it is built once outside the iteration.
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    foreach([0.001, 0.01, 0.1, 0.45]) do maf
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # Blocks until Beagle exits; a non-zero exit status raises an error.
        Base.run(cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 02:52 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.001.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.001
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         388,114

Burnin  iteration 1:           38 seconds
Burnin  iteration 2:           1 minute 26 seconds
Burnin  iteration 3:           1 minute 28 seconds
Burnin  iteration 4:           1 minute 35 seconds
Burnin  iteration 5:           1 minute 50 seconds
Burnin  iteration 6:           1 minute 47 seconds

Phasing iteration 1:           1 minute 43 seconds
Phasing iteration 2:           1 minute 47 seconds
Phasing iteratio

UndefVarError: UndefVarError: convert_gt not defined

In [3]:
# beagle error rate
# Fraction of genotype entries in the Beagle output for maf 0.001 that differ from
# the full (all-SNP) target panel. All names below are notebook globals.
chr = 18
# Genotypes of the full target panel, used as the answer key.
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
# Genotypes written by the Beagle run with out=beagle.imputed.maf0.001.
X_beagle = convert_gt(UInt8, "beagle.imputed.maf0.001.vcf.gz")
n, p = size(X_complete)
# The elementwise comparison requires both matrices to have the same size.
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [7]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# For each maf value in the list: impute the masked chr18 target file with Beagle 5.0
# against the reference panel that excludes the target samples, then read the imputed
# genotypes back and print the fraction of entries that differ from the full target
# panel.
function beagle()
    chr = 18
    # Identical for every maf value, so it is built once outside the iteration.
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    foreach([0.01]) do maf
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # Blocks until Beagle exits; a non-zero exit status raises an error.
        Base.run(cmd)

        # beagle error rate: compare the imputed genotypes (the output prefix plus
        # ".vcf.gz") entry by entry with the full target panel.
        X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
        X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
        n, p = size(X_complete)
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:03 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.01.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.01
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         183,460

Burnin  iteration 1:           22 seconds
Burnin  iteration 2:           31 seconds
Burnin  iteration 3:           34 seconds
Burnin  iteration 4:           33 seconds
Burnin  iteration 5:           38 seconds
Burnin  iteration 6:           38 seconds

Phasing iteration 1:           37 seconds
Phasing iteration 2:           35 seconds
Phasing iteration 3:           42 seconds
Phasing iteration 4:           38 secon

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [8]:
# Beagle error rate for the maf 0.01 run: fraction of genotype entries in the imputed
# file that differ from the full (all-SNP) target panel. All names below are notebook
# globals.
chr = 18
maf = 0.01
# Genotypes of the full target panel, used as the answer key.
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
# Genotypes written by the Beagle run for this maf value.
X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
n, p = size(X_complete)
# The elementwise comparison requires both matrices to have the same size.
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")    

error overall = 0.0011677968299845297 



In [9]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# For each maf value in the list: impute the masked chr18 target file with Beagle 5.0
# against the reference panel that excludes the target samples, then read the imputed
# genotypes back and print the fraction of entries that differ from the full target
# panel.
function beagle()
    chr = 18
    # Identical for every maf value, so it is built once outside the iteration.
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    foreach([0.1, 0.45]) do maf
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # Blocks until Beagle exits; a non-zero exit status raises an error.
        Base.run(cmd)

        # beagle error rate: compare the imputed genotypes (the output prefix plus
        # ".vcf.gz") entry by entry with the full target panel.
        X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
        X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
        n, p = size(X_complete)
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:47 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.1.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.1
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:          82,786

Burnin  iteration 1:           18 seconds
Burnin  iteration 2:           19 seconds
Burnin  iteration 3:           19 seconds
Burnin  iteration 4:           19 seconds
Burnin  iteration 5:           22 seconds
Burnin  iteration 6:           19 seconds

Phasing iteration 1:           19 seconds
Phasing iteration 2:           18 seconds
Phasing iteration 3:           18 seconds
Phasing iteration 4:           17 seconds

# Eagle 2 + Minimac4

To use the reference panel in Eagle 2's prephasing option, one must first convert it to `.bcf` format via e.g. `htslib`, which is *extremely* difficult to install. Even after going through all the hard work to obtain the final `.bcf` reference file (see commands below), Eagle 2.4 still reports that the file is not acceptable (not bgzipped, or some processing error). Therefore, I had no choice but to prephase without the reference panel.

In [None]:
# Prephase the masked chr20 target with Eagle 2.4 using 4 threads and the hg19
# genetic map; no reference panel is passed to Eagle.
# Timing: 3367.79 sec on amd-2382 machine (can only run on linux systems)
eagle \
    --vcf=target.chr20.typedOnly.masked.vcf.gz \
    --outPrefix=eagle.phased.chr20 \
    --numThreads=4 \
    --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# Convert the reference panel (target samples excluded) to m3vcf format with Minimac3.
# Timing: Total Run completed in 1 hours, 46 mins, 24 seconds
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 \
    --refHaps ref.chr20.excludeTarget.vcf.gz \
    --processReference \
    --prefix ref.chr20.excludeTarget

In [None]:
# run minimac4 (2619 seconds)
# Imputes the Eagle-prephased target against the m3vcf reference panel, writing GT
# output under the prefix minimac.imputed.chr20 with 4 CPUs.
# --haps must name the file Eagle actually produced: the eagle command in this notebook
# uses --outPrefix=eagle.phased.chr20, so the phased VCF is eagle.phased.chr20.vcf.gz
# (the previous value, eagle.phased.vcf.gz, is not created by that command).
minimac4 --refHaps ref.chr20.excludeTarget.m3vcf.gz --haps eagle.phased.chr20.vcf.gz --prefix minimac.imputed.chr20 --format GT --cpus 4

In [None]:
# minimac4 error rate
# Fraction of genotype entries in the minimac4 output that differ from the full chr20
# target panel. All names below are notebook globals.
# Genotypes of the full target panel, loaded as Float32 and used as the answer key.
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
# Genotypes from the minimac4 run with prefix minimac.imputed.chr20, loaded as Float32.
X_minimac = convert_gt(Float32, "minimac.imputed.chr20.dose.vcf.gz")
n, p = size(X_complete)
# Exact elementwise inequality; both matrices must have the same size.
println("error overall = $(sum(X_minimac .!= X_complete) / n / p) \n")