# Test imputation on untyped SNPs chrom 18

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


### Memory requirement

**Prephasing step:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000$ (max haplotypes per win) $* 16 bytes$ of RAM

## Generate subset of markers for prephasing

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask(; chrs=[18], n=100, mafs=[0.0005, 0.001, 0.01, 0.1, 0.45],
                    missingprop=0.001)

Create the target/reference files used by the imputation experiments.

For every chromosome `chr` in `chrs` this writes, in the current directory:

- `chr\$chr.uniqueSNPs.vcf.gz`: the raw VCF with records flagged by
  `find_duplicate_marker` removed;
- `target.chr\$chr.full.vcf.gz`: `n` randomly chosen samples, all SNPs;
- `ref.chr\$chr.excludeTarget.vcf.gz`: the remaining samples, all SNPs;
- for each threshold `maf` in `mafs`,
  `target.chr\$chr.typedOnly.maf\$maf.vcf.gz` (target samples restricted to SNPs whose
  minor allele frequency exceeds `maf`) and its `.masked` counterpart, which is
  unphased and has each entry masked independently with probability `missingprop`.

# Keywords
- `chrs`: chromosomes to process.
- `n`: number of samples moved into the target panel.
- `mafs`: minor-allele-frequency thresholds defining the typed SNPs.
- `missingprop`: per-entry masking probability in the typed target files.

Randomness comes from the global RNG (`shuffle!`, `rand`); seed it with
`Random.seed!` before calling for reproducible output.
"""
function filter_and_mask(;
    chrs = [18],
    n::Integer = 100,
    mafs = [0.0005, 0.001, 0.01, 0.1, 0.45],
    missingprop::Real = 0.001
)
    n >= 0 || throw(ArgumentError("n must be non-negative, got $n"))
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must lie in [0, 1], got $missingprop"))

    for chr in chrs
        # filter chromosome data for unique snps
        data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
        full_record_index = .!find_duplicate_marker(data)
        @time VCFTools.filter(data, full_record_index, 1:nsamples(data), 
            des = "chr$chr.uniqueSNPs.vcf.gz")

        # summarize data
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats("chr$chr.uniqueSNPs.vcf.gz")
        n <= samples || throw(ArgumentError(
            "requested $n target samples but chr$chr has only $samples samples"))

        # generate target panel with all snps: pick n samples uniformly at random
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

        # also generate reference panel without target samples
        @time VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)
        
        for maf in mafs
            
            # generate target file with the n target samples and only the typed snps,
            # i.e. those whose minor allele frequency exceeds `maf`
            my_maf = findall(x -> x > maf, maf_by_record)  
            p = length(my_maf)
            record_idx = falses(total_snps)
            record_idx[my_maf] .= true
            @time VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", record_idx, sample_idx, 
                des = "target.chr$chr.typedOnly.maf$maf.vcf.gz", allow_multiallelic=false)

            # unphase and mask a `missingprop` fraction (0.1% by default) of the entries
            # in the target file; one scalar rand() per entry, drawn column by column
            masks = falses(p, n)
            for j in 1:n, i in 1:p
                rand() < missingprop && (masks[i, j] = true)
            end
            @time mask_gt("target.chr$chr.typedOnly.maf$maf.vcf.gz", masks, 
                des="target.chr$chr.typedOnly.maf$maf.masked.vcf.gz", unphase=true)
        end
    end 
end
Random.seed!(2020)
@time filter_and_mask()

752.076342 seconds (6.51 G allocations: 615.353 GiB, 7.01% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:08:17[39m


549.100648 seconds (6.86 G allocations: 650.723 GiB, 10.18% gc time)
1139.017251 seconds (16.68 G allocations: 1.249 TiB, 10.94% gc time)
599.893049 seconds (6.86 G allocations: 650.911 GiB, 10.61% gc time)
 36.158916 seconds (267.39 M allocations: 27.452 GiB, 6.96% gc time)
608.676819 seconds (6.83 G allocations: 648.780 GiB, 10.30% gc time)
 35.610763 seconds (249.89 M allocations: 25.655 GiB, 6.86% gc time)
560.687780 seconds (6.61 G allocations: 626.620 GiB, 10.27% gc time)
 16.352906 seconds (119.38 M allocations: 12.252 GiB, 6.40% gc time)
520.047088 seconds (6.51 G allocations: 615.447 GiB, 10.21% gc time)
  8.014085 seconds (53.29 M allocations: 5.482 GiB, 5.36% gc time)
516.890704 seconds (6.43 G allocations: 607.361 GiB, 10.60% gc time)
  0.831218 seconds (5.35 M allocations: 563.018 MiB, 5.66% gc time)
6410.931663 seconds (76.92 G allocations: 6.823 TiB, 10.06% gc time)


### Missing rate

Among typed markers, 0.1% of entries are masked at random. The proportion of untyped markers (reference-panel markers absent from the target file) at each MAF threshold is:

In [2]:
# Report, for each MAF threshold, how many markers the typed-only target file contains
# and what share of the reference panel's markers it lacks.
reffile = "ref.chr18.excludeTarget.vcf.gz"
@show ref_records = nrecords(reffile)
for maf in [0.0005, 0.001, 0.01, 0.1, 0.45]
    tgtfile = "target.chr18.typedOnly.maf$maf.masked.vcf.gz"
    tgt_records = nrecords(tgtfile)
    # share of reference records with no counterpart in the target file; both files
    # are filtered from the same chr18.uniqueSNPs.vcf.gz in filter_and_mask
    missing_rate = 1 - tgt_records / ref_records
    println("there are $tgt_records snps w/ maf > $maf. Missing rate = $missing_rate")
end

ref_records = nrecords(reffile) = 852602
there are 852602 snps w/ maf > 0.0005. Missing rate = 0.0
there are 796810 snps w/ maf > 0.001. Missing rate = 0.06543733183830203
there are 380645 snps w/ maf > 0.01. Missing rate = 0.5535490181819888
there are 169914 snps w/ maf > 0.1. Missing rate = 0.8007112345502356
there are 17044 snps w/ maf > 0.45. Missing rate = 0.9800094299567677


# Try compressing with JLD2

In [4]:
width = 2000    # 2000 SNPs per window (at this width almost all haplotypes in a window are unique)
reffile = "ref.chr18.excludeTarget.vcf.gz"
outfile = "ref.chr18.excludeTarget.jld2"
# time the compression of the VCF reference panel into `outfile`
# (presumably the `.jld2` extension selects the JLD2 format — confirm in MendelImpute)
@time compress_haplotypes(reffile, outfile, width);

[32mimporting vcf data...100%|██████████████████████████████| Time: 0:04:24[39m


310.253834 seconds (4.17 G allocations: 309.774 GiB, 17.72% gc time)


In [5]:
# time reading `compressed_Hunique` back from the JLD2 file written above
@time @load "ref.chr18.excludeTarget.jld2" compressed_Hunique;

  4.488734 seconds (19.12 M allocations: 1.624 GiB, 16.99% gc time)


In [6]:
;ls -al ref.chr18.excludeTarget.jld2

-rw-r--r--  1 biona001  staff  754339754 May 28 10:06 ref.chr18.excludeTarget.jld2


In [7]:
;ls -al ref.chr18.excludeTarget.vcf.gz

-rw-r--r--@ 1 biona001  staff  247072174 May 19 23:43 ref.chr18.excludeTarget.vcf.gz


# Try jlso compression

In [2]:
# Import the reference panel as a Bool haplotype matrix `H` plus sample IDs and per-SNP
# chromosome, position, ID, ref and alt annotations; these globals feed the JLSO
# compression cell below.
# NOTE(review): with trans=true the rows of `H` appear to index SNPs (the compression
# cell takes the SNP count from size(H, 1)) — confirm against VCFTools.convert_ht.
vcffile = "ref.chr18.excludeTarget.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, vcffile, trans=true, save_snp_info=true, msg="importing vcf data...")

[32mimporting vcf data...100%|██████████████████████████████| Time: 0:04:34[39m


(Bool[0 0 … 0 0; 0 0 … 0 0; … ; 1 1 … 1 1; 1 1 … 1 1], ["HG00096", "HG00097", "HG00099", "HG00100", "HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"  …  "NA21128", "NA21129", "NA21130", "NA21133", "NA21135", "NA21137", "NA21141", "NA21142", "NA21143", "NA21144"], ["18", "18", "18", "18", "18", "18", "18", "18", "18", "18"  …  "18", "18", "18", "18", "18", "18", "18", "18", "18", "18"], [10644, 10661, 10701, 10719, 10743, 10764, 10770, 10780, 10796, 10839  …  78016732, 78016797, 78016888, 78016911, 78017030, 78017064, 78017073, 78017128, 78017155, 78017156], Array{String,1}[["rs115926245"], ["rs566492364"], ["rs139482401"], ["rs574502329"], ["rs532878481"], ["rs550985930"], ["rs530200462"], ["rs566603231"], ["rs115877650"], ["rs574589272"]  …  ["rs527898535"], ["rs140902457"], ["rs140216338"], ["rs202183949"], ["rs145359037"], ["rs137912669"], ["rs149056417"], ["rs188332254"], ["rs587690570"], ["rs587765114"]], ["C", "G", "C", "C", "G", "C", "G", "G", "C", "C"  …  "G", "

In [11]:
# For each window width, split the reference haplotype matrix `H` into consecutive
# windows, keep one copy of every distinct haplotype per window, and save the result as
# a gzip-compressed JLSO file named after the width.
for width in [1000, 2000, 4000]
    outfile = "ref.$width.chr18.excludeTarget.jlso"
    dims = 2          # slices compared by groupslices run along dimension 2 of `H`
    flankwidth = 0    # no extra SNPs added on either side of a window
    snps = size(H, dims == 2 ? 1 : 2)
    windows = div(snps, width)
    compressed_Hunique = MendelImpute.CompressedHaplotypes(
        windows, width, snps, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt
    )

    for w in 1:windows
        # The first window starts at SNP 1; every later window starts `flankwidth` SNPs
        # before its nominal start, and the last one runs through the final SNP so that
        # leftover SNPs are not dropped.
        if w == 1
            cur_range = 1:(width + flankwidth)
        else
            window_start = (w - 1) * width - flankwidth + 1
            window_stop = w == windows ? snps : w * width + flankwidth
            cur_range = window_start:window_stop
        end
        compressed_Hunique.CWrange[w] = cur_range

        # label the haplotype slices of this window, then keep one index per label
        if dims == 2
            H_cur_window = view(H, cur_range, :)
        else
            H_cur_window = view(H, :, cur_range)
        end
        hapmap = groupslices(H_cur_window, dims=dims)
        unique_idx = unique(hapmap)
        if dims == 2
            uniqueH = H_cur_window[:, unique_idx]
        else
            uniqueH = H_cur_window[unique_idx, :]
        end
        compressed_Hunique[w] = MendelImpute.CompressedWindow(unique_idx, hapmap, uniqueH)
    end

    if endswith(outfile, ".jlso")
        JLSO.save(outfile, :compressed_Hunique => compressed_Hunique,
            format=:julia_serialize, compression=:gzip)
    end
end

In [None]:
width = 1000    # use 1000 SNPs per window
reffile = "ref.chr18.excludeTarget.vcf.gz"
outfile = "ref.$width.chr18.excludeTarget.jlso"
# time the compression of the VCF reference panel into the width-specific JLSO file
@time compress_haplotypes(reffile, outfile, width);

In [12]:
;ls -al ref.1000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  49410994 Jun  1 22:24 ref.1000.chr18.excludeTarget.jlso


In [13]:
;ls -al ref.2000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  56525290 Jun  1 22:25 ref.2000.chr18.excludeTarget.jlso


In [14]:
;ls -al ref.4000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  65786356 Jun  1 22:26 ref.4000.chr18.excludeTarget.jlso


In [15]:
;ls -al ref.chr18.excludeTarget.vcf.gz

-rw-r--r--@ 1 biona001  staff  247072174 May 19 23:43 ref.chr18.excludeTarget.vcf.gz


In [2]:
# number of Julia threads available to this session
Threads.nthreads()

8

In [15]:
# searching single bkpts, deleting suboptimal pairs in dp, skip windows with <100 typed snps
# NOTE(review): `chr`, `maf` and `width` are not defined in this cell; they must already
# exist as globals from earlier cells — confirm their values before re-running.
# NOTE(review): the reference file name below has no width component, unlike the
# "ref.$width.chr18.excludeTarget.jlso" files written by the compression loop — verify
# that this file exists.
Random.seed!(2020)
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width, 
    fast_method=false)

# error rate: fraction of genotype entries in the imputed file that differ from the
# fully typed target
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:05:20[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 12.7563 seconds
Computing haplotype pair time       = 320.28 seconds
Phasing by dynamic programming time = 6.85594 seconds
Imputing time                       = 13.147 seconds
353.038002 seconds (53.63 M allocations: 46.721 GiB, 23.32% gc time)
error overall = 0.005482159319354166 



In [17]:
# deleting suboptimal pairs in dp, not searching bkpts, skip windows with <5% typed snps
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.1 against the JLSO-compressed reference panel
# for window widths 1000, 2000 and 4000, printing for each width the timing of `phase`
# and the fraction of genotype entries that differ from the fully typed target.
# NOTE(review): this `run` lives in the notebook's namespace, which is presumably why
# the Beagle cells call `Base.run(...)` explicitly.
function run()
    chr = 18
    maf = 0.1
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"    # same for every width
    for width in [1000, 2000, 4000]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        panel_path = "ref.$width.chr$chr.excludeTarget.jlso"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 1000, maf = 0.1


[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:13[39m


Data import time                    = 9.53489 seconds
Computing haplotype pair time       = 1919.31 seconds
Phasing by dynamic programming time = 25.534 seconds
Imputing time                       = 18.0856 seconds
1972.573863 seconds (60.84 M allocations: 222.162 GiB, 3.90% gc time)
error overall = 0.005454584905970195 

MendelImpute with dynamic programming, width = 2000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:08:51[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


Data import time                    = 12.9095 seconds
Computing haplotype pair time       = 531.806 seconds
Phasing by dynamic programming time = 2.22416 seconds
Imputing time                       = 14.0015 seconds
560.924354 seconds (53.12 M allocations: 45.984 GiB, 11.69% gc time)
error overall = 0.005866805379297726 

MendelImpute with dynamic programming, width = 4000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:50[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 12.3763 seconds
Computing haplotype pair time       = 170.933 seconds
Phasing by dynamic programming time = 0.038868 seconds
Imputing time                       = 13.1781 seconds
196.524946 seconds (52.65 M allocations: 26.672 GiB, 3.82% gc time)
error overall = 0.008089859043258167 



In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.45 against the uncompressed VCF reference
# panel (window width 500, fast_method=false), then print the timing of `phase` and the
# fraction of genotype entries that differ from the fully typed target.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.45]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.45


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:24[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:28[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:11[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


346.406477 seconds (4.15 G allocations: 311.657 GiB, 15.04% gc time)
error overall = 0.07907043380146891 



In [4]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.1 against the uncompressed VCF reference
# panel (window width 500, fast_method=false), then print the timing of `phase` and the
# fraction of genotype entries that differ from the fully typed target.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.1]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:36[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:04:13[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:29[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


668.244995 seconds (4.16 G allocations: 335.367 GiB, 8.26% gc time)
error overall = 0.00520817450580693 



In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.01 against the uncompressed VCF reference
# panel (window width 500, fast_method=false), then print the timing of `phase` and the
# fraction of genotype entries that differ from the fully typed target.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.01]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.01


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:20[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:06:07[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:02:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


816.071981 seconds (4.27 G allocations: 347.758 GiB, 7.73% gc time)
error overall = 0.0015719057661136146 



In [2]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.001 against the uncompressed VCF reference
# panel (window width 500, fast_method=false), then print the timing of `phase` and the
# fraction of genotype entries that differ from the fully typed target.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.001]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.001


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:12[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:10[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:07:03[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:54[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


862.650772 seconds (4.54 G allocations: 362.817 GiB, 12.38% gc time)
error overall = 0.00010537155671696759 



In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.0005 against the uncompressed VCF reference
# panel (window width 500, fast_method=false), then print the timing of `phase` and the
# fraction of genotype entries that differ from the fully typed target.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.0005]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=false)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.0005


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:14[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:24[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:08:17[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:02:02[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


965.395900 seconds (4.59 G allocations: 367.155 GiB, 13.17% gc time)
error overall = 6.145892221693123e-6 



# MendelImpute with intersecting haplotypes

In [4]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.45 against the uncompressed VCF reference
# panel with fast_method=true (window width 500), then print the timing of `phase` and
# the fraction of genotype entries that differ from the fully typed target.
# NOTE(review): the printed label and the output file name still say "dynamic
# programming" / "dp" although fast_method=true, so this run writes to the same file as
# the fast_method=false run with the same width and threshold — confirm this is intended.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.45]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=true)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.45


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:59[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:30[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


386.665610 seconds (4.15 G allocations: 311.672 GiB, 15.08% gc time)
error overall = 0.07958381519161345 



In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the chr18 target typed at MAF > 0.1 against the uncompressed VCF reference
# panel with fast_method=true (window width 500), then print the timing of `phase` and
# the fraction of genotype entries that differ from the fully typed target.
# NOTE(review): the printed label and the output file name still say "dynamic
# programming" / "dp" although fast_method=true, so this run writes to the same file as
# the fast_method=false run with the same width and threshold — confirm this is intended.
function run()
    chr = 18
    width = 500
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"    # does not depend on the threshold
    for maf in [0.1]
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        typed_path = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        imputed_path = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(typed_path, panel_path; outfile=imputed_path, impute=true,
            width=width, fast_method=true)
        X_mendel = convert_gt(UInt8, imputed_path)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:05:05[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:03:50[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:46[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


637.821443 seconds (4.19 G allocations: 336.438 GiB, 9.47% gc time)
error overall = 0.005501347639344032 



# Beagle 5.0

In [3]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle 5.0 jar once per MAF threshold, imputing the masked typed-only chr18
# target against the reference panel that excludes the target samples. Each run writes
# its result under the prefix "beagle.imputed.maf<threshold>".
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"    # shared by every threshold
    for maf in [0.0005, 0.001, 0.01, 0.1, 0.45]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # `Base.run` is spelled out because other cells define their own `run()`
        Base.run(beagle_cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 10:09 AM PDT on 08 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.0005.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.0005
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         414,911

Burnin  iteration 1:           48 seconds
Burnin  iteration 2:           1 minute 28 seconds
Burnin  iteration 3:           1 minute 42 seconds
Burnin  iteration 4:           1 minute 46 seconds
Burnin  iteration 5:           1 minute 53 seconds
Burnin  iteration 6:           2 minutes 5 seconds

Phasing iteration 1:           1 minute 59 seconds
Phasing iteration 2:           2 minutes 1 second
Phasing iterati

DimensionMismatch: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 678557 and 863592")

In [5]:
# Compare Beagle's imputed genotypes with the fully typed target and print the fraction
# of genotype entries that differ.
for maf in [0.0005]
    # beagle error rate
    chr = 18
    X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
    n, p = size(X_complete)
    println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
end

error overall = 5.453964372064586e-6 



In [1]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle 5.0 jar for the MAF thresholds 0.001, 0.01, 0.1 and 0.45, imputing the
# masked typed-only chr18 target against the reference panel that excludes the target
# samples. Each run writes its result under the prefix "beagle.imputed.maf<threshold>".
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"    # shared by every threshold
    for maf in [0.001, 0.01, 0.1, 0.45]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # `Base.run` is spelled out because other cells define their own `run()`
        Base.run(beagle_cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 02:52 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.001.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.001
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         388,114

Burnin  iteration 1:           38 seconds
Burnin  iteration 2:           1 minute 26 seconds
Burnin  iteration 3:           1 minute 28 seconds
Burnin  iteration 4:           1 minute 35 seconds
Burnin  iteration 5:           1 minute 50 seconds
Burnin  iteration 6:           1 minute 47 seconds

Phasing iteration 1:           1 minute 43 seconds
Phasing iteration 2:           1 minute 47 seconds
Phasing iteratio

UndefVarError: UndefVarError: convert_gt not defined

In [3]:
# beagle error rate for the MAF > 0.001 run: fraction of genotype entries in Beagle's
# output that differ from the fully typed target
chr = 18
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_beagle = convert_gt(UInt8, "beagle.imputed.maf0.001.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [7]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle 5.0 jar on the chr18 target typed at MAF > 0.01, then read its output
# back and print the fraction of genotype entries that differ from the fully typed
# target.
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in [0.01]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        # `Base.run` is spelled out because other cells define their own `run()`
        Base.run(beagle_cmd)

        # beagle error rate (the truth matrix is read only after Beagle has finished)
        X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
        X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
        n, p = size(X_complete)
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:03 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.01.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.01
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         183,460

Burnin  iteration 1:           22 seconds
Burnin  iteration 2:           31 seconds
Burnin  iteration 3:           34 seconds
Burnin  iteration 4:           33 seconds
Burnin  iteration 5:           38 seconds
Burnin  iteration 6:           38 seconds

Phasing iteration 1:           37 seconds
Phasing iteration 2:           35 seconds
Phasing iteration 3:           42 seconds
Phasing iteration 4:           38 secon

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [8]:
# Beagle error rate for the MAF > 0.01 run: fraction of genotype entries in Beagle's
# output that differ from the fully typed target
chr = 18
maf = 0.01
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")    

error overall = 0.0011677968299845297 



In [9]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle 5.0 jar on the chr18 targets typed at MAF > 0.1 and MAF > 0.45 and,
# after each run, print the fraction of genotype entries in Beagle's output that differ
# from the fully typed target.
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"

    # The ground truth is identical for every threshold, so read it once instead of
    # re-parsing the VCF on each iteration.
    X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    n, p = size(X_complete)

    for maf in [0.1, 0.45]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        # `Base.run` is spelled out because other cells define their own `run()`
        Base.run(`java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`)

        # beagle error rate; the imputed genotypes are read from "<out prefix>.vcf.gz"
        X_beagle = convert_gt(UInt8, "$outfile.vcf.gz")
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:47 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.1.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.1
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:          82,786

Burnin  iteration 1:           18 seconds
Burnin  iteration 2:           19 seconds
Burnin  iteration 3:           19 seconds
Burnin  iteration 4:           19 seconds
Burnin  iteration 5:           22 seconds
Burnin  iteration 6:           19 seconds

Phasing iteration 1:           19 seconds
Phasing iteration 2:           18 seconds
Phasing iteration 3:           18 seconds
Phasing iteration 4:           17 seconds

# Eagle 2 + Minimac4

In order to use the reference panel in Eagle 2's prephase option, one must first convert it to `.bcf` format via e.g. `htslib` which is *extremely* difficult to install. Even after we went through all the hard work to obtain the final `.bcf` reference file (see commands below), eagle 2.4 STILL SAYS the file is not acceptable (not bgzipped or some processing error). Therefore, I have no choice but to prephase without the reference panel. 

In [None]:
# run eagle 2.4: 3367.79 sec on amd-2382 machine (can only run on linux systems)
eagle --vcf=target.chr20.typedOnly.masked.vcf.gz --outPrefix=eagle.phased.chr20 --numThreads=4 --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# convert ref file to m3vcf format (Total Run completed in 1 hours, 46 mins, 24 seconds)
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref.chr20.excludeTarget.vcf.gz --processReference --prefix ref.chr20.excludeTarget

In [None]:
# run minimac4 (2619 seconds)
minimac4 --refHaps ref.chr20.excludeTarget.m3vcf.gz --haps eagle.phased.vcf.gz --prefix minimac.imputed.chr20 --format GT --cpus 4

In [None]:
# minimac4 error rate    
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
X_minimac = convert_gt(Float32, "minimac.imputed.chr20.dose.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_minimac .!= X_complete) / n / p) \n")