# Comparison of MendelImpute against Minimac4 and Beagle5 on the 1000 Genomes Project

**Dynamic programming memory requirement:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps * 4$ bytes of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000 * 16$ bytes of RAM (assuming at most 1000 haplotypes per window)

**Fast method memory requirement:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps * 4$ bytes of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * snps$ bits of RAM


In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108


# Filter data into target and reference data

+ `ref_chr22.vcf.gz`: haplotype reference files
+ `tgt_chr22.vcf.gz`: complete genotype information
+ `tgt_masked_chr22.vcf.gz`: the same as `tgt_chr22.vcf.gz` except some entries are masked
+ `tgt_masked_unphased_chr22.vcf.gz`: the same as `tgt_chr22.vcf.gz` except some entries are masked and heterozygote genotypes (1|0 or 0|1) are all changed to 1/0. 

In [10]:
"""
    filter_and_mask(data::String, samples::Int; missingprop::Float64 = 0.01)

Creates reference haplotypes and (unphased) target genotype files from `data`. Duplicate
records (SNPs) are filtered out as well.

Four files are written to the current working directory. `<label>` is the second
dot-separated field of the file name of `data` (e.g. `chr22` for a file named
`ALL.chr22.phase3_v5.[...].vcf.gz`):
+ `tgt_<label>.vcf.gz`: the randomly chosen target samples, unmasked
+ `ref_<label>.vcf.gz`: all remaining samples
+ `tgt_masked_<label>.vcf.gz`: the target file after applying the random mask
+ `tgt_masked_unphased_<label>.vcf.gz`: the masked target file passed through `unphase`

# Inputs
- `data`: The full (phased) data
- `samples`: Number of samples (genotypes) desired in target file. Remaining haplotypes
  will become the reference panel

# Optional inputs
- `missingprop`: Probability (a proportion in `[0, 1]`, not a percentage) that any single
  target entry is masked. Defaults to `0.01`.

# Output
Whatever the final `unphase` call returns.

# Throws
- `ArgumentError` if `missingprop` is outside `[0, 1]` or if the file name of `data` has
  fewer than two dot-separated fields.
- `ErrorException` if `samples` exceeds the number of samples in `data`.
"""
function filter_and_mask(data::String, samples::Int; missingprop::Float64 = 0.01)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be between 0 and 1, got $missingprop"))

    # Take the label from the file name only: dots in the directory part of the path
    # (e.g. "../raw/" or ".julia") must not shift which field is picked.
    fields = split(basename(data), '.')
    length(fields) >= 2 ||
        throw(ArgumentError("cannot extract a label (e.g. chr22) from file name of $data"))

    n = nsamples(data)
    samples > n && error("requested samples exceed total number of genotypes in $data.")

    # output filenames (tgt_chr22.vcf.gz, ref_chr22.vcf.gz, tgt_masked_chr22.vcf.gz,
    # and tgt_masked_unphased_chr22.vcf.gz)
    file = fields[2] * ".vcf.gz"
    tgt = "./tgt_" * file
    ref = "./ref_" * file
    tgt_mask = "./tgt_masked_" * file
    tgt_mask_unphase = "./tgt_masked_unphased_" * file

    # compute target and reference index
    println("computing tgt/ref/record index")
    tgt_index = falses(n)
    tgt_index[sample(1:n, samples, replace=false)] .= true
    ref_index = .!tgt_index
    record_index = .!find_duplicate(data) # save only unique records (SNPs)

    # generate masking matrix: each entry is true (= convert to missing) with
    # probability `missingprop`
    p = sum(record_index)
    masks = falses(p, samples)
    for j in 1:samples, i in 1:p
        rand() < missingprop && (masks[i, j] = true)
    end

    # create outputs
    println("generating target file")
    VCFTools.filter(data, record_index, tgt_index, des = tgt)
    println("generating reference file")
    VCFTools.filter(data, record_index, ref_index, des = ref)
    println("generating masked file")
    mask_gt(tgt, masks, des=tgt_mask)

    # finally, unphase the target data
    return unphase(tgt_mask, outfile=tgt_mask_unphase)
end

filter_and_mask

In [11]:
# Seed the global RNG before calling `filter_and_mask`, which draws random numbers via
# `sample` and `rand()` (presumably so the target/reference split and the mask are
# reproducible).
Random.seed!(2020)
# `filter_and_mask` writes its outputs as "./..." paths, so they land in this directory;
# the relative input path below is resolved against it as well.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

chr22 = "../raw/ALL.chr22.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
@time filter_and_mask(chr22, 500) # 500 imputation samples, remaining 2004 used as reference

computing tgt/ref/record index
generating target file
generating reference file
generating masked file


[32mCreating ./tgt_masked_unphased_chr22.vcf.gz...100%|█████| Time: 0:01:56[39m


2101.643985 seconds (24.78 G allocations: 2.084 TiB, 9.99% gc time)


# Run MendelImpute (fast)

In [2]:
# fast method
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    run_fast()

Impute `tgt_masked_unphased_chr22.vcf.gz` against `ref_chr22.vcf.gz` by calling `phase`
with `fast_method=true`, timing the call. Afterwards print the error rate: the fraction
of entries of the imputed genotype matrix that differ from `tgt_chr22.vcf.gz`.

The imputed genotypes are written to `mendel_imputed_fast_<width>.vcf.gz` in the current
directory. Returns `nothing`.

This function is deliberately not called `run`: a zero-argument `run` defined in `Main`
clashes with `Base.run`, which the Beagle cell of this notebook needs in order to launch
an external command.
"""
function run_fast()
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    # only one window width is tried here; iterate over a collection to sweep several
    for width in 1000
        println("running width = $width")
        tgtfile = "./tgt_masked_unphased_chr22.vcf.gz"
        reffile = "./ref_chr22.vcf.gz"
        outfile = "./mendel_imputed_fast_$(width).vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=true)
        X_mendel = convert_gt(Float32, outfile)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix and collect garbage before the next iteration
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run_fast()

running width = 1000


[32mImporting genotype and haplotype files...100%|██████████| Time: 0:03:59[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:28:42[39m
[32mIntersecting haplotypes...100%|█████████████████████████| Time: 0:00:01[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:52:04[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:51[39m


5247.320492 seconds (4.92 G allocations: 460.318 GiB, 0.69% gc time)
error = 5.980410550455159e-5



# Run MendelImpute (dynamic programming)

In [2]:
# Number of threads available to this Julia session.
Threads.nthreads()

4

In [None]:
# dynamic programming
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    run_dp()

Impute `tgt_masked_unphased_chr22.vcf.gz` against `ref_chr22.vcf.gz` by calling `phase`
with `fast_method=false` (the dynamic programming variant), timing the call. Afterwards
print the error rate: the fraction of entries of the imputed genotype matrix that differ
from `tgt_chr22.vcf.gz`.

The imputed genotypes are written to `mendel_imputed_dp_<width>.vcf.gz` in the current
directory, so the result of the fast method (`mendel_imputed_fast_<width>.vcf.gz`) is not
overwritten. Returns `nothing`.
"""
function run_dp()
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    # only one window width is tried here; iterate over a collection to sweep several
    for width in 1000
        println("running width = $width")
        tgtfile = "./tgt_masked_unphased_chr22.vcf.gz"
        reffile = "./ref_chr22.vcf.gz"
        outfile = "./mendel_imputed_dp_$(width).vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix and collect garbage before the next iteration
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run_dp()

running width = 1000


[32mImporting genotype and haplotype files...100%|██████████| Time: 0:04:14[39m
[32mComputing optimal haplotype pairs... 41%|██████▎        |  ETA: 0:18:51[39m

# Run Beagle 5.0

In [13]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    run_beagle()

Launch Beagle 5.0 (`beagle.28Sep18.793.jar`) as an external `java` process on
`tgt_masked_unphased_chr22.vcf.gz` with `ref_chr22.vcf.gz` as the reference panel, using
4 threads, and return the process object produced by `run`.
"""
function run_beagle()
    # NOTE: the comma after `beagle_imputed` is part of the `out=` argument, so the
    # result file is `beagle_imputed,.vcf.gz` (the name the error-rate cell reads).
    beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=tgt_masked_unphased_chr22.vcf.gz ref=ref_chr22.vcf.gz out=beagle_imputed, nthreads=4`
    return run(beagle_cmd)
end
run_beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 12:56 AM PDT on 06 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=tgt_masked_unphased_chr22.vcf.gz
  ref=ref_chr22.vcf.gz
  out=beagle_imputed,
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,004
Study samples:             500

Window 1 (22:16050115-51244237)
Reference markers:     644,939
Study markers:         644,939

Burnin  iteration 1:           3 minutes 25 seconds
Burnin  iteration 2:           3 minutes 59 seconds
Burnin  iteration 3:           5 minutes 13 seconds
Burnin  iteration 4:           6 minutes 47 seconds
Burnin  iteration 5:           8 minutes 20 seconds
Burnin  iteration 6:           8 minutes 55 seconds

Phasing iteration 1:           8 minutes 47 seconds
Phasing iteration 2:           8 minutes 37 seconds
Phasing iteration 3:           

Process(`[4mjava[24m [4m-Xmx15g[24m [4m-jar[24m [4mbeagle.28Sep18.793.jar[24m [4mgt=tgt_masked_unphased_chr22.vcf.gz[24m [4mref=ref_chr22.vcf.gz[24m [4mout=beagle_imputed,[24m [4mnthreads=4[24m`, ProcessExited(0))

In [14]:
# Error rate of Beagle: fraction of genotype entries in the imputed file that differ
# from the complete (unmasked) target file.
X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
# The comma in the file name is intentional: the Beagle command was run with
# `out=beagle_imputed,`, so that is the prefix of the file it wrote.
X_beagle = convert_gt(Float32, "beagle_imputed,.vcf.gz")
n, p = size(X_complete)
println("error = $(sum(X_beagle .!= X_complete) / n / p)")

error = 5.513389638399911e-5


## Check number of haplotype pairs

In [4]:
# Compute the haplotype set for every target sample with window width 2000, to inspect
# how many candidate haplotype pairs are kept per window.
tgtfile = "./tgt_masked_unphased_chr22.vcf.gz"
reffile = "./ref_chr22.vcf.gz"
width = 2000
# NOTE(review): `trans=true` presumably returns the matrices transposed (SNPs along the
# first dimension) — confirm against VCFTools.
X = convert_gt(Float32, tgtfile, trans=true)
H = convert_ht(Float32, reffile, trans=true)
# `fast_method=false` presumably selects the dynamic programming variant, as in `phase`.
# `hs` is displayed as a 500-element nested array: `hs[i][w]` is a vector of
# `(Int, Int)` tuples.
hs = compute_optimal_halotype_set(X, H, width = width, fast_method=false)


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:24:28[39m


500-element Array{Array{Array{Tuple{Int64,Int64},1},1},1}:
 [[(689, 2734)], [(540, 3777)], [(539, 1130)], [(530, 951)], [(354, 3644)], [(101, 2768)], [(828, 1381)], [(75, 152)], [(730, 3974)], [(2668, 2729)]  …  [(625, 925), (2822, 925), (3771, 925)], [(77, 103), (1605, 103)], [(50, 3812)], [(254, 2760), (703, 2760), (2801, 2760)], [(70, 2570)], [(709, 3484)], [(48, 2760)], [(48, 146)], [(275, 2756)], [(640, 2756)]]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [24]:
# For each sample, the largest number of candidate haplotype pairs found in any window.
# Iterates over `hs` itself instead of a hard-coded `1:500`, so it stays correct when the
# number of target samples changes.
[maximum(length, windows) for windows in hs]

500-element Array{Int64,1}:
  60
  44
  96
  42
  68
 203
  48
 295
 144
 192
 240
 120
 248
   ⋮
 160
 108
 408
  46
  96
  56
  24
  71
  42
  32
  33
  27

In [25]:
# Same computation as for width 2000, repeated with window width 500 to see how the
# number of candidate haplotype pairs per window changes.
tgtfile = "./tgt_masked_unphased_chr22.vcf.gz"
reffile = "./ref_chr22.vcf.gz"
width = 500
# NOTE(review): `trans=true` presumably returns the matrices transposed (SNPs along the
# first dimension) — confirm against VCFTools.
X = convert_gt(Float32, tgtfile, trans=true)
H = convert_ht(Float32, reffile, trans=true)
# `hs` is displayed as a 500-element nested array: `hs[i][w]` is a vector of
# `(Int, Int)` tuples.
hs = compute_optimal_halotype_set(X, H, width = width, fast_method=false)

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:24:52[39m


500-element Array{Array{Array{Tuple{Int64,Int64},1},1},1}:
 [[(26, 463), (3663, 463)], [(722, 928)], [(858, 2706)], [(2858, 3944)], [(606, 973)], [(1270, 3490)], [(2389, 3698)], [(131, 539), (1148, 539), (1179, 539), (2762, 539), (3714, 539)], [(21, 539), (27, 539), (59, 539), (62, 539), (100, 539), (133, 539), (148, 539), (159, 539), (180, 539), (278, 539)  …  (3684, 539), (3691, 539), (3696, 539), (3716, 539), (3749, 539), (3778, 539), (3780, 539), (3807, 539), (3818, 539), (4002, 539)], [(14, 539), (62, 539), (180, 539), (607, 539), (840, 539), (1375, 539), (1380, 539), (3716, 539), (3828, 539)]  …  [(30, 56), (30, 260), (30, 353), (30, 812), (30, 844), (30, 881), (30, 918), (30, 928), (30, 937), (30, 1119)  …  (3843, 2264), (3843, 2516), (3843, 2560), (3843, 2725), (3843, 2728), (3843, 2774), (3843, 3702), (3843, 3725), (3843, 3726), (3843, 3895)], [(2763, 848), (2569, 3839), (1501, 849), (3968, 816), (452, 680), (453, 816), (921, 3484), (3927, 139), (1494, 816), (3748, 2772)  …  (

In [30]:
# Indices of the windows of the first sample that hold more than 900 candidate pairs.
findall(npairs -> npairs > 900, map(length, hs[1]))

315-element Array{Int64,1}:
   25
   26
   42
   47
   70
   75
   76
   77
   91
   93
   98
  102
  103
    ⋮
 1263
 1266
 1267
 1268
 1276
 1280
 1282
 1287
 1291
 1296
 1298
 1301