# Comparison of MendelImpute against Minimac4 and Beagle5 on 1000 genome project

**Dynamic programming memory requirement:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000$ (max haplotypes per win) $* 16 bytes$ of RAM

**Fast method memory requirement:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * snps$ bits of RAM

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108


# Filter data into target and reference data

+ `ref_chr22.vcf.gz`: haplotype reference files
+ `tgt_chr22.vcf.gz`: complete genotype information
+ `tgt_masked_chr22.vcf.gz`: the same as `tgt_chr22.vcf.gz` except some entries are masked
+ `tgt_masked_unphased_chr22.vcf.gz`: the same as `tgt_chr22.vcf.gz` except some entries are masked and heterozygote genotypes (1|0 or 0|1) are all changed to 1/0. 

In [3]:
"""
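    filter_and_mask(data::String, samples::Int; missingprop::Float64 = 0.01)

Creates reference haplotypes and (unphased) target genotype files from `data`. Duplicate
records (SNPs) are filtered out as well.

# Inputs
- `data`: The full (phased) data
- `samples`: Number of samples (genotypes) desired in target file. Remaining haplotypes will become the reference panel

# Optional Inputs
- `missingprop`: Probability with which each entry of the target genotypes is masked (converted to missing). Defaults to `0.01`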
"""
function filter_and_mask(data::String, samples::Int; missingprop::Float64 = 0.01)
    n = nsamples(data)
    samples > n && error("requested samples exceed total number of genotypes in $data.")
    # `missingprop` is compared against `rand()` below, so it has to be a probability
    0.0 <= missingprop <= 1.0 ||
        throw(ArgumentError("missingprop must be between 0 and 1, got $missingprop"))

    # output filenames (tgt_chr22.vcf.gz, ref_chr22.vcf.gz, and tgt_masked_chr22.vcf.gz)
    # The chromosome label is read from the file name itself. Taking the 4th '.'-separated
    # token of the whole path only yields the label when the path starts with "../"
    # (the two leading dots shift the index), so that rule is kept only as a fallback.
    label = match(r"chr[0-9A-Za-z]+", basename(data))
    if label === nothing
        tokens = split(data, '.')
        length(tokens) >= 4 ||
            throw(ArgumentError("cannot derive output file names from $data"))
        file = tokens[4] * ".vcf.gz"
    else
        file = label.match * ".vcf.gz"
    end
    tgt = "./tgt_" * file
    ref = "./ref_" * file
    tgt_mask = "./tgt_masked_" * file
    tgt_mask_unphase = "./tgt_masked_unphased_" * file

    # compute target and reference index
    println("computing tgt/ref/record index")
    tgt_index = falses(n)
    tgt_index[sample(1:n, samples, replace=false)] .= true
    ref_index = .!tgt_index # every sample not drawn for the target goes to the reference
    record_index = .!find_duplicate(data) # save only unique records (SNPs)

    # generate masking matrix: each entry is independently set to true (= convert to
    # missing) with probability `missingprop` (a proportion, not a percentage)
    p = sum(record_index)
    masks = falses(p, samples)
    for j in 1:samples, i in 1:p
        rand() < missingprop && (masks[i, j] = true)
    end

    # create outputs
    println("generating target file")
    VCFTools.filter(data, record_index, tgt_index, des = tgt)
    println("generating reference file")
    VCFTools.filter(data, record_index, ref_index, des = ref)
    println("generating masked file")
    mask_gt(tgt, masks, des=tgt_mask)

    # finally, unphase the target data
    return unphase(tgt_mask, outfile=tgt_mask_unphase)
end

filter_and_mask

In [11]:
# Seed the global RNG: filter_and_mask draws the target samples and the masked entries at random.
Random.seed!(2020)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Path is relative to the working directory set by `cd` above.
chr22 = "../raw/ALL.chr22.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
@time filter_and_mask(chr22, 500) # 500 imputation samples, remaining 2004 used as reference

computing tgt/ref/record index
generating target file
generating reference file
generating masked file


[32mCreating ./tgt_masked_unphased_chr22.vcf.gz...100%|█████| Time: 0:01:56[39m


2101.643985 seconds (24.78 G allocations: 2.084 TiB, 9.99% gc time)


In [4]:
# Seed the global RNG: filter_and_mask draws the target samples and the masked entries at random.
Random.seed!(2020)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Path is relative to the working directory set by `cd` above.
chr20 = "../raw/ALL.chr20.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
@time filter_and_mask(chr20, 500) # 500 imputation samples, remaining 2004 used as reference

computing tgt/ref/record index
generating target file
generating reference file
generating masked file


[32mCreating ./tgt_masked_unphased_chr20.vcf.gz...100%|█████| Time: 0:02:36[39m


3030.980141 seconds (39.83 G allocations: 3.342 TiB, 9.41% gc time)


In [5]:
# Seed the global RNG: filter_and_mask draws the target samples and the masked entries at random.
Random.seed!(2020)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Path is relative to the working directory set by `cd` above.
chr21 = "../raw/ALL.chr21.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
@time filter_and_mask(chr21, 500) # 500 imputation samples, remaining 2004 used as reference

computing tgt/ref/record index
generating target file
generating reference file
generating masked file


[32mCreating ./tgt_masked_unphased_chr21.vcf.gz...100%|█████| Time: 0:01:37[39m


1735.602025 seconds (24.84 G allocations: 2.086 TiB, 7.54% gc time)


# Run MendelImpute (unique happairs only)

In [2]:
# Record how many threads this Julia session has, for reference alongside the timings below.
Threads.nthreads()

8

In [4]:
# unique happairs only method (8 threads, no bkpt search, 0 overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# For each window width: impute the masked/unphased chr22 target with `phase`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (500, 1000)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, unique_only=true,
            flankwidth=0)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the next width is processed
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:08[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:20[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:05:38[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:50[39m


707.969658 seconds (4.92 G allocations: 443.617 GiB, 5.08% gc time)
error = 7.152614433302995e-5

running fast method, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:09[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:22[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:07:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:51[39m


799.512207 seconds (4.90 G allocations: 447.684 GiB, 4.44% gc time)
error = 6.803744230074472e-5



In [8]:
# unique happairs only method (4 threads, no bkpt search, 10% overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# For each window width: impute the masked/unphased chr22 target with `phase`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (500, 1000)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, unique_only=true)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the next width is processed
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:10[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:36[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:10:15[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:51[39m


1013.368178 seconds (4.91 G allocations: 446.391 GiB, 3.60% gc time)
error = 6.651171661195866e-5

running fast method, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:07[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:34[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:12:18[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:47[39m


1123.341832 seconds (4.90 G allocations: 451.233 GiB, 3.06% gc time)
error = 6.679391384301462e-5



In [3]:
# unique happairs only method (4 threads, no bkpt search, 250 snps overlap in each window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# For each window width: impute the masked/unphased chr22 target with `phase`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (500, 1000)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, unique_only=true,
            flankwidth=250)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the next width is processed
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:21[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:19:45[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:54[39m


1583.703778 seconds (4.93 G allocations: 462.818 GiB, 2.26% gc time)
error = 6.261987567816491e-5

running fast method, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:08[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:26[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:15:16[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:51[39m


1303.977857 seconds (4.90 G allocations: 455.191 GiB, 2.74% gc time)
error = 6.947943914075595e-5



In [6]:
# unique happairs only method (8 threads, only single bkpt search, 10% overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# For each window width: impute the masked/unphased chr22 target with `phase`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (500, 1000)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, unique_only=true)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the next width is processed
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:11[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:36[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:07:10[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:48[39m


820.594466 seconds (4.91 G allocations: 446.670 GiB, 4.94% gc time)
error = 6.590390719122273e-5

running fast method, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:11[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:34[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:09:22[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:59[39m


963.145339 seconds (4.90 G allocations: 450.530 GiB, 4.67% gc time)
error = 6.587599757496445e-5



In [3]:
# unique happairs only method (8 threads, search both strand bkpts, 10% overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Impute the masked/unphased chr22 target with `phase` at a single window width, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (500,)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, unique_only=true)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the garbage collector runs
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:16[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:07:05[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:31:42[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:49[39m


2694.200313 seconds (4.93 G allocations: 448.141 GiB, 1.26% gc time)
error = 6.76560108785482e-5



# Run MendelImpute (fast)

In [2]:
# Record how many threads this Julia session has, for reference alongside the timings below.
Threads.nthreads()

8

In [3]:
# fast method
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Impute the masked/unphased chr22 target with `phase(...; fast_method=true)`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (1000,)
        println("running fast method, width = $width")
        imputed = "./mendel_imputed_fast_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, fast_method=true)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the garbage collector runs
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run()

running fast method, width = 1000


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:04[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:15[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:27:23[39m
[32mIntersecting haplotypes...100%|█████████████████████████| Time: 0:00:01[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:25:18[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:47[39m


3525.422826 seconds (4.92 G allocations: 451.743 GiB, 0.89% gc time)
error = 5.980410550455159e-5



# Run MendelImpute (dynamic programming)

In [2]:
# Record how many threads this Julia session has, for reference alongside the timings below.
Threads.nthreads()

8

In [3]:
# dynamic programming without searching bkpts
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Impute the masked/unphased chr22 target with `phase(...; fast_method=false)`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run_dp()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (1000,)
        println("running dynamic programming, width = $width")
        imputed = "./mendel_imputed_dp_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, fast_method=false)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the garbage collector runs
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run_dp()

running dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:02:17[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:08:33[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:41[39m


873.752105 seconds (4.92 G allocations: 453.692 GiB, 3.73% gc time)
error = 6.543874692025138e-5



In [3]:
# dynamic programming 
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Impute the masked/unphased chr22 target with `phase(...; fast_method=false)`, time the
# call, and print the fraction of entries that differ from the complete target genotypes.
function run_dp()
    # the inputs do not depend on the width, so name them once
    target = "./tgt_masked_unphased_chr22.vcf.gz"
    panel = "./ref_chr22.vcf.gz"
    X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
    n, p = size(X_complete)
    for width in (1000,)
        println("running dynamic programming, width = $width")
        imputed = "./mendel_imputed_dp_$(width).vcf.gz"
        @time phase(target, panel; outfile=imputed, width=width, fast_method=false)
        X_mendel = convert_gt(Float32, imputed)
        println("error = $(sum(X_mendel .!= X_complete) / n / p)")
        # drop the imputed matrix before the garbage collector runs
        X_mendel = nothing
        GC.gc()
        println("")
    end
end
run_dp()

running width = 1000


[32mImporting genotype and haplotype files...100%|██████████| Time: 0:04:14[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:29:11[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:30:50[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:48[39m


4012.073367 seconds (4.91 G allocations: 461.667 GiB, 0.94% gc time)
error = 5.823806592561467e-5



# Run Beagle 5.0

In [13]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Launch Beagle 5 as an external java process on the masked/unphased chr22 target with the
# chr22 reference panel; returns whatever `run` returns for the command.
run_beagle() = run(`java -Xmx15g -jar beagle.28Sep18.793.jar gt=tgt_masked_unphased_chr22.vcf.gz ref=ref_chr22.vcf.gz out=beagle_imputed nthreads=4`)

run_beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 12:56 AM PDT on 06 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=tgt_masked_unphased_chr22.vcf.gz
  ref=ref_chr22.vcf.gz
  out=beagle_imputed,
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,004
Study samples:             500

Window 1 (22:16050115-51244237)
Reference markers:     644,939
Study markers:         644,939

Burnin  iteration 1:           3 minutes 25 seconds
Burnin  iteration 2:           3 minutes 59 seconds
Burnin  iteration 3:           5 minutes 13 seconds
Burnin  iteration 4:           6 minutes 47 seconds
Burnin  iteration 5:           8 minutes 20 seconds
Burnin  iteration 6:           8 minutes 55 seconds

Phasing iteration 1:           8 minutes 47 seconds
Phasing iteration 2:           8 minutes 37 seconds
Phasing iteration 3:           

Process(`[4mjava[24m [4m-Xmx15g[24m [4m-jar[24m [4mbeagle.28Sep18.793.jar[24m [4mgt=tgt_masked_unphased_chr22.vcf.gz[24m [4mref=ref_chr22.vcf.gz[24m [4mout=beagle_imputed,[24m [4mnthreads=4[24m`, ProcessExited(0))

In [14]:
# Load the complete target genotypes and Beagle's imputed output as numeric matrices.
X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
X_beagle = convert_gt(Float32, "beagle_imputed.vcf.gz")
n, p = size(X_complete)
# Error = number of entries that differ from the complete data, divided by n * p.
println("error = $(sum(X_beagle .!= X_complete) / n / p)")

error = 5.513389638399911e-5


# Run Minimac 4

In [3]:
# minimac 4: beagle4's prephasing increases total number of snps, so just use true phase info
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")
# Target is the masked (not the unphased) file; the reference is the m3vcf version of the panel.
# NOTE(review): the recorded output shows this spawn failing with ENOENT, so presumably the
# `minimac4` executable was not found from the notebook; the run was repeated in a terminal.
run(`minimac4 --refHaps ref_chr22.m3vcf.gz --haps tgt_masked_chr22.vcf.gz --prefix minimac_imputed_chr22 --format GT --cpus 4`)

Base.IOError: IOError: could not spawn `minimac4 --refHaps ref_chr22.m3vcf.gz --haps tgt_masked_chr22.vcf.gz --prefix minimac_imputed_chr22 --format GT --cpus 4`: no such file or directory (ENOENT)

## Minimac4 output (ran in terminal)

```(base) $ minimac4 --refHaps ref_chr22.m3vcf.gz --haps tgt_masked_chr22.vcf.gz --prefix minimac_imputed_chr22 --format GT --cpus 4```


 --------------------------------------------------------------------------------
          Minimac4 - Fast Imputation Based on State Space Reduction HMM
 --------------------------------------------------------------------------------
           (c) 2014 - Sayantan Das, Christian Fuchsberger, David Hinds
                             Mary Kate Wing, Goncalo Abecasis

 Version: 1.0.2;
 Built: Mon Sep 30 11:52:22 PDT 2019 by biona001

 URL = http://genome.sph.umich.edu/wiki/Minimac4

 Command Line Options:
       Reference Haplotypes : --refHaps [ref_chr22.m3vcf.gz], --passOnly,
                              --rsid, --referenceEstimates [ON],
                              --mapFile [docs/geneticMapFile.b38.map.txt.gz]
          Target Haplotypes : --haps [tgt_masked_chr22.vcf.gz]
          Output Parameters : --prefix [minimac_imputed_chr22], --estimate,
                              --nobgzip, --vcfBuffer [200], --format [GT],
                              --allTypedSites, --meta, --memUsage
        Chunking Parameters : --ChunkLengthMb [20.00], --ChunkOverlapMb [3.00]
          Subset Parameters : --chr [], --start, --end, --window
   Approximation Parameters : --minimac3, --probThreshold [0.01],
                              --diffThreshold [0.01], --topThreshold [0.01]
           Other Parameters : --log, --help, --cpus [4], --params
                  PhoneHome : --noPhoneHome, --phoneHomeThinning [50]

 Starting Main Imputation/Estimation Analysis ...

 Performing preliminary check on input parameters...

 ------------------------------------------------------------------------------
                             PRELIMINARY FILE CHECK
 ------------------------------------------------------------------------------

 Checking GWAS haplotype file : tgt_masked_chr22.vcf.gz

 Gathering variant information ...

 Successful !!!

 Checking Reference haplotype file : ref_chr22.m3vcf.gz

 Gathering variant information ...

 Successful !!!

 Reference Panel   : Found 2004 samples (4008 haplotypes) and 644939 variants ...

 Target/GWAS Panel : Found 500 samples (1000 haplotypes) and 644939 variants ...
                     644939 variants overlap with Reference panel
                     0 variants imported that exist only in Target/GWAS panel

 ------------------------------------------------------------------------------
                           CHUNKING INFORMATION
 ------------------------------------------------------------------------------

 Chunking region into 1 chunk(s) with atleast 644939 variants in each chunk ...

 Details of chunks is given below ...

 No   LeftBuffer      LeftEnd   RightPoint  RightBuffer       #Sites(GWAS/Ref/%)
 -------------------------------------------------------------------------------
  1     16050115     16050115     51244237     51244237   644939/  644939/ 100.00%


 ------------------------------------------------------------------------------
                           MAIN IMPUTATION ANALYSIS
 ------------------------------------------------------------------------------

 Starting imputation analysis of 1 chunk(s) ...

 -------------------------------------------
 Analyzing Chunk 1/1 [22:16050115-51244237]
 -------------------------------------------

 Reading chunk from reference panel ...
 Reading chunk from target/GWAS panel ...

 Compressing reference panel at GWAS sites ...
 Re-compression successful (1079 seconds) !!!

 Starting Imputation ...

  Imputing Samples 1-200 [40%] out of 500 samples ...
       Saving Samples in temporary VCF file ...
  Imputing Samples 201-400 [80%] out of 500 samples ...
       Saving Samples in temporary VCF file ...
  Imputing Samples 401-500 [100%] out of 500 samples ...
       Saving Samples in temporary VCF file ...

 Imputation successful (9418 seconds) !!!

 Appending chunk to final output VCF File :  minimac_imputed_chr22.dose.vcf.gz
 Appending successful (19 seconds) !!!

 Time taken for this chunk = 10597 seconds.

 ------------------------------------------------------------------------------
                              SUMMARY OF ANALYSIS
 ------------------------------------------------------------------------------

 Info file written to                    : minimac_imputed_chr22.info
 Imputed VCF information written to      : minimac_imputed_chr22.dose.vcf.gz

 Time Taken for Reading File             = 114 seconds
 Time Taken for Re-compression           = 1079 seconds
 Time Taken for Imputation               = 9373 seconds
 Time Taken for Writing File             = 64 seconds

 ------------------------------------------------------------------------------
                                END OF PROGRAM
 ------------------------------------------------------------------------------

 Program Successfully Implemented...

 Total Run completed in 2 hours, 57 mins, 27 seconds.

 Thank You for using Minimac4 !!!

In [3]:
# Load the complete target genotypes and Minimac4's imputed output as numeric matrices.
X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz")
X_minimac = convert_gt(Float32, "minimac_imputed_chr22.dose.vcf.gz")
n, p = size(X_complete)
# Error = number of entries that differ from the complete data, divided by n * p.
println("error = $(sum(X_minimac .!= X_complete) / n / p)")

error = 0.001040730983860489


# Prephase using beagle 4.1

In [None]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/split")

# Launch Beagle 4.1 as an external java process with `impute=false niterations=0` on the
# masked/unphased chr22 target; output prefix is tgt_masked_prephased_chr22.
prephase() = run(`java -Xmx42g -jar beagle.27Jan18.7e1.jar gt=tgt_masked_unphased_chr22.vcf.gz ref=ref_chr22.vcf.gz out=tgt_masked_prephased_chr22 niterations=0 impute=false nthreads=4`)

prephase()

beagle.27Jan18.7e1.jar (version 4.1)
Copyright (C) 2014-2015 Brian L. Browning
Enter "java -jar beagle.27Jan18.7e1.jar" for a summary of command line arguments.
Start time: 08:50 PM PDT on 16 Apr 2020

Command line: java -Xmx38229m -jar beagle.jar
  gt=tgt_masked_unphased_chr22.vcf.gz
  ref=ref_chr22.vcf.gz
  out=tgt_masked_prephased_chr22
  niterations=0
  impute=false
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

reference samples:    2004
target samples:        500

Window 1 [ 22:16050115-19217267 ]
reference markers:   50000
target markers:      50000

Starting burn-in iterations

Window=1 Iteration=1
Time for building model:         13 minutes 3 seconds
Time for sampling (singles):     1 minute 55 seconds
DAG statistics
mean edges/level: 243    max edges/level: 515
mean edges/node:  1.046  mean count/edge: 21

Window=1 Iteration=2
Time for building model:         12 minutes 17 seconds
Time for sampling (singles):     1 minute 3 seconds
DAG statistics
mean edges/lev

ProcessFailedException: failed process: Process(`java -Xmx42g -jar beagle.27Jan18.7e1.jar gt=tgt_masked_unphased_chr22.vcf.gz ref=ref_chr22.vcf.gz out=tgt_masked_prephased_chr22 niterations=0 impute=false nthreads=4`, ProcessExited(130)) [130]


## Scripts for submitting jobs on Hoffman

In [None]:
# convert to m3vcf file using minimac3
cd("/u/home/b/biona001/haplotype_comparisons/data") 
# Write a temporary job script, submit it with qsub, and delete it afterwards. The
# try/finally guarantees the script is removed even when writing or submission fails.
try
    open("minimac3.sh", "w") do io
        # job header
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l exclusive,h_rt=24:00:00,h_data=20G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        # log where and when the job started
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load R/3.5.1")
        println(io, "module load java/1.8.0_111")
        println(io)
        # the actual work: one Minimac3 --processReference call
        println(io, "echo \'converting chr22 from vcf.gz to m3vcf.gz\' ")
        println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps chr22.uniqueSNPs.vcf.gz --processReference --prefix chr22.uniqueSNPs")
        # alternative inputs (disabled):
        # println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref_chr21.vcf.gz --processReference --prefix ref_chr21")
        # println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref_chr20.vcf.gz --processReference --prefix ref_chr20")
        println(io)
        # log where and when the job ended
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub minimac3.sh`)
finally
    rm("minimac3.sh", force=true)
end

In [None]:
# run mendel dp and fast method
cd("/u/home/b/biona001/haplotype_comparisons/data") 
# One job per window width: write a temporary job script, submit it with qsub, and delete
# it. The try/finally guarantees the script is removed even when submission fails.
for w in [400, 800, 1200, 1600]
    try
        open("mendel.sh", "w") do io
            # job header
            println(io, "#!/bin/bash")
            println(io, "#\$ -cwd")
            println(io, "# error = Merged with joblog")
            println(io, "#\$ -o joblog.\$JOB_ID")
            println(io, "#\$ -j y")
            println(io, "#\$ -l exclusive,h_rt=24:00:00,h_data=20G")
            println(io, "# Email address to notify")
            println(io, "#\$ -M \$USER@mail")
            println(io, "# Notify when")
            println(io, "#\$ -m a")
            println(io)
            # log where and when the job started
            println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
            println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
            println(io)
            println(io, "# load the job environment:")
            println(io, ". /u/local/Modules/default/init/modules.sh")
            println(io, "module load julia/1.2.0")
            println(io, "module load java/1.8.0_111")
            println(io)
            # the actual work: mendel.jl receives the window width as its only argument
            println(io, "echo \'run mendel (dp and fast method) on chr 20, 21, 22. width = $w on 4 threads\' ")
            println(io, "export JULIA_NUM_THREADS=4")
            println(io, "julia mendel.jl $w")
            println(io)
            # log where and when the job ended
            println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
            println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
            println(io)
        end
        # submit job
        run(`qsub mendel.sh`)
    finally
        rm("mendel.sh", force=true)
    end
end

In [None]:
# run beagle 5
cd("/u/home/b/biona001/haplotype_comparisons/data") 
# Write a temporary job script, submit it with qsub, and delete it afterwards. The
# try/finally guarantees the script is removed even when writing or submission fails.
try
    open("beagle.sh", "w") do io
        # job header
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l exclusive,h_rt=24:00:00,h_data=20G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        # log where and when the job started
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load java/1.8.0_111")
        println(io)
        # the logged thread count matches the `nthreads=4` passed to beagle below
        println(io, "echo \'run beagle 5.0 on chr 20, 21 with 4 threads\' ")
        println(io, "java -Xmx42g -jar beagle5.0.jar gt=tgt_masked_unphased_chr20.vcf.gz ref=ref_chr20.vcf.gz out=beagle_imputed_chr20 nthreads=4")
        println(io, "java -Xmx42g -jar beagle5.0.jar gt=tgt_masked_unphased_chr21.vcf.gz ref=ref_chr21.vcf.gz out=beagle_imputed_chr21 nthreads=4")
        println(io)
        # log where and when the job ended
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub beagle.sh`)
finally
    rm("beagle.sh", force=true)
end

In [None]:
# prephase using beagle 4 (this takes like 3 days or so per chrom)
cd("/u/home/b/biona001/haplotype_comparisons/data") 
# Write a temporary job script, submit it with qsub, and delete it afterwards. The
# try/finally guarantees the script is removed even when writing or submission fails.
try
    open("beagle.sh", "w") do io
        # job header
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l highp,h_rt=200:00:00,h_data=25G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        # log where and when the job started
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load java/1.8.0_111")
        println(io)
        # the actual work: beagle 4.1 with impute=false, chr20 only
        println(io, "echo \'prephase using beagle 4.1 on chr 20, 4 threads\' ")
        println(io, "java -Xmx20g -jar beagle4.1.jar gt=tgt_masked_unphased_chr20.vcf.gz ref=ref_chr20.vcf.gz out=tgt_masked_prephased_chr20 niterations=0 impute=false nthreads=4")
        # other chromosomes (disabled):
        # println(io, "java -Xmx20g -jar beagle4.1.jar gt=tgt_masked_unphased_chr21.vcf.gz ref=ref_chr21.vcf.gz out=tgt_masked_prephased_chr21 niterations=0 impute=false nthreads=4")
        # println(io, "java -Xmx20g -jar beagle4.1.jar gt=tgt_masked_unphased_chr22.vcf.gz ref=ref_chr22.vcf.gz out=tgt_masked_prephased_chr22 niterations=0 impute=false nthreads=4")
        println(io)
        # log where and when the job ended
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub beagle.sh`)
finally
    rm("beagle.sh", force=true)
end

In [None]:
# run minimac 4
cd("/u/home/b/biona001/haplotype_comparisons/data") 
# Write a temporary job script, submit it with qsub, and delete it afterwards. The
# try/finally guarantees the script is removed even when writing or submission fails.
try
    open("minimac.sh", "w") do io
        # job header
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l arch=intel-X5650,exclusive,h_rt=24:00:00,h_data=20G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        # log where and when the job started
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load java/1.8.0_111")
        println(io)
        # the logged thread count matches the `--cpus 12` passed to minimac4 below
        println(io, "echo \'run minimac4 on chr 22 with 12 threads\' ")
        # other chromosomes, using prephased targets (disabled):
        # println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac4/build/minimac4 --refHaps ref_chr20.m3vcf.gz --haps tgt_masked_prephased_chr20.vcf.gz --prefix minimac_imputed_chr20 --format GT --cpus 12")
        # println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac4/build/minimac4 --refHaps ref_chr21.m3vcf.gz --haps tgt_masked_prephased_chr21.vcf.gz --prefix minimac_imputed_chr21 --format GT --cpus 12")
        println(io, "/u/home/b/biona001/haplotype_comparisons/Minimac4/build/minimac4 --refHaps ref_chr22.m3vcf.gz --haps tgt_masked_chr22.vcf.gz --prefix minimac_imputed_chr22 --format GT --cpus 12")
        println(io)
        # log where and when the job ended
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub minimac.sh`)
finally
    rm("minimac.sh", force=true)
end

# Error rate

## Chr 20
+ mendel = 2 hours 39 minutes (1 core, width = 400)
+ beagle = 8 hours 42 minutes (4 cores)

In [6]:
# chr20
# complete genotypes first; n and p normalise every mismatch count below
X_complete = convert_gt(Float32, "tgt_chr20.vcf.gz", msg="importing ")
n, p = size(X_complete)

# MendelImpute outputs for the four window widths
X_mendel_400 = convert_gt(Float32, "mendel_imputed_fast_chr20_400.vcf.gz", msg="importing ")
X_mendel_800 = convert_gt(Float32, "mendel_imputed_fast_chr20_800.vcf.gz", msg="importing ")
X_mendel_1200 = convert_gt(Float32, "mendel_imputed_fast_chr20_1200.vcf.gz", msg="importing ")
X_mendel_1600 = convert_gt(Float32, "mendel_imputed_fast_chr20_1600.vcf.gz", msg="importing ")
# Beagle output
X_beagle = convert_gt(Float32, "beagle_imputed_chr20.vcf.gz", msg="importing ")

# error = fraction of entries that differ from the complete genotypes
mendel_400_error = sum(X_complete .!= X_mendel_400) / n / p
mendel_800_error = sum(X_complete .!= X_mendel_800) / n / p
mendel_1200_error = sum(X_complete .!= X_mendel_1200) / n / p
mendel_1600_error = sum(X_complete .!= X_mendel_1600) / n / p
beagle_error = sum(X_complete .!= X_beagle) / n / p

# one `name = value` line per argument; the last value is also the value of the cell
@show mendel_400_error mendel_800_error mendel_1200_error mendel_1600_error beagle_error

[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:26[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:31[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:37[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:39[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:39[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:37[39m


mendel_400_error = 4.8138136864388806e-5
mendel_800_error = 4.594022449554055e-5
mendel_1200_error = 4.7831585928733656e-5
mendel_1600_error = 5.1918931737469005e-5
beagle_error = 3.673020047274396e-5


3.673020047274396e-5

In [3]:
# chr21: import the complete genotypes and both imputed results via convert_gt (Float32)
X_complete = convert_gt(Float32, "tgt_chr21.vcf.gz"; msg="importing ")
X_mendel_400 = convert_gt(Float32, "mendel_imputed_fast_chr21_400.vcf.gz"; msg="importing ")
X_beagle = convert_gt(Float32, "beagle_imputed_chr21.vcf.gz"; msg="importing ")

# error rate = count of entries that differ from X_complete, divided by n and then by p
n = size(X_complete, 1)
p = size(X_complete, 2)
mendel_400_error = sum(X_mendel_400 .!= X_complete) / n / p
beagle_error = sum(X_beagle .!= X_complete) / n / p

# print each rate as `name = value`; the cell's value is the last one shown
@show(mendel_400_error, beagle_error)

[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:00[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:02[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:03[39m


mendel_400_error = 5.7447779315891646e-5
beagle_error = 5.55236762124247e-5


5.55236762124247e-5

In [7]:
# chr22: import the complete genotypes, both MendelImpute outputs (fast and dp), and Beagle
X_complete = convert_gt(Float32, "tgt_chr22.vcf.gz"; msg="importing ")
X_mendel_1000_fast = convert_gt(Float32, "mendel_imputed_fast_chr22_1000.vcf.gz"; msg="importing ")
X_mendel_1000_dp = convert_gt(Float32, "mendel_imputed_dp_chr22_1000.vcf.gz"; msg="importing ")
X_beagle = convert_gt(Float32, "beagle_imputed_chr22.vcf.gz"; msg="importing ")

# error rate = count of entries that differ from X_complete, divided by n and then by p
n = size(X_complete, 1)
p = size(X_complete, 2)
mendel_1000_fast_error = sum(X_mendel_1000_fast .!= X_complete) / n / p
mendel_1000_dp_error = sum(X_mendel_1000_dp .!= X_complete) / n / p
beagle_error = sum(X_beagle .!= X_complete) / n / p

# print each rate as `name = value`; the cell's value is the last one shown
@show(mendel_1000_fast_error, mendel_1000_dp_error, beagle_error)

[32mimporting 100%|█████████████████████████████████████████| Time: 0:00:59[39m
[32mimporting 100%|█████████████████████████████████████████| Time: 0:01:00[39m


SystemError: SystemError: opening file "mendel_imputed_dp_chr22_1000.vcf.gz": No such file or directory

In [5]:
# Print the record count reported by nrecords for each target VCF file.
@show(
    nrecords("tgt_chr20.vcf.gz"),
    nrecords("tgt_chr21.vcf.gz"),
    nrecords("tgt_chr22.vcf.gz")
)

nrecords("tgt_chr20.vcf.gz") = 1037348
nrecords("tgt_chr21.vcf.gz") = 646535
nrecords("tgt_chr22.vcf.gz") = 644939


644939