# Compare MendelImpute against Minimac4 and Beagle5 on simulated data

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays
using JLD2, FileIO, JLSO
using ProgressMeter
using GroupSlices
using ThreadPools
# using Plots
# using ProfileView

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


# Simulate data

### Step 0. Install `msprime`

[msprime download Link](https://msprime.readthedocs.io/en/stable/installation.html).

Some people might need to activate the conda environment first via `conda config --set auto_activate_base True`. You can turn it off again once the simulation is done by running `conda config --set auto_activate_base False`.


### Step 1. Simulate data in terminal

```
python3 msprime_script.py 4000 10000 5000000 2e-8 2e-8 2019 > full.vcf
```

Arguments: 
+ Number of haplotypes = 4000
+ Effective population size = 10000 ([source](https://www.the-scientist.com/the-nutshell/ancient-humans-more-diverse-43556))
+ Sequence length = 5 million (Beagle 5's choice was 10 million)
+ Recombination rate = 2e-8 (default)
+ mutation rate = 2e-8 (default)
+ seed = 2019

### Step 2: Convert simulated haplotypes to reference haplotypes and target genotype files

+ `haplo_ref.vcf.gz`: haplotype reference files
+ `target.vcf.gz`: complete genotype information
+ `target_masked.vcf.gz`: the same as `target.vcf.gz` except some entries are masked

In [4]:
# Split the simulated VCF into a reference panel and a target set, then mask a
# random subset of the target genotypes. Every file is read from and written to
# `datadir`, so the files created by the filter step are the ones read back and
# masked below.
datadir    = "./compare1"
full_vcf   = joinpath(datadir, "full.vcf")
target_vcf = joinpath(datadir, "target.vcf.gz")
ref_vcf    = joinpath(datadir, "haplo_ref.vcf.gz")
masked_vcf = joinpath(datadir, "target_masked.vcf.gz")

records, samples = nrecords(full_vcf), nsamples(full_vcf)
@show records
@show samples;

# compute target and reference index: the last `n_target` samples become the
# target set, everything before them is the reference panel
n_target = 1000
tgt_index = falses(samples)
tgt_index[samples-n_target+1:end] .= true
ref_index = .!tgt_index
record_index = trues(records) # save all records (SNPs)

# create target.vcf.gz and haplo_ref.vcf.gz
@time VCFTools.filter(full_vcf, record_index, tgt_index, des = target_vcf)
@time VCFTools.filter(full_vcf, record_index, ref_index, des = ref_vcf)

# import full target matrix. Also transpose so that columns are samples.
# Only the missingness pattern of X is used below, so the allele coding does not
# matter and no `as_minorallele` keyword is passed (VCFTools versions whose
# `convert_gt` lacks that keyword throw a MethodError on it).
@time X = convert_gt(Float32, target_vcf)
X = copy(X')

# mask 10% entries
p, n = size(X)
Random.seed!(123)
missingprop = 0.1
X .= ifelse.(rand(Float32, p, n) .< missingprop, missing, X)
masks = ismissing.(X)

# write the masked genotypes next to the complete target file
mask_gt(target_vcf, masks, des = masked_vcf)

records = 35897
samples = 2000
 70.297829 seconds (397.28 M allocations: 33.310 GiB, 11.51% gc time)
 67.404677 seconds (395.78 M allocations: 33.237 GiB, 11.69% gc time)
 18.210343 seconds (144.39 M allocations: 12.666 GiB, 17.95% gc time)


# Try compressing haplotype ref panels

In [12]:
# Build the compressed reference panel as a .jld2 file and time the call.
width = 512
reffile, tgtfile = "./compare1/haplo_ref.vcf.gz", "./compare1/target_masked.vcf.gz"
outfile = "./compare1/haplo_ref.jld2"
@time compress_haplotypes(reffile, tgtfile, outfile, width);

[32mimporting reference data...100%|████████████████████████| Time: 0:00:06[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


 19.165787 seconds (154.79 M allocations: 11.436 GiB, 7.42% gc time)


In [13]:
# Build the compressed reference panel as a .jlso file and time the call.
width = 512
reffile, tgtfile = "./compare1/haplo_ref.vcf.gz", "./compare1/target_masked.vcf.gz"
outfile = "./compare1/haplo_ref.jlso"
@time compress_haplotypes(reffile, tgtfile, outfile, width);

[32mimporting reference data...100%|████████████████████████| Time: 0:00:06[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


 18.817403 seconds (152.23 M allocations: 11.387 GiB, 7.13% gc time)


In [2]:
# compress as jlso
"""
    compress(; widths, reffile, tgtfile, outprefix)

Import the reference haplotypes and the target genotypes once, then call
`compress_haplotypes` for every window width in `widths`, timing each call.

# Keywords
- `widths`: window widths to compress for (default `[250, 500]`).
- `reffile`: reference haplotype VCF (default `"./compare1/haplo_ref.vcf.gz"`).
- `tgtfile`: masked target genotype VCF
  (default `"./compare1/target_masked.vcf.gz"`).
- `outprefix`: prefix of the output files; each width `w` is written to
  `"\$(outprefix).w\$(w).jlso"` (default `"./compare1/haplo_ref"`).

Returns `nothing`.
"""
function compress(;
    widths = [250, 500],
    reffile = "./compare1/haplo_ref.vcf.gz",
    tgtfile = "./compare1/target_masked.vcf.gz",
    outprefix = "./compare1/haplo_ref"
)
    H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile,
        trans=true, save_snp_info=true, msg="importing reference data...")
    # Of the target's SNP info only the positions are passed on below, so the
    # remaining returned fields are discarded.
    X, _, _, X_pos, _, _, _ = VCFTools.convert_gt(UInt8, tgtfile,
        trans=true, save_snp_info=true, msg = "Importing genotype file...")
    # The files are imported once above and reused for every width.
    for width in widths
        outfile = "$(outprefix).w$(width).jlso"
        @time compress_haplotypes(H, X, outfile, X_pos, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt, width)
    end
    return nothing
end
compress()

[32mimporting reference data...100%|████████████████████████| Time: 0:00:07[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


  7.138870 seconds (9.25 M allocations: 799.583 MiB, 5.66% gc time)
  2.545023 seconds (472.66 k allocations: 218.274 MiB, 1.20% gc time)


In [18]:
# load jld2
# Time how long it takes to read `compressed_Hunique` back from the .jld2 file.
@time @load "./compare1/haplo_ref.jld2" compressed_Hunique;

  0.088776 seconds (774.50 k allocations: 58.516 MiB)


In [20]:
# load jlso
# Time the read of the .jlso file, then pull the stored object out of the
# loaded collection by its `:compressed_Hunique` key.
@time loaded = JLSO.load("./compare1/haplo_ref.jlso")
compressed_Hunique = loaded[:compressed_Hunique];

  0.225266 seconds (612.38 k allocations: 38.652 MiB, 19.15% gc time)


In [21]:
;ls -al ./compare1/haplo_ref.jld2

-rw-r--r--  1 biona001  staff  23098725 Jul  2 20:58 ./compare1/haplo_ref.jld2


In [22]:
;ls -al ./compare1/haplo_ref.jlso

-rw-r--r--  1 biona001  staff  1959743 Jul  2 20:58 ./compare1/haplo_ref.jlso


In [23]:
;ls -al ./compare1/haplo_ref.vcf.gz

-rw-r--r--@ 1 biona001  staff  5449864 Apr  5 19:59 ./compare1/haplo_ref.vcf.gz


# Haplotype thinning

In [4]:
# Haplotype thinning without allele-frequency scaling: phase, then score.
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width = 512
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    thinning_factor = 100,
    thinning_scale_allelefreq = false,
    max_haplotypes = 100,
);

# Read the imputed genotypes back, remove the temporary output file, and report
# the fraction of entries that differ from the complete target genotypes.
X_mendel = convert_gt(Float32, outfile)
rm(outfile; force = true)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mPhasing chunk 1/1...100%|███████████████████████████████| Time: 0:00:15[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.55815 seconds
    Computing haplotype pair        = 15.5393 seconds
        screening for top haplotypes   = 0.311708 seconds per thread
        BLAS3 mul! to get M and N      = 1.43476 seconds per thread
        haplopair search               = 0.0615894 seconds per thread
        finding redundant happairs     = 0.00335906 seconds per thread
    Phasing by win-win intersection = 0.0462821 seconds
    Imputation                      = 3.43169 seconds

 27.590531 seconds (73.82 M allocations: 5.933 GiB, 3.72% gc time)


0.00014246315848121012

In [21]:
# haplopair_thin, keep = 100 (1 thread)
# NOTE(review): this cell was labelled "scale freq by 1-p", but
# `thinning_scale_allelefreq=false` below leaves that option off; set it to
# `true` if the scaled variant was the one intended.
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    dynamic_programming=false, thinning_factor=100, 
    thinning_scale_allelefreq=false, max_haplotypes = 100);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
# The truth matrix is not reloaded here; the comparison below uses whatever an
# earlier cell left in `X_complete`.
# X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.86957 seconds
    Computing haplotype pair        = 3.00972 seconds
        screening for top haplotypes   = 0.447713 seconds per thread
        BLAS3 mul! to get M and N      = 2.18678 seconds per thread
        haplopair search               = 0.0895825 seconds per thread
        finding redundant happairs     = 0.00679355 seconds per thread
    Phasing by win-win intersection = 0.0390542 seconds
    Imputation                      = 2.91976 seconds

 13.861564 seconds (73.82 M allocations: 5.933 GiB, 6.70% gc time)


0.00014246315848121012

# MendelImpute error

In [3]:
# Best-pair run labelled "1 thread" (the thread count is not set in this cell).
width = 512
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
Random.seed!(2020)
@time ph = phase(tgtfile, reffile; outfile = outfile, width = width);

# Score: fraction of imputed entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.25091 seconds
    Computing haplotype pair        = 3.38048 seconds
        BLAS3 mul! to get M and N      = 0.088452 seconds per thread
        haplopair search               = 2.40554 seconds per thread
        finding redundant happairs     = 0.248962 seconds per thread
    Phasing by dynamic programming  = 8.26324 seconds
    Imputation                      = 3.08873 seconds

 22.118953 seconds (73.75 M allocations: 6.635 GiB, 4.19% gc time)


0.00012268434688135499

In [3]:
# Best-pair run labelled "8 thread" (the thread count is not set in this cell).
width = 512
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
Random.seed!(2020)
@time ph = phase(tgtfile, reffile; outfile = outfile, width = width);

# Score: fraction of imputed entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.21779 seconds
    Computing haplotype pair        = 0.742479 seconds
        BLAS3 mul! to get M and N      = 0.0258475 seconds per thread
        haplopair search               = 0.5315 seconds per thread
        finding redundant happairs     = 0.0493769 seconds per thread
    Phasing by dynamic programming  = 1.24009 seconds
    Imputation                      = 3.18624 seconds

 12.536885 seconds (73.76 M allocations: 6.679 GiB, 8.31% gc time)


0.00012268434688135499

# Rescreen

In [18]:
# Rescreening run (labelled "keep top matching happairs", 1 thread):
# dynamic programming off, `rescreen` on.
width = 512
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    rescreen = true,
);

# Score: fraction of imputed entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mPhasing chunk 1/1...100%|███████████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.03889 seconds
    Computing haplotype pair        = 6.09789 seconds
        BLAS3 mul! to get M and N      = 0.0231243 seconds per thread
        haplopair search               = 0.421832 seconds per thread
        min least sq on observed data  = 0.177795 seconds per thread
        finding redundant happairs     = 0.00342426 seconds per thread
    Phasing by win-win intersection = 0.112965 seconds
    Imputation                      = 2.82866 seconds

 16.094751 seconds (74.54 M allocations: 6.043 GiB, 4.83% gc time)


0.00013842382371785945

In [3]:
# Keep a list of top haplotype pairs (1 thread)
# NOTE(review): unlike the other phasing cells, this one uses bare file names
# (no ./compare1/ prefix) and width = 500 — confirm the working directory and
# that haplo_ref.jlso matches this width before re-running.
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 500
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:10[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:25[39m


Data import time                    = 6.6587 seconds
Computing haplotype pair time       = 10.2962 seconds
Phasing by dynamic programming time = 25.9828 seconds
Imputing time                       = 3.6637 seconds
 46.601150 seconds (73.68 M allocations: 7.879 GiB, 2.17% gc time)


0.00010181909351756413

In [5]:
# Keep a list of top haplotype pairs (8 thread)
# NOTE(review): unlike the other phasing cells, this one uses bare file names
# (no ./compare1/ prefix) and width = 500 — confirm the working directory and
# that haplo_ref.jlso matches this width before re-running.
Random.seed!(2020)
tgtfile = "target_masked.vcf.gz"
reffile = "haplo_ref.jlso"
outfile = "imputed_target.vcf.gz"
width   = 500
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_mendel)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Data import time                    = 7.00521 seconds
Computing haplotype pair time       = 2.14302 seconds
Phasing by dynamic programming time = 4.11389 seconds
Imputing time                       = 3.09487 seconds
 16.357013 seconds (73.69 M allocations: 7.939 GiB, 5.88% gc time)


0.00010181909351756413

# MendelImpute with intersecting haplotype sets

In [3]:
# Best pair only, dynamic programming switched off (run labelled "8 thread").
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width = 512
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
);

# Read the result back, delete the temporary output file, and report the
# fraction of entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
rm(outfile; force = true)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.89825 seconds
    Computing haplotype pair        = 3.64744 seconds
        BLAS3 mul! to get M and N      = 0.0259534 seconds per thread
        haplopair search               = 0.309124 seconds per thread
        finding redundant happairs     = 0.00785295 seconds per thread
    Phasing by win-win intersection = 0.108992 seconds
    Imputation                      = 2.93534 seconds

 14.608200 seconds (73.80 M allocations: 5.957 GiB, 6.97% gc time)


0.00014143243167952753

In [3]:
# Same as the unscaled intersection run, but with `scale_allelefreq` enabled
# (run labelled "8 thread").
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width = 512
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    scale_allelefreq = true,
);

# Read the result back, delete the temporary output file, and report the
# fraction of entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
rm(outfile; force = true)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mPhasing chunk 1/1...100%|███████████████████████████████| Time: 0:00:05[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.22951 seconds
    Computing haplotype pair        = 5.139 seconds
        BLAS3 mul! to get M and N      = 0.0250747 seconds per thread
        haplopair search               = 0.480302 seconds per thread
        finding redundant happairs     = 0.00330738 seconds per thread
    Phasing by win-win intersection = 0.15647 seconds
    Imputation                      = 3.23542 seconds

 15.782315 seconds (74.37 M allocations: 5.987 GiB, 5.11% gc time)


0.00014583391369752348

# Screening flanking windows

In [25]:
# keep best pair only (8 thread)
# NOTE(review): the `phase` call below passes only `dynamic_programming = false`;
# nothing in this cell switches flanking-window screening on, so that behaviour
# presumably comes from the package version in use — confirm.
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    dynamic_programming = false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# the imputed file is only needed for scoring, so delete it once it is loaded
rm(outfile, force=true)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.93589 seconds
    Computing haplotype pair        = 0.74407 seconds
        BLAS3 mul! to get M and N      = 0.0491758 seconds per thread
        haplopair search               = 0.528538 seconds per thread
        finding redundant happairs     = 0.00354468 seconds per thread
    Phasing by win-win intersection = 0.112905 seconds
    Imputation                      = 2.91421 seconds

 12.239887 seconds (73.90 M allocations: 5.961 GiB, 6.79% gc time)


0.00013842382371785945

# Try Lasso

In [10]:
# Lasso run with `lasso = 1` and `max_haplotypes = 100`, dynamic programming
# off (run labelled "8 thread").
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width = 512
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    lasso = 1,
    max_haplotypes = 100,
);

# Read the result back, delete the temporary output file, and report the
# fraction of entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
rm(outfile; force = true)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.20137 seconds
    Computing haplotype pair        = 0.346325 seconds
        BLAS3 mul! to get M and N      = 0.0768071 seconds per thread
        haplopair search               = 0.00933216 seconds per thread
        finding redundant happairs     = 0.00774728 seconds per thread
    Phasing by win-win intersection = 0.0452878 seconds
    Imputation                      = 2.85384 seconds

 10.471833 seconds (74.08 M allocations: 5.981 GiB, 7.35% gc time)


0.0008021004540769423

In [12]:
# Lasso run with `lasso = 20` and `max_haplotypes = 100`, dynamic programming
# off (run labelled "8 thread").
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width = 512
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
    lasso = 20,
    max_haplotypes = 100,
);

# Read the result back, delete the temporary output file, and report the
# fraction of entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
rm(outfile; force = true)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.35286 seconds
    Computing haplotype pair        = 0.481014 seconds
        BLAS3 mul! to get M and N      = 0.0624651 seconds per thread
        haplopair search               = 0.162875 seconds per thread
        finding redundant happairs     = 0.00697969 seconds per thread
    Phasing by win-win intersection = 0.0351119 seconds
    Imputation                      = 2.81127 seconds

 10.706639 seconds (73.83 M allocations: 5.958 GiB, 7.33% gc time)


0.00014817394211215422

In [10]:
# scale allele freq
# Lasso run (`lasso = 1`, `max_haplotypes = 100`) with `scale_allelefreq=true`.
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    dynamic_programming=false, lasso = 1, max_haplotypes = 100, scale_allelefreq=true);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
# The truth matrix is not reloaded here; the comparison below uses whatever an
# earlier cell left in `X_complete`.
# X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# the imputed file is only needed for scoring, so delete it once it is loaded
rm(outfile, force=true)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.9007 seconds
    Computing haplotype pair        = 0.304369 seconds
        BLAS3 mul! to get M and N      = 0.0960437 seconds per thread
        haplopair search               = 0.0104537 seconds per thread
        finding redundant happairs     = 0.00717207 seconds per thread
    Phasing by win-win intersection = 0.059835 seconds
    Imputation                      = 3.19103 seconds

 11.472878 seconds (74.08 M allocations: 5.984 GiB, 7.27% gc time)


0.0008021004540769423

In [9]:
# scale allele freq
# Lasso run (`lasso = 20`, `max_haplotypes = 100`) with `scale_allelefreq=true`.
Random.seed!(2020)
width   = 512
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width, 
    dynamic_programming=false, lasso = 20, max_haplotypes = 100, scale_allelefreq=true);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
# The truth matrix is not reloaded here; the comparison below uses whatever an
# earlier cell left in `X_complete`.
# X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# the imputed file is only needed for scoring, so delete it once it is loaded
rm(outfile, force=true)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:07[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.10954 seconds
    Computing haplotype pair        = 0.847077 seconds
        BLAS3 mul! to get M and N      = 0.0688667 seconds per thread
        haplopair search               = 0.166925 seconds per thread
        finding redundant happairs     = 0.00591267 seconds per thread
    Phasing by win-win intersection = 0.05669 seconds
    Imputation                      = 3.45064 seconds

 12.490121 seconds (74.98 M allocations: 6.017 GiB, 9.37% gc time)


0.00014817394211215422

# Chunking

In [5]:
# sets num_windows_per_chunks = 10
# NOTE(review): no chunk-size argument is passed to `phase` below, so the
# setting named above is presumably made outside this cell — confirm.
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time ph = phase(tgtfile, reffile, outfile = outfile, width = width,
    dynamic_programming = false);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel)
# fraction of genotype entries that differ from `X_complete`
error_rate = sum(X_mendel .!= X_complete) / n / p

┌ Info: Importing reference haplotype data...
└ @ MendelImpute /Users/biona001/.julia/dev/MendelImpute/src/phasing.jl:33
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:09[39m


chunk = 1, windows = 10, w_start = 1, w_end = 10
chunk = 2, windows = 10, w_start = 11, w_end = 20
chunk = 3, windows = 10, w_start = 21, w_end = 30
chunk = 4, windows = 10, w_start = 31, w_end = 40
chunk = 5, windows = 10, w_start = 41, w_end = 50
chunk = 6, windows = 10, w_start = 51, w_end = 60
chunk = 7, windows = 10, w_start = 61, w_end = 70
[1, 513, 1025, 1537, 2049, 2561, 3073, 3585, 4097, 4609, 5121, 5633, 6145, 6657, 7169, 7681, 8193, 8705, 9217, 9729, 10241, 10753, 11265, 11777, 12289, 12801, 13313, 13825, 14337, 14849, 15361, 15873, 16385, 16897, 17409, 17921, 18433, 18945, 19457, 19969, 20481, 20993, 21505, 22017, 22017, 22529, 23041, 23553, 24065, 24577, 25089, 25601, 26113, 26625, 27137, 27584, 27649, 28161, 28673, 29185, 29244, 29697, 30209, 30721, 31233, 31745, 32257, 32769, 33281, 33793, 34305, 34817, 35329]
Total windows = windows, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 9.70717 seconds
    Computing haplotype pai

0.00015243613672451738

In [4]:
# Default num_windows_per_chunks (labelled "only 1 chunk"), dynamic
# programming off.
width = 512
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = false,
);

# Score: fraction of imputed entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

┌ Info: Importing reference haplotype data...
└ @ MendelImpute /Users/biona001/.julia/dev/MendelImpute/src/phasing.jl:33
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


chunk = 1, windows = 70, w_start = 1, w_end = 70
Total windows = windows, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 7.28044 seconds
    Computing haplotype pair        = 0.694511 seconds
        BLAS3 mul! to get M and N      = 0.0260292 seconds per thread
        haplopair search               = 0.528569 seconds per thread
        finding redundant happairs     = 0.00670662 seconds per thread
    Phasing by win-win intersection = 0.034327 seconds
    Imputation                      = 2.91026 seconds

 11.092406 seconds (74.38 M allocations: 6.160 GiB, 8.84% gc time)


0.00014143243167952753

In [5]:
# Dynamic programming on, default chunking (labelled "only 1 chunk").
width = 512
tgtfile, reffile = "./compare1/target_masked.vcf.gz", "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
Random.seed!(2020)
@time ph = phase(
    tgtfile, reffile;
    outfile = outfile,
    width = width,
    dynamic_programming = true,
);

# Score: fraction of imputed entries that differ from the complete target.
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_mendel, 1), size(X_mendel, 2)
error_rate = sum(X_mendel .!= X_complete) / n / p

┌ Info: Importing reference haplotype data...
└ @ MendelImpute /Users/biona001/.julia/dev/MendelImpute/src/phasing.jl:33
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


chunk = 1, windows = 70, w_start = 1, w_end = 70
Total windows = windows, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 6.97036 seconds
    Computing haplotype pair        = 1.04969 seconds
        BLAS3 mul! to get M and N      = 0.0299686 seconds per thread
        haplopair search               = 0.544858 seconds per thread
        finding redundant happairs     = 0.0502279 seconds per thread
    Phasing by dynamic programming  = 1.40822 seconds
    Imputation                      = 2.92008 seconds

 12.534480 seconds (75.24 M allocations: 6.741 GiB, 7.25% gc time)


0.00012268434688135499

# Beagle 5.1 Error

In [5]:
# convert to bref3 (run in terminal)
java -jar bref3.18May20.d20.jar haplo_ref.vcf.gz > haplo_ref.bref3 

usage:
  java -jar bref3.18May20.d20.jar help

  java -jar bref3.18May20.d20.jar [vcf] <nseq>  > [bref3]

  cat   [vcf]   | java -jar bref3.18May20.d20.jar <nseq>  > [bref3]

where
  [bref3]  = the output bref3 file
  [vcf]    = A VCF file with phased, non-missing genotype data.  If the
             file is gzip-compressed, its filename must end in ".gz"
             and "cat" must be replaced with "zcat"
  <nseq>   = optional argument for maximum number of unique sequences
             in a bref3 block. If there are N reference samples,
             the default value is: <max-seq>=2^(2*log10(N) + 1)



In [6]:
# run beagle 5 (1 thread)
# Launches the Beagle jar as an external process with `nthreads=1`. All file
# arguments are bare names, i.e. relative to the current working directory.
run(`java -jar beagle.18May20.d20.jar gt=target_masked.vcf.gz ref=haplo_ref.bref3 out=beagle.result nthreads=1`)

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 09:01 PM PDT on 29 Jun 2020

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=target_masked.vcf.gz
  ref=haplo_ref.bref3
  out=beagle.result
  nthreads=1

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       1,000
Study samples:           1,000

Window 1 (1:36-4999683)
Reference markers:      35,897
Study markers:          35,897

Burnin  iteration 1:           2 minutes 39 seconds
Burnin  iteration 2:           2 minutes 56 seconds
Burnin  iteration 3:           2 minutes 59 seconds
Burnin  iteration 4:           2 minutes 56 seconds
Burnin  iteration 5:           2 minutes 58 seconds
Burnin  iteration 6:           4 minutes 33 seconds

Phasing iteration 1:           5 minutes 48 seconds
Phasing iteration 2:           2 minutes 50 seconds
Phasing iteration 3:           2 minutes 52 seconds
Pha

MethodError: MethodError: no method matching convert_gt(::Type{Float32}, ::String; as_minorallele=false)
Closest candidates are:
  convert_gt(::Type{T}, ::AbstractString; model, impute, center, scale, trans, msg, save_snp_info) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:232 got unsupported keyword argument "as_minorallele"
  convert_gt(::Type{T}, !Matched::Tuple{Bool,Bool}) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:10 got unsupported keyword argument "as_minorallele"
  convert_gt(::Type{T}, !Matched::Tuple{Bool,Bool}, !Matched::Symbol) where T<:Real at /Users/biona001/.julia/packages/VCFTools/fTfDS/src/convert.jl:10 got unsupported keyword argument "as_minorallele"

In [8]:
# Beagle 5.1 error rate: fraction of imputed entries that differ from the
# complete target genotypes.
X_beagle = convert_gt(Float32, "beagle.result.vcf.gz")
X_complete = convert_gt(Float32, "target.vcf.gz")
n, p = size(X_complete, 1), size(X_complete, 2)
error_rate = sum(X_beagle .!= X_complete) / n / p

2.0698108477031506e-5

In [2]:
# Run Beagle 5.1 as an external process with `nthreads=8`.
run(`java -jar beagle.18May20.d20.jar gt=./compare1/target_masked.vcf.gz ref=./compare1/haplo_ref.bref3 out=./compare1/beagle.result nthreads=8`)

# Error rate: fraction of Beagle's imputed entries that differ from the
# complete target genotypes.
X_beagle = convert_gt(Float32, "./compare1/beagle.result.vcf.gz")
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n, p = size(X_complete, 1), size(X_complete, 2)
error_rate = sum(X_beagle .!= X_complete) / n / p

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 07:33 PM PDT on 30 Jun 2020

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=./compare1/target_masked.vcf.gz
  ref=./compare1/haplo_ref.bref3
  out=./compare1/beagle.result
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       1,000
Study samples:           1,000

Window 1 (1:36-4999683)
Reference markers:      35,897
Study markers:          35,897

Burnin  iteration 1:           26 seconds
Burnin  iteration 2:           29 seconds
Burnin  iteration 3:           30 seconds
Burnin  iteration 4:           45 seconds
Burnin  iteration 5:           34 seconds
Burnin  iteration 6:           51 seconds

Phasing iteration 1:           1 minute 10 seconds
Phasing iteration 2:           34 seconds
Phasing iteration 3:           35 seconds
Phasing iteration 4:           33 seconds
Phasing i

2.0948825807170516e-5

# Minimac4 error

Need to first convert reference vcf file to m3vcf using minimac3 (on Hoffman)

```Julia
# Absolute path to the Minimac3 executable
minimac3 = "/u/home/b/biona001/haplotype_comparisons/minimac3/Minimac3/bin/Minimac3"
# Process haplo_ref.vcf.gz as the reference panel; output files use the prefix
# `haplo_ref`. `@time` reports how long the external command takes.
@time run(`$minimac3 --refHaps haplo_ref.vcf.gz --processReference --prefix haplo_ref`)
```

In [17]:
# Impute the masked target genotypes with Minimac4, using the m3vcf reference panel
minimac4 = "/Users/biona001/Benjamin_Folder/UCLA/research/softwares/Minimac4/build/minimac4"
run(
    `$minimac4 --refHaps haplo_ref.m3vcf.gz --haps target_masked.vcf.gz --prefix minimac4.result`
)

# Minimac4 error rate. NOTE: `X_complete`, `n` and `p` are not defined in this
# cell; they must already exist from an earlier cell.
X_minimac = convert_gt(Float32, "minimac4.result.dose.vcf.gz"; as_minorallele = false)
error_rate = sum(X_complete .!= X_minimac) / n / p



 -------------------------------------------------------------------------------- 
          Minimac4 - Fast Imputation Based on State Space Reduction HMM
 --------------------------------------------------------------------------------
           (c) 2014 - Sayantan Das, Christian Fuchsberger, David Hinds
                             Mary Kate Wing, Goncalo Abecasis 

 Version: 1.0.2;
 Built: Mon Sep 30 11:52:22 PDT 2019 by biona001

 Command Line Options: 
       Reference Haplotypes : --refHaps [haplo_ref.m3vcf.gz], --passOnly,
                              --rsid, --referenceEstimates [ON],
                              --mapFile [docs/geneticMapFile.b38.map.txt.gz]
          Target Haplotypes : --haps [target_masked.vcf.gz]
          Output Parameters : --prefix [minimac4.result], --estimate,
                              --nobgzip, --vcfBuffer [200], --format [GT,DS],
                              --allTypedSites, --meta, --memUsage
        Chunking Parameters : --ChunkLengthMb

0.00018399866284090594

# BLAS 2 

In [5]:
# haplopair_thin (does not account for allele frequency), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(
    tgtfile,
    reffile;
    outfile = outfile,
    width = width,
    thinning_factor = 100,
    thinning_scale_allelefreq = false
);

# Load the imputed genotypes and compare them with the complete (unmasked) ones
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_complete .!= X_mendel) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 9.06753 seconds
    Computing haplotype pair        = 4.12708 seconds
        computing dist(X, H)           = 0.102362 seconds per thread
        BLAS3 mul! to get M and N      = 3.08088 seconds per thread
        haplopair search               = 0.115584 seconds per thread
        finding redundant happairs     = 0.077741 seconds per thread
    Phasing by dynamic programming  = 2.44924 seconds
    Imputation                      = 5.92068 seconds

 21.565006 seconds (73.97 M allocations: 6.675 GiB, 4.47% gc time)


0.00012427222330556873

# BLAS 3

In [4]:
# haplopair_thin (does not account for allele frequency), keep = 100 (1 thread)
Random.seed!(2020)
tgtfile = "./compare1/target_masked.vcf.gz"
reffile = "./compare1/haplo_ref.jlso"
outfile = "./compare1/imputed_target.vcf.gz"
width   = 512
@time hs, ph = phase(
    tgtfile,
    reffile;
    outfile = outfile,
    width = width,
    thinning_factor = 100,
    thinning_scale_allelefreq = false
);

# Load the imputed genotypes and compare them with the complete (unmasked) ones
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "./compare1/target.vcf.gz")
n = size(X_mendel, 1)
p = size(X_mendel, 2)
error_rate = sum(X_complete .!= X_mendel) / n / p

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Total windows = 70, averaging ~ 324 unique haplotypes per window.

Timings: 
    Data import                     = 8.99109 seconds
    Computing haplotype pair        = 4.14255 seconds
        computing dist(X, H)           = 0.103014 seconds per thread
        BLAS3 mul! to get M and N      = 3.12272 seconds per thread
        haplopair search               = 0.121277 seconds per thread
        finding redundant happairs     = 0.0745271 seconds per thread
    Phasing by dynamic programming  = 2.65593 seconds
    Imputation                      = 6.88047 seconds

 22.670608 seconds (73.97 M allocations: 6.675 GiB, 4.33% gc time)


0.00012427222330556873