# Test imputation on untyped SNPs chrom 18

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


### Memory requirement

**Prephasing step:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000$ (max haplotypes per win) $* 16 bytes$ of RAM

## Generate subset of markers for prephasing

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Build every benchmark file for chromosome 18 in the current directory:
#   1. drop duplicate markers from the raw VCF,
#   2. pick 100 samples at random as the imputation target (all SNPs kept),
#   3. write the remaining samples as the reference panel,
#   4. for each MAF cutoff, write a target file restricted to SNPs whose MAF
#      exceeds the cutoff, plus an unphased copy in which every genotype is
#      masked independently with probability 0.001.
# Takes no arguments; every step is timed with `@time`.
function filter_and_mask()
    for chr in [18]
        raw_vcf = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
        unique_vcf = "chr$chr.uniqueSNPs.vcf.gz"

        # keep every record that is not flagged as a duplicate marker
        keep_records = .!find_duplicate_marker(raw_vcf)
        @time VCFTools.filter(raw_vcf, keep_records, 1:nsamples(raw_vcf), des = unique_vcf)

        # summary statistics of the de-duplicated file (only counts and MAF are used)
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats(unique_vcf)

        # choose n target samples: mark the first n, then shuffle the mask
        n = 100
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)

        # target panel: the chosen samples with all SNPs
        @time VCFTools.filter(unique_vcf, 1:total_snps, sample_idx,
            des = "target.chr$chr.full.vcf.gz", allow_multiallelic = false)

        # reference panel: everyone who is not a target sample
        @time VCFTools.filter(unique_vcf, 1:total_snps, .!sample_idx,
            des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic = false)

        for maf in [0.0005, 0.001, 0.01, 0.1, 0.45]
            typed_vcf = "target.chr$chr.typedOnly.maf$maf.vcf.gz"
            masked_vcf = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"

            # "typed" SNPs are those whose MAF is strictly above the cutoff
            typed = findall(freq -> freq > maf, maf_by_record)
            p = length(typed)
            record_idx = falses(total_snps)
            record_idx[typed] .= true
            @time VCFTools.filter(unique_vcf, record_idx, sample_idx,
                des = typed_vcf, allow_multiallelic = false)

            # unphase the typed file and mask each entry with probability 0.001;
            # one rand() draw per entry, samples in the outer loop
            missingprop = 0.001
            masks = falses(p, n)
            for j in 1:n, i in 1:p
                if rand() < missingprop
                    masks[i, j] = true
                end
            end
            @time mask_gt(typed_vcf, masks, des = masked_vcf, unphase = true)
        end
    end
end
Random.seed!(2020)
@time filter_and_mask()

752.076342 seconds (6.51 G allocations: 615.353 GiB, 7.01% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:08:17[39m


549.100648 seconds (6.86 G allocations: 650.723 GiB, 10.18% gc time)
1139.017251 seconds (16.68 G allocations: 1.249 TiB, 10.94% gc time)
599.893049 seconds (6.86 G allocations: 650.911 GiB, 10.61% gc time)
 36.158916 seconds (267.39 M allocations: 27.452 GiB, 6.96% gc time)
608.676819 seconds (6.83 G allocations: 648.780 GiB, 10.30% gc time)
 35.610763 seconds (249.89 M allocations: 25.655 GiB, 6.86% gc time)
560.687780 seconds (6.61 G allocations: 626.620 GiB, 10.27% gc time)
 16.352906 seconds (119.38 M allocations: 12.252 GiB, 6.40% gc time)
520.047088 seconds (6.51 G allocations: 615.447 GiB, 10.21% gc time)
  8.014085 seconds (53.29 M allocations: 5.482 GiB, 5.36% gc time)
516.890704 seconds (6.43 G allocations: 607.361 GiB, 10.60% gc time)
  0.831218 seconds (5.35 M allocations: 563.018 MiB, 5.66% gc time)
6410.931663 seconds (76.92 G allocations: 6.823 TiB, 10.06% gc time)


# Try compressing with JLD2

In [4]:
# Window width in SNPs. Per the original note, at 2000 SNPs per window almost
# all haplotypes within a window are unique.
width = 2000
reffile = "ref.chr18.excludeTarget.vcf.gz"
outfile = "ref.chr18.excludeTarget.jld2"
# Time the compression of the reference VCF; presumably the result is written
# to `outfile` (its size is listed in a later cell).
@time compress_haplotypes(reffile, outfile, width);

[32mimporting vcf data...100%|██████████████████████████████| Time: 0:04:24[39m


310.253834 seconds (4.17 G allocations: 309.774 GiB, 17.72% gc time)


In [5]:
# Time how long it takes to read `compressed_Hunique` back from the JLD2 file.
@time @load "ref.chr18.excludeTarget.jld2" compressed_Hunique;

  4.488734 seconds (19.12 M allocations: 1.624 GiB, 16.99% gc time)


In [6]:
;ls -al ref.chr18.excludeTarget.jld2

-rw-r--r--  1 biona001  staff  754339754 May 28 10:06 ref.chr18.excludeTarget.jld2


In [7]:
;ls -al ref.chr18.excludeTarget.vcf.gz

-rw-r--r--@ 1 biona001  staff  247072174 May 19 23:43 ref.chr18.excludeTarget.vcf.gz


# Try jlso compression

In [5]:
# Window width in SNPs. Per the original note, at 2000 SNPs per window almost
# all haplotypes within a window are unique.
width = 2000
reffile = "ref.chr18.excludeTarget.vcf.gz"
# The window width is embedded in the output name, so several widths can coexist.
outfile = "ref.$width.chr18.excludeTarget.jlso"
# Time the compression of the reference VCF with the `.jlso` output name.
@time compress_haplotypes(reffile, outfile, width);

[32mimporting vcf data...100%|██████████████████████████████| Time: 0:04:14[39m


332.829952 seconds (4.13 G allocations: 309.558 GiB, 17.26% gc time)


In [12]:
;ls -al ref.1000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  49410994 Jun  1 22:24 ref.1000.chr18.excludeTarget.jlso


In [13]:
;ls -al ref.2000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  56525290 Jun  1 22:25 ref.2000.chr18.excludeTarget.jlso


In [14]:
;ls -al ref.4000.chr18.excludeTarget.jlso

-rw-r--r--  1 biona001  staff  65786356 Jun  1 22:26 ref.4000.chr18.excludeTarget.jlso


In [15]:
;ls -al ref.chr18.excludeTarget.vcf.gz

-rw-r--r--@ 1 biona001  staff  247072174 May 19 23:43 ref.chr18.excludeTarget.vcf.gz


# Number of unique haplotypes in each window

In [2]:
# Load the compressed reference panel and the masked target genotypes, then
# allocate the per-window containers used by the following cells.
chr = 18
maf = 0.1
width = 2000

tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.$width.chr18.excludeTarget.jlso"

# the compressed panel is stored under the key :compressed_Hunique
loaded = JLSO.load(reffile)
compressed_Hunique = loaded[:compressed_Hunique]

# genotypes plus per-SNP info; with trans=true the samples are presumably the
# columns of X (see `people = size(X, 2)` below) -- TODO confirm
X, X_sampleID, X_chr, X_pos, X_ids, X_ref, X_alt = VCFTools.convert_gt(UInt8, tgtfile, trans=true, save_snp_info=true, msg = "Importing genotype file...")

people = size(X, 2)
ref_snps = compressed_Hunique.snps
# number of whole windows of `width` SNPs (rounded down)
windows = floor(Int, ref_snps / width)
# one empty vector of (Int, Int) pairs for every person and every window
redundant_haplotypes = [[Tuple{Int, Int}[] for i in 1:windows] for j in 1:people]
# uninitialised here; entry w is filled when window w is processed
typed_snps = Vector{Vector{Int}}(undef, windows);

In [3]:
# print number of unique haplotypes in each window
# (uses X, X_pos, compressed_Hunique, windows and typed_snps from the setup cell)
avg_haps = 0
for w in 1:windows
    # reference SNP positions that belong to window w
    cur_range = compressed_Hunique.CWrange[w]
    Hw_pos = compressed_Hunique.pos[cur_range]
    XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
    XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
    # rows of X whose position was found in this window
    Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
    # matching rows of the window's unique-haplotype matrix
    Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
    typed_snps[w] = XtoH_rm_nothing # save typed snps index for current window
    # number of columns of the aligned matrix = haplotype count reported for this window
    print(size(Hw_aligned, 2), ", ")
    avg_haps += size(Hw_aligned, 2)
end
# mean of the printed counts over all windows
avg_haps / windows

4801, 4766, 4807, 4808, 4803, 4784, 4750, 4376, 4737, 4772, 4742, 4799, 4746, 4491, 4480, 4521, 4672, 4730, 4350, 4625, 4484, 4733, 4752, 4626, 4639, 4387, 4673, 4713, 4397, 4237, 4329, 4062, 4673, 4729, 4507, 3725, 4606, 4480, 3741, 4626, 4611, 4596, 4658, 4757, 4721, 4584, 4075, 4407, 3971, 3238, 4371, 4602, 4452, 4548, 4562, 4609, 3855, 3719, 4342, 4730, 4689, 4312, 4772, 4269, 4580, 4568, 4482, 4217, 4555, 4589, 4226, 3821, 4786, 4457, 4729, 4108, 4747, 4169, 4301, 4434, 4386, 3537, 4608, 4442, 4685, 4233, 3522, 4629, 4402, 4499, 4424, 4745, 4389, 4398, 4701, 4508, 4796, 4576, 4714, 4474, 4732, 4495, 4662, 4490, 3858, 4497, 4163, 4310, 4421, 3406, 4303, 4552, 4301, 4451, 3147, 4138, 4107, 4562, 4240, 3646, 4330, 3613, 4034, 4523, 4643, 4501, 4238, 4392, 4253, 4172, 4355, 4403, 4598, 3819, 3657, 4238, 2924, 3800, 3581, 4145, 4104, 3621, 3349, 4451, 3641, 3998, 3660, 4342, 4558, 3749, 3912, 4276, 4001, 4329, 4330, 3194, 4346, 4638, 3189, 3472, 4011, 3844, 3951, 3263, 3258, 3463, 2550

4205.532863849765

In [5]:
# save only best happair
# Time the haplotype-pair search on a single window (w = 2), using the same
# position alignment as the per-window loop.
w = 2
cur_range = compressed_Hunique.CWrange[w]
Hw_pos = compressed_Hunique.pos[cur_range]
XtoH_idx = indexin(X_pos, Hw_pos) # X_pos[i] == Hw_pos[XtoH_idx[i]]
XtoH_rm_nothing = Base.filter(!isnothing, XtoH_idx) # delete snps not in ref panel
Xw_aligned = X[findall(!isnothing, XtoH_idx), :]
Hw_aligned = compressed_Hunique[w].uniqueH[XtoH_rm_nothing, :]
typed_snps[w] = XtoH_rm_nothing

# time the pair search, then the update of `redundant_haplotypes` for window w
@time happairs, hapscore = haplopair(Xw_aligned, Hw_aligned)
@time compute_redundant_haplotypes!(redundant_haplotypes, compressed_Hunique, happairs, w)

multiplication took 0.125672402 seconds
haplopair      took 0.817886415 seconds
last step      took 4.1871e-5 seconds
initXfloat! time = 0.000196227
haplopair!  time = 0.943683277
  1.183185 seconds (79.30 k allocations: 101.379 MiB, 12.07% gc time)
  0.000459 seconds (11 allocations: 976 bytes)


# MendelImpute error rate and speed

In [2]:
# number of threads available to this Julia session
Threads.nthreads()

8

In [4]:
# 8 threads, searching single bkpts, deleting suboptimal pairs in dp, skip windows with <50 typed snps
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# phase and impute the masked target against the compressed reference (timed)
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# compare the imputed genotypes with the unmasked, full-SNP target panel
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
# error rate = fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:03:03[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Data import time                    = 10.8066 seconds
Computing haplotype pair time       = 183.787 seconds
Phasing by dynamic programming time = 1.65179 seconds
Imputing time                       = 9.13026 seconds
229.475853 seconds (141.20 M allocations: 42.904 GiB, 3.60% gc time)
error overall = 0.005566501134175149 



In [3]:
# 8 threads, save only best happair, searching single bkpts
# deleting suboptimal pairs in dp, skip windows with <50 typed snps
# NOTE(review): the `phase` call below takes the same arguments as in the other
# maf = 0.1 / width = 2000 cell; the "save only best happair" difference is
# presumably in the MendelImpute source that was loaded at the time -- confirm.
Random.seed!(2020)
chr = 18
maf = 0.1
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# phase and impute the masked target against the compressed reference (timed)
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width)

# compare the imputed genotypes with the unmasked, full-SNP target panel
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
# error rate = fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:42[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:06[39m


Data import time                    = 10.23 seconds
Computing haplotype pair time       = 102.337 seconds
Phasing by dynamic programming time = 1.22405 seconds
Imputing time                       = 8.44946 seconds
144.593542 seconds (140.29 M allocations: 42.697 GiB, 5.00% gc time)
error overall = 0.005999012434875828 



In [2]:
# impute only typed snps
Random.seed!(2020)
chr = 18
maf = 0.45
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.$width.chr18.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# impute=false here, unlike the other phase cells
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=false, width=width)

# load truth and output together with their SNP positions; with trans=true the
# SNPs are presumably the rows (see `p, n = size(...)` below) -- TODO confirm
X_complete, _, _, X_complete_pos, _, _, _ = convert_gt(Float64, "target.chr18.full.vcf.gz", trans=true, save_snp_info=true)
X_typedonly, _, _, X_typedonly_pos, _, _, _ = convert_gt(Float64, outfile, trans=true, save_snp_info=true)
# for every SNP in the output, the index of the SNP with the same position in the truth
typed_to_complete_idx = indexin(X_typedonly_pos, X_complete_pos)
# truth restricted to the SNPs that appear in the output file
X_complete_typedonly = X_complete[typed_to_complete_idx, :]
p, n = size(X_typedonly)
# error rate over the output SNPs only
println("error overall = $(sum(X_typedonly .!= X_complete_typedonly) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:57[39m


Data import time                    = 7.74952 seconds
Computing haplotype pair time       = 57.2797 seconds
Phasing by dynamic programming time = 1.90273 seconds
Imputing time                       = 0.57671 seconds
 89.702625 seconds (105.16 M allocations: 14.344 GiB, 3.63% gc time)
error overall = 0.0007017132128608308 



In [5]:
# searching single bkpts, deleting suboptimal pairs in dp, skip windows with <50 typed snps
Random.seed!(2020)
chr = 18
maf = 0.45
width = 2000
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# NOTE(review): this file name has no "$width" component, unlike the
# "ref.$width.chr18.excludeTarget.jlso" name used by the other phase cells --
# confirm which compressed panel (and which window width) this file holds.
reffile = "ref.chr$chr.excludeTarget.jlso"
outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
# phase and impute, passing the minimum number of typed SNPs explicitly
@time ph, hs = phase(tgtfile, reffile, outfile=outfile, impute=true, width=width, 
    min_typed_snps=50)

# compare the imputed genotypes with the unmasked, full-SNP target panel
X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
n, p = size(X_complete)
X_mendel = convert_gt(UInt8, outfile)
# error rate = fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:55[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 6.68637 seconds
Computing haplotype pair time       = 55.7005 seconds
Phasing by dynamic programming time = 1.49851 seconds
Imputing time                       = 13.3049 seconds
 77.173139 seconds (20.27 M allocations: 10.581 GiB, 3.04% gc time)
error overall = 0.0944998604272568 



In [17]:
# Compare window widths on the maf = 0.1 target.
# Package settings noted for this run: deleting suboptimal pairs in dp,
# not searching bkpts, skip windows with <5% typed snps.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, phase and impute the masked chromosome 18 target with
# `fast_method=false`, then print the fraction of genotype entries that differ
# from the unmasked target panel. Takes no arguments.
function run()
    chr = 18
    maf = 0.1
    # the target file does not depend on the window width
    tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"

    # ground truth: the target samples with every SNP observed
    X_complete = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    n, p = size(X_complete)

    for width in (1000, 2000, 4000)
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        # each width has its own compressed reference panel and output file
        reffile = "ref.$width.chr$chr.excludeTarget.jlso"
        outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(tgtfile, reffile, outfile=outfile, impute=true, width=width,
            fast_method=false)

        X_mendel = convert_gt(UInt8, outfile)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

MendelImpute with dynamic programming, width = 1000, maf = 0.1


[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:13[39m


Data import time                    = 9.53489 seconds
Computing haplotype pair time       = 1919.31 seconds
Phasing by dynamic programming time = 25.534 seconds
Imputing time                       = 18.0856 seconds
1972.573863 seconds (60.84 M allocations: 222.162 GiB, 3.90% gc time)
error overall = 0.005454584905970195 

MendelImpute with dynamic programming, width = 2000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:08:51[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


Data import time                    = 12.9095 seconds
Computing haplotype pair time       = 531.806 seconds
Phasing by dynamic programming time = 2.22416 seconds
Imputing time                       = 14.0015 seconds
560.924354 seconds (53.12 M allocations: 45.984 GiB, 11.69% gc time)
error overall = 0.005866805379297726 

MendelImpute with dynamic programming, width = 4000, maf = 0.1


[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:50[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


Data import time                    = 12.3763 seconds
Computing haplotype pair time       = 170.933 seconds
Phasing by dynamic programming time = 0.038868 seconds
Imputing time                       = 13.1781 seconds
196.524946 seconds (52.65 M allocations: 26.672 GiB, 3.82% gc time)
error overall = 0.008089859043258167 



# Beagle 5.0

In [3]:
# beagle 5
# Run Beagle 5.0 once per MAF cutoff, imputing each masked target file against
# the reference panel that excludes the target samples.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
function beagle()
    chr = 18
    # the reference panel is shared by every MAF cutoff
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in (0.0005, 0.001, 0.01, 0.1, 0.45)
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        # Base.run is spelled out because this notebook also defines its own `run`
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        Base.run(beagle_cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 10:09 AM PDT on 08 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.0005.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.0005
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         414,911

Burnin  iteration 1:           48 seconds
Burnin  iteration 2:           1 minute 28 seconds
Burnin  iteration 3:           1 minute 42 seconds
Burnin  iteration 4:           1 minute 46 seconds
Burnin  iteration 5:           1 minute 53 seconds
Burnin  iteration 6:           2 minutes 5 seconds

Phasing iteration 1:           1 minute 59 seconds
Phasing iteration 2:           2 minutes 1 second
Phasing iterati

DimensionMismatch: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 678557 and 863592")

In [5]:
# Genotype error rate of the Beagle output for each listed MAF cutoff,
# measured against the unmasked, full-SNP target panel.
for maf in [0.0005]
    # beagle error rate    
    chr = 18
    X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
    n, p = size(X_complete)
    # fraction of genotype entries that differ from the truth
    println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
end

error overall = 5.453964372064586e-6 



In [1]:
# beagle 5
# Run Beagle 5.0 for the remaining MAF cutoffs, imputing each masked target
# file against the reference panel that excludes the target samples.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
function beagle()
    chr = 18
    # the reference panel is shared by every MAF cutoff
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in (0.001, 0.01, 0.1, 0.45)
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        # Base.run is spelled out because this notebook also defines its own `run`
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        Base.run(beagle_cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 02:52 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.001.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.001
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         388,114

Burnin  iteration 1:           38 seconds
Burnin  iteration 2:           1 minute 26 seconds
Burnin  iteration 3:           1 minute 28 seconds
Burnin  iteration 4:           1 minute 35 seconds
Burnin  iteration 5:           1 minute 50 seconds
Burnin  iteration 6:           1 minute 47 seconds

Phasing iteration 1:           1 minute 43 seconds
Phasing iteration 2:           1 minute 47 seconds
Phasing iteratio

UndefVarError: UndefVarError: convert_gt not defined

In [3]:
# beagle error rate    
# Compare the Beagle output for the 0.001 MAF cutoff with the unmasked,
# full-SNP target panel.
chr = 18
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_beagle = convert_gt(UInt8, "beagle.imputed.maf0.001.vcf.gz")
n, p = size(X_complete)
# fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [7]:
# beagle 5
# Run Beagle 5.0 on the maf = 0.01 target and print the genotype error rate of
# its output against the unmasked, full-SNP target panel.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in (0.01,)
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        # Base.run is spelled out because this notebook also defines its own `run`
        beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`
        Base.run(beagle_cmd)

        # beagle error rate: the result is read back from "<outfile>.vcf.gz"
        X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
        X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
        n, p = size(X_complete)
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")    
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:03 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.01.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.01
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         183,460

Burnin  iteration 1:           22 seconds
Burnin  iteration 2:           31 seconds
Burnin  iteration 3:           34 seconds
Burnin  iteration 4:           33 seconds
Burnin  iteration 5:           38 seconds
Burnin  iteration 6:           38 seconds

Phasing iteration 1:           37 seconds
Phasing iteration 2:           35 seconds
Phasing iteration 3:           42 seconds
Phasing iteration 4:           38 secon

ErrorException: GeneticVariation.VCF.Reader file format error on line 11 ~>"�;IMP\t"

In [8]:
# Compare the Beagle output for the 0.01 MAF cutoff with the unmasked,
# full-SNP target panel.
chr = 18
maf = 0.01
X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
n, p = size(X_complete)
# fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")    

error overall = 0.0011677968299845297 



In [9]:
# beagle 5
# Run Beagle 5.0 on the maf = 0.1 and maf = 0.45 targets and print, for each,
# the genotype error rate of its output against the unmasked target panel.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"

    # The truth matrix does not depend on `maf`, so it is read once here
    # rather than being re-read from disk on every iteration.
    X_complete = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    n, p = size(X_complete)

    for maf in [0.1, 0.45]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "beagle.imputed.maf$maf"
        # Base.run is spelled out because this notebook also defines its own `run`
        Base.run(`java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=4`)

        # beagle error rate: the result is read back from "<outfile>.vcf.gz"
        X_beagle = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
        println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 04:47 PM PDT on 12 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.1.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.1
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:          82,786

Burnin  iteration 1:           18 seconds
Burnin  iteration 2:           19 seconds
Burnin  iteration 3:           19 seconds
Burnin  iteration 4:           19 seconds
Burnin  iteration 5:           22 seconds
Burnin  iteration 6:           19 seconds

Phasing iteration 1:           19 seconds
Phasing iteration 2:           18 seconds
Phasing iteration 3:           18 seconds
Phasing iteration 4:           17 seconds

# Eagle 2 + Minimac4

In order to use the reference panel in Eagle 2's prephase option, one must first convert it to `.bcf` format via e.g. `htslib` which is *extremely* difficult to install. Even after we went through all the hard work to obtain the final `.bcf` reference file (see commands below), eagle 2.4 STILL SAYS the file is not acceptable (not bgzipped or some processing error). Therefore, I have no choice but to prephase without the reference panel. 

In [None]:
# run eagle 2.4: 3367.79 sec on amd-2382 machine (can only run on linux systems)
eagle --vcf=target.chr20.typedOnly.masked.vcf.gz --outPrefix=eagle.phased.chr20 --numThreads=4 --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# convert ref file to m3vcf format (Total Run completed in 1 hours, 46 mins, 24 seconds)
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref.chr20.excludeTarget.vcf.gz --processReference --prefix ref.chr20.excludeTarget

In [None]:
# run minimac4 (2619 seconds)
minimac4 --refHaps ref.chr20.excludeTarget.m3vcf.gz --haps eagle.phased.vcf.gz --prefix minimac.imputed.chr20 --format GT --cpus 4

In [None]:
# minimac4 error rate    
# Compare the minimac4 output with the unmasked target panel. Note that these
# file names refer to chr20, and genotypes are read as Float32 here.
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
X_minimac = convert_gt(Float32, "minimac.imputed.chr20.dose.vcf.gz")
n, p = size(X_complete)
# fraction of genotype entries that differ from the truth
println("error overall = $(sum(X_minimac .!= X_complete) / n / p) \n")