# Test if MendelImpute can impute untyped SNPs

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Generate subset of markers for prephasing

In [2]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask()

Create the target and reference VCF files used by the experiments in this notebook.

For each chromosome in the hard-coded list (currently only 22), starting from the
`chrN.uniqueSNPs.vcf.gz` file in the current directory, this writes:

- `target.chrN.typedOnly.vcf.gz`: 250 randomly chosen samples restricted to
  100000 randomly chosen SNPs with maf > 0.05,
- `target.chrN.full.vcf.gz`: the same samples at every SNP,
- `ref.chrN.excludeTarget.vcf.gz`: every SNP for the remaining samples,
- `target.chrN.typedOnly.masked.vcf.gz`: the typed-only target passed through
  `mask_gt` with `unphase=true` and roughly 1% of entries flagged in the mask,
- `ref.chrN.aligned.vcf.gz`: the reference output of `conformgt_by_pos` run
  against the masked target.

Throws an error if fewer SNPs or samples are available than requested, or if the
aligned target written by `conformgt_by_pos` does not contain every typed SNP.
The selection and the mask use the global RNG, so seed it before calling.
"""
function filter_and_mask()
    for chr in [22]
        # The unique-SNP file was produced once from the raw data with:
        #   data = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        #   full_record_index = .!find_duplicate_marker(data)
        #   @time VCFTools.filter(data, full_record_index, 1:nsamples(data),
        #       des = "chr$chr.uniqueSNPs.vcf.gz")
        uniquefile = "chr$chr.uniqueSNPs.vcf.gz"

        # summarize data
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats(uniquefile)

        # generate target file with 250 samples and 100k snps with maf>0.05
        n = 250
        p = 100000
        large_maf = findall(x -> x > 0.05, maf_by_record)
        # fail with a clear message instead of a BoundsError further down
        length(large_maf) >= p ||
            error("only $(length(large_maf)) SNPs have maf > 0.05 but $p are required")
        samples >= n ||
            error("only $samples samples are available but $n are required")
        record_idx = falses(total_snps)
        shuffle!(large_maf)
        record_idx[large_maf[1:p]] .= true
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter(uniquefile, record_idx, sample_idx,
            des = "target.chr$chr.typedOnly.vcf.gz")

        # generate target panel with all snps
        @time VCFTools.filter(uniquefile,
            1:total_snps, sample_idx, des = "target.chr$chr.full.vcf.gz")

        # also generate reference panel without target samples
        @time VCFTools.filter(uniquefile,
            1:total_snps, .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz")

        # unphase and mask 1% entries in target file
        # (scalar rand() calls are kept so the seeded mask stays reproducible)
        masks = falses(p, n)
        missingprop = 0.01
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        @time mask_gt("target.chr$chr.typedOnly.vcf.gz", masks,
            des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)

        # generate subset of reference file that matches target file
        @time conformgt_by_pos("ref.chr$chr.excludeTarget.vcf.gz",
            "target.chr$(chr).typedOnly.masked.vcf.gz",
            "chr$chr.aligned", "$chr", 1:typemax(Int))
        if nrecords("chr$chr.aligned.tgt.vcf.gz") == p
            rm("chr$chr.aligned.tgt.vcf.gz", force=true) # perfect match
        else
            error("target file has SNPs not matching in reference file! Shouldn't happen!")
        end
        # use the loop's chromosome rather than a hard-coded "chr22"
        mv("chr$chr.aligned.ref.vcf.gz", "ref.chr$chr.aligned.vcf.gz", force=true)
    end
end
Random.seed!(2020)
@time filter_and_mask()

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:08:32[39m


400.374871 seconds (4.98 G allocations: 468.120 GiB, 9.98% gc time)
472.964733 seconds (5.67 G allocations: 522.326 GiB, 11.03% gc time)
909.764993 seconds (12.13 G allocations: 935.917 GiB, 13.36% gc time)
 13.910180 seconds (76.61 M allocations: 8.666 GiB, 6.13% gc time)


┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:57[39m


616.684922 seconds (7.23 G allocations: 672.405 GiB, 10.84% gc time)


┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239


ArgumentError: ArgumentError: 'ref.chr22.aligned.vcf.gz' exists. `force=true` is required to remove 'ref.chr22.aligned.vcf.gz' before moving.

### Missing rate

Among the typed markers, 1% of the data is missing at random. In addition, about 84% of all markers are untyped (i.e. systematically missing).

In [4]:
# Fraction of records in the reference file that have no counterpart in the
# typed target file, computed from the two record counts.
tgtfile = "target.chr22.typedOnly.masked.vcf.gz"
reffile = "ref.chr22.excludeTarget.vcf.gz"
missing_rate = 1 - nrecords(tgtfile) / nrecords(reffile)

0.844946576342879

# MendelImpute on typed and untyped markers with dp

In [2]:
# number of threads this Julia session was started with
Threads.nthreads()

8

In [3]:
# Print the record and sample counts of every VCF file used below.
tgtfile = "target.chr22.typedOnly.masked.vcf.gz"      # masked typed-only target
reffile = "ref.chr22.excludeTarget.vcf.gz"            # reference panel without target samples
reffile_aligned = "ref.chr22.aligned.vcf.gz"          # reference output of conformgt_by_pos
X_typedOnly_complete = "target.chr22.typedOnly.vcf.gz" # typed-only target before masking
X_full_complete = "target.chr22.full.vcf.gz"          # target samples at all SNPs
@show nrecords(tgtfile), nsamples(tgtfile)
@show nrecords(reffile), nsamples(reffile)
@show nrecords(reffile_aligned), nsamples(reffile_aligned)
@show nrecords(X_typedOnly_complete), nsamples(X_typedOnly_complete)
@show nrecords(X_full_complete), nsamples(X_full_complete)

(nrecords(tgtfile), nsamples(tgtfile)) = (100000, 250)
(nrecords(reffile), nsamples(reffile)) = (644939, 2254)
(nrecords(reffile_aligned), nsamples(reffile_aligned)) = (100000, 2254)
(nrecords(X_typedOnly_complete), nsamples(X_typedOnly_complete)) = (100000, 250)
(nrecords(X_full_complete), nsamples(X_full_complete)) = (644939, 250)


(644939, 250)

### Error rate on typed snps 

In [4]:
# Typed SNPs only, reference panel that excludes the target samples.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with `fast_method=false` on the masked
# typed-only target and print the fraction of genotype entries that differ from
# the unmasked typed-only file.
function run()
    chromosome = 22
    # truth matrix; use "target.chr22.full.vcf.gz" instead to score every SNP
    truth = convert_gt(Float32, "target.chr22.typedOnly.vcf.gz")
    nrows, ncols = size(truth)

    # input file names do not depend on the window width
    masked_target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    ref_panel = "ref.chr$chromosome.excludeTarget.vcf.gz"
    ref_aligned = "ref.chr$chromosome.aligned.vcf.gz"

    for width in (500, 1000)
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        imputed_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, ref_panel, reffile_aligned = ref_aligned,
            outfile = imputed_file, width = width, fast_method=false)
        imputed = convert_gt(Float32, imputed_file)
        mismatch = sum(imputed .!= truth) / nrows / ncols
        println("error on typed data = $mismatch \n")
    end
end
run()

Imputing typed SNPs only with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:42[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:03:49[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:14[39m


450.905790 seconds (651.75 M allocations: 72.804 GiB, 1.93% gc time)
error on typed data = 0.00038176000000000003 

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:32[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:36[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:07:36[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


612.972352 seconds (634.26 M allocations: 65.501 GiB, 1.33% gc time)
error on typed data = 0.0006258 



### Error rate on untyped + typed snps

In [5]:
# Typed + untyped SNPs, `fast_method=false`, reference panel without target samples.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with `impute=true` on the masked typed-only
# target and print the fraction of genotype entries that differ from the full
# target file.
function run()
    chromosome = 22
    # truth matrix; use "target.chr22.typedOnly.vcf.gz" instead to score typed SNPs only
    truth = convert_gt(Float32, "target.chr22.full.vcf.gz")
    nrows, ncols = size(truth)

    # input file names do not depend on the window width
    masked_target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    ref_panel = "ref.chr$chromosome.excludeTarget.vcf.gz"
    ref_aligned = "ref.chr$chromosome.aligned.vcf.gz"

    for width in (500, 1000)
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        imputed_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, ref_panel, reffile_aligned = ref_aligned, impute=true,
            outfile = imputed_file, width = width, fast_method=false)
        imputed = convert_gt(Float32, imputed_file)
        mismatch = sum(imputed .!= truth) / nrows / ncols
        println("error overall = $mismatch \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:36[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:03:40[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:49[39m


1055.139365 seconds (8.67 G allocations: 810.348 GiB, 9.00% gc time)
error overall = 0.0040779236485931234 

Imputing typed + untyped SNPs with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:06[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:08:19[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:59[39m


1312.585610 seconds (8.67 G allocations: 803.741 GiB, 7.05% gc time)
error overall = 0.005744133941349492 



In [23]:
# Same experiment as the previous cell (typed + untyped SNPs, `fast_method=false`,
# reference panel without target samples), rerun using impute!.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with `impute=true` on the masked typed-only
# target and print the fraction of genotype entries that differ from the full
# target file.
function run()
    chromosome = 22
    # truth matrix; use "target.chr22.typedOnly.vcf.gz" instead to score typed SNPs only
    truth = convert_gt(Float32, "target.chr22.full.vcf.gz")
    nrows, ncols = size(truth)

    # input file names do not depend on the window width
    masked_target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    ref_panel = "ref.chr$chromosome.excludeTarget.vcf.gz"
    ref_aligned = "ref.chr$chromosome.aligned.vcf.gz"

    for width in (500, 1000)
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        imputed_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, ref_panel, reffile_aligned = ref_aligned, impute=true,
            outfile = imputed_file, width = width, fast_method=false)
        imputed = convert_gt(Float32, imputed_file)
        mismatch = sum(imputed .!= truth) / nrows / ncols
        println("error overall = $mismatch \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:03:45[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:03:24[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:03:02[39m


1247.082654 seconds (10.87 G allocations: 1019.611 GiB, 9.67% gc time)
error overall = 0.008522145505233829 

Imputing typed + untyped SNPs with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:28[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:30[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:07:22[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:03:03[39m


1331.498374 seconds (10.87 G allocations: 1012.988 GiB, 8.83% gc time)
error overall = 0.014499281327381349 



In [2]:
# Step through the phasing pipeline by hand for width = 500 so the intermediate
# phase information `ph` can be inspected afterwards.
width = 500
chr = 22
tgtfile = "target.chr$chr.typedOnly.masked.vcf.gz"
reffile = "ref.chr$chr.excludeTarget.vcf.gz"
outfile = "mendel.imputed.dp$width.vcf.gz"
reffile_aligned = "ref.chr$chr.aligned.vcf.gz"
tgt_snps, ref_snps = nrecords(tgtfile), nrecords(reffile_aligned)

# declare some constants
people = nsamples(tgtfile)
haplotypes = 2nsamples(reffile_aligned)   # two haplotypes per reference sample
ph = [HaplotypeMosaicPair(tgt_snps) for i in 1:people] # phase information

# decide how to partition the data based on available memory
snps_per_chunk = MendelImpute.chunk_size(people, haplotypes)
chunks = ceil(Int, tgt_snps / snps_per_chunk)

# setup reader to convert vcf files to numeric matrices
# NOTE(review): neither reader is closed in this cell.
Xreader = VCF.Reader(openvcf(tgtfile, "r"))
Hreader = VCF.Reader(openvcf(reffile_aligned, "r"));

# sync data to phase last (possibly only) chunk
# NOTE(review): the readers were just opened, so this presumably reads the first
# `remaining_snps` records; that is the last chunk only when chunks == 1 — confirm.
remaining_snps = tgt_snps - ((chunks - 1) * snps_per_chunk)
X = Matrix{Union{Float32, Missing}}(undef, remaining_snps, people)
H = BitArray{2}(undef, remaining_snps, haplotypes)
copy_gt_trans!(X, Xreader, msg = "Importing genotype file...")
copy_ht_trans!(H, Hreader, msg = "Importing reference haplotype files...");

# same settings as the dp runs above (fast_method=false)
hs = compute_optimal_halotype_set(X, H, width = width, fast_method=false)

phase!(ph, X, H, hs, width=width)
# keep an independent copy of the phase information
ph_copy = deepcopy(ph);

[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:33[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:34[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:03:16[39m


In [3]:
# display the `start` field of `strand1` for the first entry of `ph`
ph[1].strand1.start

182-element Array{Int64,1}:
     1
   419
   680
  1646
  2094
  2395
  3445
  3001
  3667
  4625
  4951
  5377
  5986
     ⋮
 93001
 93881
 94509
 94668
 95550
 95883
 96501
 96908
 97657
 98424
 98801
 99406

# MendelImpute on typed/untyped markers with fast method

In [14]:
# fast method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width in 100:100:500, run `phase` with `fast_method=true` and
# `impute=true` on the masked typed-only target, then print the fraction of
# genotype entries that differ from the full target file.
function run()
    # truth matrix; use "target.chr22.typedOnly.vcf.gz" instead to score typed SNPs only
    X_complete = convert_gt(Float32, "target.chr22.full.vcf.gz")
    n, p = size(X_complete)
    chr = 22

    # input file names do not depend on the window width
    tgtfile = "target.chr$chr.typedOnly.masked.vcf.gz"
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    reffile_aligned = "ref.chr$chr.aligned.vcf.gz"

    for width in 100:100:500
        # this cell runs the fast method, so the message must not say "dynamic programming"
        println("Imputing typed + untyped SNPs with fast method, width = $width")
        # NOTE(review): the output name still carries the "dp" tag and, for
        # width = 500, overwrites the file written by the dp cells above.
        outfile = "mendel.imputed.dp$width.vcf.gz"
        @time phase(tgtfile, reffile, reffile_aligned = reffile_aligned, impute=true,
            outfile = outfile, width = width, fast_method=true)
        X_mendel = convert_gt(Float32, outfile)
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 100
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:28[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:44[39m


689.956454 seconds (8.67 G allocations: 798.973 GiB, 14.26% gc time)
error overall = 0.010857702821507149 

Imputing typed + untyped SNPs with dynamic programming, width = 200
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:09[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:14[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:48[39m


740.672125 seconds (8.67 G allocations: 801.593 GiB, 12.78% gc time)
error overall = 0.010387438191828994 

Imputing typed + untyped SNPs with dynamic programming, width = 300
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:33[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:32[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:33[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:55[39m


816.463186 seconds (8.67 G allocations: 803.592 GiB, 11.90% gc time)
error overall = 0.010579456351685973 

Imputing typed + untyped SNPs with dynamic programming, width = 400
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:37[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:30[39m


790.604516 seconds (8.67 G allocations: 804.614 GiB, 11.56% gc time)
error overall = 0.011033117860758924 

Imputing typed + untyped SNPs with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:34[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:19[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:50[39m


854.401966 seconds (8.67 G allocations: 805.029 GiB, 11.55% gc time)
error overall = 0.011707159901944215 



# Complete ref panel gives good error % for typed snps

In [13]:
# Typed SNPs only, complete reference panel (the unique-SNP file, which still
# contains the target samples). Timings include generating ref.aligned.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with `fast_method=false` on the masked
# typed-only target and print the fraction of genotype entries that differ from
# the unmasked typed-only file.
function run()
    chromosome = 22
    # truth matrix; use "target.chr22.full.vcf.gz" instead to score every SNP
    truth = convert_gt(Float32, "target.chr22.typedOnly.vcf.gz")
    nrows, ncols = size(truth)

    # input file names do not depend on the window width
    masked_target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    ref_panel = "chr$chromosome.uniqueSNPs.vcf.gz"

    for width in (500, 1000)
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        imputed_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, ref_panel, outfile = imputed_file, width = width, fast_method=false)
        imputed = convert_gt(Float32, imputed_file)
        mismatch = sum(imputed .!= truth) / nrows / ncols
        println("error on typedOnly data = $mismatch \n")
    end
end
run()

┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:13:31[39m[32mImputing samples... 89%|████████████████████████████▌   |  ETA: 0:01:38[39m
┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:34[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:13[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:00:07[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m
┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:10:00[39m
┌ Info: 100000 r

Imputing typed SNPs only with dynamic programming, width = 500
Running chunk 1 / 1
1059.540364 seconds (8.67 G allocations: 808.684 GiB, 7.92% gc time)
error on prephase data = 5.7199999999999994e-6 

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1
823.782148 seconds (8.67 G allocations: 806.468 GiB, 12.22% gc time)
error on prephase data = 5.6e-7 



In [14]:
# Typed + untyped SNPs, complete reference panel (the unique-SNP file, which
# still contains the target samples).
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with `impute=true` and `fast_method=false`
# on the masked typed-only target and print the fraction of genotype entries
# that differ from the full target file.
function run()
    chromosome = 22
    # truth matrix; use "target.chr22.typedOnly.vcf.gz" instead to score typed SNPs only
    truth = convert_gt(Float32, "target.chr22.full.vcf.gz")
    nrows, ncols = size(truth)

    # input file names do not depend on the window width
    masked_target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    ref_panel = "chr$chromosome.uniqueSNPs.vcf.gz"

    for width in (500, 1000, 2000)
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        imputed_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, ref_panel, impute=true, outfile = imputed_file,
            width = width, fast_method=false)
        imputed = convert_gt(Float32, imputed_file)
        mismatch = sum(imputed .!= truth) / nrows / ncols
        println("error overall = $mismatch \n")
    end
end
run()

┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:09[39m
┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:51[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:00:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:07:20[39m
┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:28[39m
┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt

Imputing typed + untyped SNPs with dynamic programming, width = 500
Running chunk 1 / 1
1428.346320 seconds (17.51 G allocations: 1.576 TiB, 13.55% gc time)
error overall = 0.0002901545727580438 

Imputing typed + untyped SNPs with dynamic programming, width = 1000
Running chunk 1 / 1
1464.793405 seconds (17.51 G allocations: 1.573 TiB, 13.52% gc time)
error overall = 1.1852283704350334e-5 

Imputing typed + untyped SNPs with dynamic programming, width = 2000
Running chunk 1 / 1
1399.438588 seconds (17.51 G allocations: 1.571 TiB, 13.52% gc time)
error overall = 1.8606410838854526e-8 



# Beagle 5.0 on complete ref panel

In [9]:
# Rename every sample in tgtfile to "tmp<i>" (i = position in the header) and
# write the result to tgt_tmp; the records themselves are copied unchanged.
chr = 22
tgtfile = "target.chr$chr.typedOnly.masked.vcf.gz"
tgt_tmp = "target.chr$chr.typedOnly.masked.tmp.vcf.gz"
reader = VCF.Reader(openvcf(tgtfile, "r"))
metainfo = VCF.header(reader).metainfo
sampleID = VCF.header(reader).sampleID
# plain loops: the comprehensions previously used here only built throwaway arrays
for i in eachindex(sampleID)
    sampleID[i] = "tmp$i"
end

# output tmp target
writer = VCF.Writer(openvcf(tgt_tmp, "w"), VCF.Header(metainfo, sampleID))
for record in reader
    VCF.write(writer, record)
end
flush(writer); close(reader); close(writer)

In [11]:
# Beagle 5.0 with the complete reference panel (the unique-SNP file).
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run Beagle on the renamed masked target, then print the fraction of genotype
# entries in Beagle's output that differ from the full target file.
# `Base.run` is qualified because this notebook defines its own `run`.
function beagle()
    chromosome = 22
    target = "target.chr$chromosome.typedOnly.masked.tmp.vcf.gz"
    panel = "chr$chromosome.uniqueSNPs.vcf.gz"
    prefix = "beagle.imputed.completeREF"
    command = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$target ref=$panel out=$prefix nthreads=4`
    Base.run(command)

    # beagle error rate
    truth = convert_gt(Float32, "target.chr22.full.vcf.gz")
    imputed = convert_gt(Float32, "beagle.imputed.completeREF.vcf.gz")
    nrows, ncols = size(truth)
    mismatch = sum(imputed .!= truth) / nrows / ncols
    println("error overall = $mismatch \n")
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 03:16 PM PDT on 23 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr22.prephase.masked.tmp.vcf.gz
  ref=chr22.uniqueSNPs.vcf.gz
  out=beagle.imputed.completeREF
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,504
Study samples:             250

Window 1 (22:16050115-51244237)
Reference markers:     644,939
Study markers:         100,000

Burnin  iteration 1:           41 seconds
Burnin  iteration 2:           47 seconds
Burnin  iteration 3:           49 seconds
Burnin  iteration 4:           46 seconds
Burnin  iteration 5:           51 seconds
Burnin  iteration 6:           52 seconds

Phasing iteration 1:           50 seconds
Phasing iteration 2:           49 seconds
Phasing iteration 3:           48 seconds
Phasing iteration 4:           53 seconds
Ph

# Beagle 5.0 on incomplete ref panel

In [26]:
# Beagle 5.0 with the reference panel that excludes the target samples.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run Beagle on the masked target, then print the fraction of genotype entries
# in Beagle's output that differ from the full target file.
# `Base.run` is qualified because this notebook defines its own `run`.
function beagle()
    chromosome = 22
    target = "target.chr$chromosome.typedOnly.masked.vcf.gz"
    panel = "ref.chr$chromosome.excludeTarget.vcf.gz"
    prefix = "beagle.imputed"
    command = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$target ref=$panel out=$prefix nthreads=4`
    Base.run(command)

    # beagle error rate
    truth = convert_gt(Float32, "target.chr22.full.vcf.gz")
    imputed = convert_gt(Float32, "beagle.imputed.vcf.gz")
    nrows, ncols = size(truth)
    mismatch = sum(imputed .!= truth) / nrows / ncols
    println("error overall = $mismatch \n")
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 06:01 PM PDT on 24 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr22.typedOnly.masked.vcf.gz
  ref=ref.chr22.excludeTarget.vcf.gz
  out=beagle.imputed
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,254
Study samples:             250

Window 1 (22:16050115-51244237)
Reference markers:     644,939
Study markers:         100,000

Burnin  iteration 1:           19 seconds
Burnin  iteration 2:           45 seconds
Burnin  iteration 3:           48 seconds
Burnin  iteration 4:           53 seconds
Burnin  iteration 5:           58 seconds
Burnin  iteration 6:           57 seconds

Phasing iteration 1:           55 seconds
Phasing iteration 2:           51 seconds
Phasing iteration 3:           52 seconds
Phasing iteration 4:           51 seconds
Phasing it

In [5]:
# Beagle error rate: fraction of genotype entries in Beagle's output that differ
# from the full target file.
X_complete = convert_gt(Float32, "target.chr22.full.vcf.gz")
X_beagle = convert_gt(Float32, "beagle.imputed.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")

error overall = 0.006312125642890258 



# Eagle 2 + Minimac4

To use the reference panel with Eagle 2's prephasing option, one must first convert it to `.bcf` format, e.g. via `htslib`, which is *extremely* difficult to install. Even after going through all the work of obtaining the final `.bcf` reference file (see commands below), Eagle 2.4 still reports that the file is not acceptable (not bgzipped, or some processing error). Therefore, I had no choice but to prephase without the reference panel.

In [None]:
# The original version of this chunk did not work: after `bgzip` the input is
# named *.vcf.gz (the plain *.vcf no longer exists), and `bcftools view` without
# `-O b` writes plain-text VCF even when the output file is named *.bcf.
# A BCF written with `-O b` is already BGZF-compressed, so no extra bgzip step
# is needed. Corrected sequence (not re-run in this notebook):
module load htslib/1.9                   # load htslib on hoffman
gunzip ref.chr22.excludeTarget.vcf.gz    # unzip .vcf.gz
bgzip ref.chr22.excludeTarget.vcf        # recompress with bgzip -> .vcf.gz
bcftools index -f ref.chr22.excludeTarget.vcf.gz   # index the bgzipped VCF
bcftools view -O b -o ref.chr22.excludeTarget.bcf ref.chr22.excludeTarget.vcf.gz   # write compressed BCF
bcftools index -f ref.chr22.excludeTarget.bcf      # index the BCF

In [None]:
# Prephase the masked target with Eagle 2.4; no reference panel is passed.
# Recorded run time: 566.355 sec on an amd-2431 machine (can only run on linux systems).
eagle --vcf=target.chr22.typedOnly.masked.vcf.gz --outPrefix=eagle.phased --numThreads=4 --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# Convert the reference panel (target samples excluded) to m3vcf format with Minimac3.
# Recorded run time: 1 hour, 46 mins, 24 seconds.
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref.chr22.excludeTarget.vcf.gz --processReference --prefix ref.chr22.excludeTarget

In [None]:
# Impute with minimac4 from the Eagle-phased target and the m3vcf reference panel.
# Recorded run time: 43 mins, 16 seconds.
minimac4 --refHaps ref.chr22.excludeTarget.m3vcf.gz --haps eagle.phased.vcf.gz --prefix minimac.imputed.chr22 --format GT --cpus 8

In [4]:
# minimac4 error rate: fraction of genotype entries in the minimac4 output that
# differ from the full target file.
X_complete = convert_gt(Float32, "target.chr22.full.vcf.gz")
X_minimac = convert_gt(Float32, "minimac.imputed.chr22.dose.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_minimac .!= X_complete) / n / p) \n")

error overall = 0.007229769016914778 

