# Test if MendelImpute can impute untyped SNPs

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling VCFTools [a620830f-fdd7-5ebc-8d26-3621ab35fbfe]
└ @ Base loading.jl:1273
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Generate subset of markers for prephasing

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask(; chromosomes = [21], maf_threshold = 0.1, n = 2, missingprop = 0.001)

Create the target and reference VCF files used by the imputation experiments.

All files are read from / written to paths relative to the current directory. For each
`chr` in `chromosomes` the function

1. removes duplicate markers from the raw file (`chr{chr}.uniqueSNPs.vcf.gz`),
2. picks `n` random samples as the target and keeps only SNPs with minor allele
   frequency above `maf_threshold` as typed SNPs (`target.chr{chr}.typedOnly.vcf.gz`),
3. writes the same `n` samples with all SNPs (`target.chr{chr}.full.vcf.gz`) and the
   remaining samples as the reference panel (`ref.chr{chr}.excludeTarget.vcf.gz`),
4. unphases the typed-only target and masks each entry independently with probability
   `missingprop` (`target.chr{chr}.typedOnly.masked.vcf.gz`),
5. subsets the reference panel to the typed positions (`ref.chr{chr}.aligned.vcf.gz`).

# Keywords
- `chromosomes`: iterable of chromosome numbers to process.
- `maf_threshold`: SNPs with minor allele frequency strictly above this are "typed".
- `n`: number of samples moved from the panel into the target file.
- `missingprop`: probability with which each typed target entry is masked.

# Throws
- `ArgumentError` if `missingprop` is outside `[0, 1]`.
- `ErrorException` if the position-matched target does not contain every typed SNP.
"""
function filter_and_mask(; chromosomes = [21], maf_threshold = 0.1, n = 2,
                         missingprop = 0.001)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be in [0, 1], got $missingprop"))

    for chr in chromosomes
        # every file touched for this chromosome, named once
        raw = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        unique_snps = "chr$chr.uniqueSNPs.vcf.gz"
        tgt_typed = "target.chr$chr.typedOnly.vcf.gz"
        tgt_masked = "target.chr$chr.typedOnly.masked.vcf.gz"
        tgt_full = "target.chr$chr.full.vcf.gz"
        ref_full = "ref.chr$chr.excludeTarget.vcf.gz"
        aligned_prefix = "chr$chr.aligned"
        aligned_tgt = aligned_prefix * ".tgt.vcf.gz"
        aligned_ref = aligned_prefix * ".ref.vcf.gz"

        # filter chromosome data for unique snps
        full_record_index = .!find_duplicate_marker(raw)
        @time VCFTools.filter(raw, full_record_index, 1:nsamples(raw), des = unique_snps)

        # summarize data
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats(unique_snps)
        large_maf = findall(x -> x > maf_threshold, maf_by_record)

        # generate target file with n samples and keep snps with maf > maf_threshold
        # as typed SNPs
        p = length(large_maf)
        record_idx = falses(total_snps)
        record_idx[large_maf] .= true
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx) # the n target samples end up at random positions
        @time VCFTools.filter(unique_snps, record_idx, sample_idx, des = tgt_typed)

        # generate target panel with all snps
        @time VCFTools.filter(unique_snps, 1:total_snps, sample_idx, des = tgt_full)

        # also generate reference panel without target samples
        @time VCFTools.filter(unique_snps, 1:total_snps, .!sample_idx, des = ref_full)

        # unphase and mask a fraction `missingprop` (default 0.1%) of entries in the
        # target file; kept as one scalar draw per entry so the sequence of random
        # numbers consumed after `Random.seed!` is unchanged
        masks = falses(p, n)
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        @time mask_gt(tgt_typed, masks, des = tgt_masked, unphase = true)

        # generate subset of reference file that matches target file
        @time conformgt_by_pos(ref_full, tgt_masked, aligned_prefix, "$chr",
            1:typemax(Int))
        # every typed SNP came from the same source file, so all p must be matched
        nrecords(aligned_tgt) == p ||
            error("target file has SNPs not matching in reference file! Shouldn't happen!")
        rm(aligned_tgt, force = true) # perfect match, the copy is redundant
        mv(aligned_ref, "ref.chr$chr.aligned.vcf.gz", force = true)
    end
end
Random.seed!(2020)
@time filter_and_mask()

527.966603 seconds (4.92 G allocations: 464.636 GiB, 7.19% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:08:23[39m


383.300855 seconds (4.87 G allocations: 461.590 GiB, 9.76% gc time)
388.491073 seconds (4.88 G allocations: 473.436 GiB, 9.75% gc time)
857.118673 seconds (12.96 G allocations: 1001.384 GiB, 11.30% gc time)
  0.505452 seconds (1.45 M allocations: 150.259 MiB, 3.11% gc time)


┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:52[39m


613.525680 seconds (7.38 G allocations: 680.798 GiB, 11.27% gc time)
3688.288659 seconds (48.07 G allocations: 4.196 TiB, 10.18% gc time)


┌ Info: 90481 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239


### Missing rate

In typed markers, 0.1% of data is missing at random. In addition, 86% of all markers are not typed (i.e. systematically missing). 

In [4]:
# Fraction of reference-panel markers that are absent from the typed-only target file.
tgtfile = "target.chr21.typedOnly.masked.vcf.gz"
reffile = "ref.chr21.excludeTarget.vcf.gz"
missing_rate = let typed = nrecords(tgtfile), total = nrecords(reffile)
    1 - typed / total
end

0.8600524333562761

In [2]:
# Number of threads available to this Julia session (shown as the cell value).
Threads.nthreads()

8

In [6]:
# Sanity check: print (number of records, number of samples) for every file used below.
# `@show` echoes the expression text, so the variable names appear in the output.
tgtfile = "target.chr21.typedOnly.masked.vcf.gz"        # typed SNPs only, masked + unphased
reffile = "ref.chr21.excludeTarget.vcf.gz"              # all SNPs, target samples removed
reffile_aligned = "ref.chr21.aligned.vcf.gz"            # reference subset matching the target
X_typedOnly_complete = "target.chr21.typedOnly.vcf.gz"  # typed SNPs only, before masking
X_full_complete = "target.chr21.full.vcf.gz"            # all SNPs for the target samples
@show nrecords(tgtfile), nsamples(tgtfile)
@show nrecords(reffile), nsamples(reffile)
@show nrecords(reffile_aligned), nsamples(reffile_aligned)
@show nrecords(X_typedOnly_complete), nsamples(X_typedOnly_complete)
@show nrecords(X_full_complete), nsamples(X_full_complete)

(nrecords(tgtfile), nsamples(tgtfile)) = (90481, 2)
(nrecords(reffile), nsamples(reffile)) = (646535, 2502)
(nrecords(reffile_aligned), nsamples(reffile_aligned)) = (90481, 2502)
(nrecords(X_typedOnly_complete), nsamples(X_typedOnly_complete)) = (90481, 2)
(nrecords(X_full_complete), nsamples(X_full_complete)) = (646535, 2)


(646535, 2)

# MendelImpute on untyped markers with dp

In [4]:
# dp method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

"""
    run()

Impute the masked chr21 target file with `phase` (`fast_method=false`) for window
widths 250, 500 and 1000. For each width, print the timing of `phase` and the fraction
of entries of the imputed genotype matrix that differ from `target.chr21.full.vcf.gz`.
"""
function run()
    # Truth set: all SNPs of the target samples. Load "target.chr21.typedOnly.vcf.gz"
    # here instead to score the typed SNPs only.
    X_complete = convert_gt(Float32, "target.chr21.full.vcf.gz")
    n, p = size(X_complete)
    chr = 21
    # input files do not depend on the window width, so name them once
    tgtfile = "target.chr$chr.typedOnly.masked.vcf.gz"
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for width in [250, 500, 1000]
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        outfile = "mendel.imputed.dp$width.vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, impute=true,
            width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        # error = number of mismatching entries / total number of entries
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 250


[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:48[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:29[39m


293.920990 seconds (4.31 G allocations: 325.521 GiB, 17.38% gc time)
error overall = 0.0028382067482812224 

Imputing typed + untyped SNPs with dynamic programming, width = 500


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:04[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:39[39m


320.116762 seconds (4.85 G allocations: 334.447 GiB, 17.62% gc time)
error overall = 0.0029975175357869256 

Imputing typed + untyped SNPs with dynamic programming, width = 1000


[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:59[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:38[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:08[39m


319.218235 seconds (5.31 G allocations: 336.063 GiB, 19.55% gc time)
error overall = 0.003932501720711175 



In [3]:
# dp method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Impute the masked chr21 target for window widths 200 and 400, passing the pre-aligned
# reference panel to `phase`, and print the fraction of genotype entries that differ
# from the full target file.
function run()
    chr = 21
    target_path = "target.chr$chr.typedOnly.masked.vcf.gz"
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"
    aligned_panel_path = "ref.chr$chr.aligned.vcf.gz"

    # truth set with every SNP (swap in "target.chr21.typedOnly.vcf.gz" to score
    # typed SNPs only)
    X_complete = convert_gt(Float32, "target.chr21.full.vcf.gz")
    n, p = size(X_complete)

    for width in (200, 400)
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        imputed_path = "mendel.imputed.dp$width.vcf.gz"
        @time phase(target_path, panel_path; outfile = imputed_path, impute = true,
            width = width, reffile_aligned = aligned_panel_path, fast_method = false)
        X_mendel = convert_gt(Float32, imputed_path)
        mismatches = sum(X_mendel .!= X_complete)
        println("error overall = $(mismatches / n / p) \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 200
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:24[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:14[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:05[39m


630.604613 seconds (8.61 G allocations: 815.055 GiB, 12.89% gc time)
error overall = 0.0028621807017408184 

Imputing typed + untyped SNPs with dynamic programming, width = 400
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:15[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:02[39m


631.205289 seconds (8.59 G allocations: 815.977 GiB, 12.72% gc time)
error overall = 0.002871460941789694 



In [9]:
# dp method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Repeat of the width = 200 / 400 experiment: impute the masked chr21 target with the
# pre-aligned reference panel and report the genotype mismatch rate for each width.
function run()
    chr = 21
    widths = (200, 400)

    # files shared by every width
    masked_target = "target.chr$chr.typedOnly.masked.vcf.gz"
    full_panel = "ref.chr$chr.excludeTarget.vcf.gz"
    aligned_panel = "ref.chr$chr.aligned.vcf.gz"

    # ground truth: all SNPs of the target samples
    # (use "target.chr21.typedOnly.vcf.gz" to evaluate typed SNPs only)
    X_complete = convert_gt(Float32, "target.chr21.full.vcf.gz")
    n, p = size(X_complete)

    for width in widths
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        result_file = "mendel.imputed.dp$width.vcf.gz"
        @time phase(masked_target, full_panel; reffile_aligned = aligned_panel,
            outfile = result_file, width = width, impute = true, fast_method = false)
        X_mendel = convert_gt(Float32, result_file)
        n_wrong = sum(X_mendel .!= X_complete)
        println("error overall = $(n_wrong / n / p) \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 200
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:28[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:11[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:03[39m


636.635822 seconds (8.59 G allocations: 814.260 GiB, 13.57% gc time)
error overall = 0.0028505804016797235 

Imputing typed + untyped SNPs with dynamic programming, width = 400
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:18[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:25[39m


666.282339 seconds (8.59 G allocations: 815.880 GiB, 12.66% gc time)
error overall = 0.0028668208217652565 



In [7]:
# dp method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Same experiment with wider windows (500, 1000, 2000): impute the masked chr21 target
# using the pre-aligned reference panel and print the genotype mismatch rate per width.
function run()
    chr = 21
    target_path = "target.chr$chr.typedOnly.masked.vcf.gz"
    panel_path = "ref.chr$chr.excludeTarget.vcf.gz"
    aligned_panel_path = "ref.chr$chr.aligned.vcf.gz"

    # reference answer containing every SNP; replace with
    # "target.chr21.typedOnly.vcf.gz" to score typed SNPs only
    X_complete = convert_gt(Float32, "target.chr21.full.vcf.gz")
    n, p = size(X_complete)

    for width in (500, 1000, 2000)
        println("Imputing typed + untyped SNPs with dynamic programming, width = $width")
        imputed_path = "mendel.imputed.dp$width.vcf.gz"
        @time phase(target_path, panel_path; impute = true, fast_method = false,
            width = width, outfile = imputed_path, reffile_aligned = aligned_panel_path)
        X_mendel = convert_gt(Float32, imputed_path)
        differing = sum(X_mendel .!= X_complete)
        println("error overall = $(differing / n / p) \n")
    end
end
run()

Imputing typed + untyped SNPs with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:26[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:20[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:01[39m


638.769465 seconds (8.60 G allocations: 814.981 GiB, 12.70% gc time)
error overall = 0.0029975175357869256 

Imputing typed + untyped SNPs with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:14[39m
[32mDynamic programming + merging breakpoints...100%|███████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:06[39m


646.665749 seconds (8.59 G allocations: 808.193 GiB, 12.42% gc time)
error overall = 0.003906981060576767 

Imputing typed + untyped SNPs with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:27[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:10[39m
[32mDynamic programming + merging breakpoints...100%|███████| Time: 0:00:16[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:06:00[39m


635.662692 seconds (8.59 G allocations: 802.893 GiB, 12.37% gc time)
error overall = 0.005551903609240026 



# Beagle 5.0

In [8]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    beagle(; chr = 21, nthreads = 4)

Impute the masked target file of chromosome `chr` with Beagle 5.0
(`beagle.28Sep18.793.jar` in the current directory, run with `nthreads` threads), then
print the fraction of entries of the imputed genotype matrix that differ from
`target.chr{chr}.full.vcf.gz`.
"""
function beagle(; chr = 21, nthreads = 4)
    tgtfile = "target.chr$chr.typedOnly.masked.vcf.gz"
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    outfile = "beagle.imputed"
    # `Base.run` is qualified because earlier cells of this notebook define their own
    # zero-argument `run()` function.
    Base.run(`java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outfile nthreads=$nthreads`)

    # beagle error rate; the imputed file name is derived from `outfile` so the two
    # cannot drift apart
    X_complete = convert_gt(Float32, "target.chr$chr.full.vcf.gz")
    X_beagle = convert_gt(Float32, outfile * ".vcf.gz")
    n, p = size(X_complete)
    println("error overall = $(sum(X_beagle .!= X_complete) / n / p) \n")
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 12:04 AM PDT on 26 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr21.typedOnly.masked.vcf.gz
  ref=ref.chr21.excludeTarget.vcf.gz
  out=beagle.imputed
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,502
Study samples:               2

Window 1 (21:9411245-48119740)
Reference markers:     646,535
Study markers:          90,481

Burnin  iteration 1:           1 second
Burnin  iteration 2:           1 second
Burnin  iteration 3:           2 seconds
Burnin  iteration 4:           1 second
Burnin  iteration 5:           1 second
Burnin  iteration 6:           1 second

Phasing iteration 1:           2 seconds
Phasing iteration 2:           1 second
Phasing iteration 3:           1 second
Phasing iteration 4:           1 second
Phasing iteration 5:         

# Eagle 2 + Minimac4

To use the reference panel in Eagle 2's prephasing option, one must first convert it to `.bcf` format, e.g. via `htslib`, which is *extremely* difficult to install. Even after going through all the work needed to obtain the final `.bcf` reference file (see commands below), Eagle 2.4 still rejects the file (reporting that it is not bgzipped, or some other processing error). Therefore, I have no choice but to prephase without the reference panel. 

In [None]:
# run eagle 2.4 (can only run on linux systems); took 39.2919 sec on amd-2382 machine
eagle \
    --vcf=target.chr21.typedOnly.masked.vcf.gz \
    --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz \
    --outPrefix=eagle.phased.chr21 \
    --numThreads=4

In [None]:
# convert ref file to m3vcf format
# (Total Run completed in 1 hours, 54 mins, 57 seconds)
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 \
    --refHaps ref.chr21.excludeTarget.vcf.gz \
    --processReference \
    --prefix ref.chr21.excludeTarget \
    --cpus 4

In [None]:
# run minimac4 (3 mins, 35 seconds)
#  Time Taken for Reading File             = 23 seconds
#  Time Taken for Re-compression           = 164 seconds
#  Time Taken for Imputation               = 23 seconds
#  Time Taken for Writing File             = 2 seconds
# The prefix must be "minimac4.imputed.chr21": the error-rate cell reads
# minimac4.imputed.chr21.dose.vcf.gz.
minimac4 --refHaps ref.chr21.excludeTarget.m3vcf.gz --haps eagle.phased.chr21.vcf.gz --prefix minimac4.imputed.chr21 --format GT --cpus 4

In [10]:
# minimac4 error rate: fraction of genotype entries that differ from the full target file
X_complete = convert_gt(Float32, "target.chr21.full.vcf.gz")
n, p = size(X_complete)
X_minimac = convert_gt(Float32, "minimac4.imputed.chr21.dose.vcf.gz")
let mismatches = sum(X_minimac .!= X_complete)
    println("error overall = $(mismatches / n / p) \n")
end

error overall = 0.009178930761675702 



# Eagle 2 + Minimac3
According to the Minimac developers, Minimac4 uses approximations that may not work well when the reference panel is too small (i.e. smaller than HRC). Thus, we also try Minimac3, invoked here through Minimac4's `--minimac3` flag.

In [None]:
# same inputs as the minimac4 run, with the --minimac3 flag added
minimac4 \
    --refHaps ref.chr21.excludeTarget.m3vcf.gz \
    --haps eagle.phased.chr21.vcf.gz \
    --prefix minimac3.imputed.chr21 \
    --format GT \
    --cpus 4 \
    --minimac3