# Test imputation on untyped SNPs chrom 19 with different sample sizes

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108


### Memory requirement

**Prephasing step:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ The redundant haplotype set for the imputation targets requires roughly
$people * windows * 1000 * 16$ bytes of RAM, where 1000 is the maximum number of haplotypes per window

## Generate subset of markers for prephasing

We simulate $n = 2, 20, ..., 20000$ samples from chromosome 19 of the 1000 Genomes Project phase 3 data. 

In [62]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask(; chr = 19)

Prepare the reference panel and the simulated target files for chromosome `chr`.

Steps, all relative to the current working directory:

1. Drop the records flagged by `find_duplicate_marker` from
   `../beagle_raw/chr\$chr.1kg.phase3.v5a.vcf.gz` and write the result to
   `chr\$chr.uniqueSNPs.vcf.gz`.
2. Mark every record whose minor allele frequency (as reported by `gtstats`)
   exceeds 0.1 as a "typed" SNP.
3. For each sample size `n` in `[2, 20, 200, 2000, 20000]`, simulate genotypes
   from the reference haplotypes and write three files:
   `target.chr\$chr.n\$n.full.vcf.gz` (all SNPs),
   `target.chr\$chr.n\$n.typedOnly.vcf.gz` (typed SNPs only) and
   `target.chr\$chr.n\$n.typedOnly.masked.vcf.gz` (typed SNPs with roughly 1% of
   the entries set to `missing`).

# Keywords
- `chr`: chromosome label interpolated into every file name. It defaults to 19,
  the chromosome used throughout this notebook. The function previously read an
  undefined global `chr`.

Returns `nothing`. Masking uses the global RNG, so seed it beforehand for
reproducible files.
"""
function filter_and_mask(; chr = 19)
    # filter chromosome data for unique snps
    data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
    reffile = "chr$chr.uniqueSNPs.vcf.gz"
    full_record_index = .!find_duplicate_marker(data)
    @time VCFTools.filter(data, full_record_index, 1:nsamples(data), des = reffile)

    # summarize data; only the record count and the per-record MAF are needed
    total_snps, _, _, _, _, maf_by_record, _ = gtstats(reffile)
    large_maf = findall(x -> x > 0.1, maf_by_record)

    # keep snps with maf>0.1 as typed SNPs
    record_idx = falses(total_snps)
    record_idx[large_maf] .= true

    # import haplotype reference panel
    H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile,
        trans=true, save_snp_info=true, msg = "Importing reference haplotype files...")

    # Marker annotations do not depend on the sample size, so build them once.
    # Only the first ID / first ALT allele of each record is written out.
    all_ids = [first(id) for id in H_ids]
    all_alt = [first(alt) for alt in H_alt]
    typed_chr = H_chr[record_idx]
    typed_pos = H_pos[record_idx]
    typed_ids = all_ids[record_idx]
    typed_ref = H_ref[record_idx]
    typed_alt = all_alt[record_idx]

    missingprop = 0.01
    for n in [2, 20, 200, 2000, 20000]

        # simulate full target genotype
        X = simulate_genotypes(H, n, T=UInt8, width=2000)
        make_tgtvcf_file(X, vcffilename="target.chr$chr.n$n.full.vcf.gz",
            marker_chrom=H_chr, marker_pos=H_pos, marker_ID=all_ids,
            marker_REF=H_ref, marker_ALT=all_alt)

        # write typed only X to file (indexing copies, so X itself is not masked below)
        X_aligned = X[record_idx, :]
        make_tgtvcf_file(X_aligned, vcffilename="target.chr$chr.n$n.typedOnly.vcf.gz",
            marker_chrom=typed_chr, marker_pos=typed_pos, marker_ID=typed_ids,
            marker_REF=typed_ref, marker_ALT=typed_alt)

        # mask entries in X typed only and write to file
        # NOTE(review): assigning `missing` requires the element type returned by
        # simulate_genotypes to admit Missing — confirm.
        p = size(X_aligned, 1)
        masks = falses(p, n)
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        X_aligned[masks] .= missing
        make_tgtvcf_file(X_aligned, vcffilename="target.chr$chr.n$n.typedOnly.masked.vcf.gz",
            marker_chrom=typed_chr, marker_pos=typed_pos, marker_ID=typed_ids,
            marker_REF=typed_ref, marker_ALT=typed_alt)
    end
    return nothing
end
Random.seed!(2020)
@time filter_and_mask()

718.677263 seconds (5.32 G allocations: 501.945 GiB, 9.58% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:54[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:10[39m


InterruptException: InterruptException:

### Missing rate

In typed markers, 1% of data is missing at random. In addition, 79% of all markers are not typed (i.e. systematically missing). 

In [7]:
# Fraction of reference-panel records that are absent from the typed-only
# target file, i.e. the share of markers that are untyped.
tgtfile = "target.chr19.n20.typedOnly.masked.vcf.gz"
reffile = "chr19.uniqueSNPs.vcf.gz"
missing_rate = 1 - nrecords(tgtfile) / nrecords(reffile)

0.7900461446394791

In [3]:
# Number of threads this Julia session was started with.
Threads.nthreads()

8

# MendelImpute on untyped markers with dp

In [2]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Sweep the window width at a fixed sample size (n = 200). Each pass imputes the
# masked typed-only target against the chr19 reference panel with `phase`, then
# prints the fraction of genotype entries that differ from the unmasked truth.
function run()
    n = 200  # original note: tested 250, 500, 1000, 2000 (the widths swept below)
    panel = "chr19.uniqueSNPs.vcf.gz"
    target = "target.chr19.n$n.typedOnly.masked.vcf.gz"
    truthfile = "target.chr19.n$n.full.vcf.gz"
    for width in (250, 500, 1000, 2000)
        println("Imputing n = $n, dynamic programming method, width = $width")
        imputed = "mendel.imputed.chr19.n$n.dp$width.vcf.gz"
        @time phase(target, panel; outfile=imputed, impute=true, width=width,
            fast_method=false)

        # compare the imputed genotypes with the complete simulated genotypes
        truth = convert_gt(UInt8, truthfile)
        guess = convert_gt(UInt8, imputed)
        nrows, ncols = size(guess)
        mismatches = sum(guess .!= truth)
        println("error = $(mismatches / nrows / ncols) \n")
    end
end
run()

Imputing n = 200, dynamic programming method, width = 250


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:08[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:56[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:11:09[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


965.391180 seconds (3.77 G allocations: 364.873 GiB, 9.21% gc time)
error = 0.00010005332017346074 

Imputing n = 200, dynamic programming method, width = 500


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:09[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:05:50[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:04:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


671.745223 seconds (3.69 G allocations: 352.282 GiB, 17.61% gc time)
error = 2.0448853758079565e-5 

Imputing n = 200, dynamic programming method, width = 1000


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:55[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:25[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


437.393690 seconds (3.69 G allocations: 338.966 GiB, 11.73% gc time)
error = 3.143195863715905e-5 

Imputing n = 200, dynamic programming method, width = 2000


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:53[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:16[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


365.795041 seconds (3.69 G allocations: 329.049 GiB, 13.81% gc time)
error = 4.389696727332544e-5 



### Fastest

In [2]:
# Author's note on the method used for this run: ad-hoc dp method, keep pairs
# within 3 of best pair, keep all pairs minimizing diff w/ observed error.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Sweep the sample size at a fixed window width of 2000. Each pass imputes the
# masked typed-only target with `phase` and prints the fraction of genotype
# entries that differ from the unmasked truth.
function run()
    width = 2000  # tested 100, 250, 500 (most accurate), 1000, 2000 (fastest)
    panel = "chr19.uniqueSNPs.vcf.gz"
    for n in (2, 20, 200, 2000, 20000)
        println("Imputing n = $n, dynamic programming method, width = $width")
        target = "target.chr19.n$n.typedOnly.masked.vcf.gz"
        imputed = "mendel.imputed.chr19.n$n.dp$width.vcf.gz"
        @time phase(target, panel; outfile=imputed, impute=true, width=width,
            fast_method=false)

        # compare the imputed genotypes with the complete simulated genotypes
        truth = convert_gt(UInt8, "target.chr19.n$n.full.vcf.gz")
        guess = convert_gt(UInt8, imputed)
        nrows, ncols = size(guess)
        mismatches = sum(guess .!= truth)
        println("error = $(mismatches / nrows / ncols) \n")
    end
end
run()

Imputing n = 2, dynamic programming method, width = 2000


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:01[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:19[39m


292.592744 seconds (3.60 G allocations: 319.014 GiB, 18.10% gc time)
error = 1.2762807477303474e-5 

Imputing n = 20, dynamic programming method, width = 2000


[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:59[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:23[39m


294.015803 seconds (3.58 G allocations: 318.872 GiB, 17.86% gc time)
error = 5.353288691868957e-5 

Imputing n = 200, dynamic programming method, width = 2000


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:58[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:23[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


380.193738 seconds (3.69 G allocations: 329.049 GiB, 14.12% gc time)
error = 4.389696727332544e-5 

Imputing n = 2000, dynamic programming method, width = 2000


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:01[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:53[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:12:18[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:26[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:03:12[39m


1332.674077 seconds (4.76 G allocations: 429.557 GiB, 4.72% gc time)
error = 4.4281978632224105e-5 

Imputing n = 20000, dynamic programming method, width = 2000


[32mImporting genotype file... 89%|██████████████████████▍  |  ETA: 0:01:44[39m

InterruptException: InterruptException:

### Most accurate

In [4]:
# Author's note on the method used for this run: ad-hoc dp method, keep pairs
# within 3 of best pair, keep all pairs minimizing diff w/ observed error.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Sweep the sample size at a fixed window width of 500. Each pass imputes the
# masked typed-only target with `phase` and prints the fraction of genotype
# entries that differ from the unmasked truth.
function run()
    width = 500  # tested 100, 250, 1000, 2000
    panel = "chr19.uniqueSNPs.vcf.gz"
    for n in (2, 20, 200, 2000, 20000)
        println("Imputing n = $n, dynamic programming method, width = $width")
        target = "target.chr19.n$n.typedOnly.masked.vcf.gz"
        imputed = "mendel.imputed.chr19.n$n.dp$width.vcf.gz"
        @time phase(target, panel; outfile=imputed, impute=true, width=width,
            fast_method=false)

        # compare the imputed genotypes with the complete simulated genotypes
        truth = convert_gt(UInt8, "target.chr19.n$n.full.vcf.gz")
        guess = convert_gt(UInt8, imputed)
        nrows, ncols = size(guess)
        mismatches = sum(guess .!= truth)
        println("error = $(mismatches / nrows / ncols) \n")
    end
end
run()

Imputing n = 2, dynamic programming method, width = 500


[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:55[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:36[39m


303.689603 seconds (3.60 G allocations: 342.132 GiB, 17.65% gc time)
error = 3.545224299250965e-6 

Imputing n = 20, dynamic programming method, width = 500


[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:58[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:53[39m


323.542015 seconds (3.58 G allocations: 342.004 GiB, 17.10% gc time)
error = 2.063320542164062e-5 

Imputing n = 200, dynamic programming method, width = 500


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:53[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:04:28[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:18[39m


556.035765 seconds (3.69 G allocations: 352.352 GiB, 9.86% gc time)
error = 2.0392130169291552e-5 

Imputing n = 2000, dynamic programming method, width = 500


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:01[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:48[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:52:44[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:05[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:03:11[39m


3738.561148 seconds (4.81 G allocations: 454.548 GiB, 2.05% gc time)
error = 2.1545746156267816e-5 

Imputing n = 20000, dynamic programming method, width = 500


[32mImporting genotype file...  4%|█▏                       |  ETA: 0:13:05[39m

InterruptException: InterruptException:

# Beagle 5.0

In [5]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle jar on each masked typed-only target (n = 2, 20, 200, 2000)
# against the chr19 reference panel, then print the fraction of genotype entries
# in Beagle's output that differ from the complete simulated genotypes.
# `Base.run` is spelled out because this notebook defines its own `run`.
function beagle()
    chr = 19
    panel = "chr19.uniqueSNPs.vcf.gz"
    for n in (2, 20, 200, 2000)
        println("Running beagle with n = $n samples")
        target = "target.chr19.n$n.typedOnly.masked.vcf.gz"
        prefix = "beagle.imputed.chr19.n$n"
        Base.run(`java -Xmx15g -jar beagle.28Sep18.793.jar gt=$target ref=$panel out=$prefix nthreads=8`)

        # beagle error rate
        truth = convert_gt(UInt8, "target.chr$chr.n$n.full.vcf.gz")
        guess = convert_gt(UInt8, "beagle.imputed.chr19.n$n.vcf.gz")
        nrows, ncols = size(truth)
        mismatches = sum(guess .!= truth)
        println("error overall = $(mismatches / nrows / ncols) \n")
    end
end
beagle()

Running beagle with n = 2 samples
beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 09:53 PM PDT on 03 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr19.n2.typedOnly.masked.vcf.gz
  ref=chr19.uniqueSNPs.vcf.gz
  out=beagle.imputed.chr19.n2
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,504
Study samples:               2

Window 1 (19:62935-40062750)
Reference markers:     456,604
Study markers:          93,065

Burnin  iteration 1:           1 second
Burnin  iteration 2:           2 seconds
Burnin  iteration 3:           2 seconds
Burnin  iteration 4:           1 second
Burnin  iteration 5:           2 seconds
Burnin  iteration 6:           1 second

Phasing iteration 1:           1 second
Phasing iteration 2:           1 second
Phasing iteration 3:           1 second
Phasing iteration 4:           

# Eagle 2 + Minimac4

In order to use the reference panel in Eagle 2's prephasing option, one must first convert it to `.bcf` format via e.g. `htslib`, which is *extremely* difficult to install. Even after we went through all the hard work to obtain the final `.bcf` reference file (see commands below), Eagle 2.4 still says the file is not acceptable (not bgzipped, or some processing error). Therefore, I had no choice but to prephase without the reference panel. 

In [None]:
# Prephase the masked typed-only target with Eagle 2.4 (no reference panel).
# Recorded run time: 3367.79 sec on amd-2382 machine (can only run on linux systems)
eagle \
    --vcf=target.chr20.typedOnly.masked.vcf.gz \
    --outPrefix=eagle.phased.chr20 \
    --numThreads=4 \
    --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# Convert the reference VCF to m3vcf format with Minimac3.
# Recorded run time: Total Run completed in 1 hours, 46 mins, 24 seconds
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 \
    --refHaps ref.chr20.excludeTarget.vcf.gz \
    --processReference \
    --prefix ref.chr20.excludeTarget

In [None]:
# run minimac4 (2619 seconds)
# --haps must name the file written by the Eagle step above, which uses
# --outPrefix=eagle.phased.chr20; the previous value eagle.phased.vcf.gz did
# not match that prefix.
# NOTE(review): assumes Eagle writes <outPrefix>.vcf.gz — confirm against the
# file actually produced.
minimac4 --refHaps ref.chr20.excludeTarget.m3vcf.gz --haps eagle.phased.chr20.vcf.gz --prefix minimac.imputed.chr20 --format GT --cpus 4

In [None]:
# minimac4 error rate
# Load the complete genotypes and the Minimac4 output as Float32 matrices, then
# print the fraction of entries that differ between the two.
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
X_minimac = convert_gt(Float32, "minimac.imputed.chr20.dose.vcf.gz")
n, p = size(X_complete)
println("error overall = $(sum(X_minimac .!= X_complete) / n / p) \n")