# Test imputation on untyped SNPs chrom 18

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108


### Memory requirement

**Prephasing step:** 
+ Target data requires $people * snps * 4$ bytes of RAM
+ Reference haplotype data requires $haplotypes * snps$ bits of RAM
+ Redundant haplotype set for imputation target requires roughly
$people * windows * 1000 * 16$ bytes of RAM, where 1000 is the maximum number of haplotypes per window

## Generate subset of markers for prephasing

In [2]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Generate the VCF files used by the imputation runs in this notebook.
#
# For every chromosome in `chrs` this
#   1. removes duplicated markers from the raw VCF file,
#   2. marks `n` randomly chosen samples as the imputation target and writes
#      them, with all SNPs, to "target.chr<chr>.full.vcf.gz",
#   3. writes all other samples to "ref.chr<chr>.excludeTarget.vcf.gz",
#   4. for every threshold in `mafs`, writes a target file that keeps only the
#      SNPs whose `maf_by_record` entry (as reported by `gtstats`) exceeds the
#      threshold, then writes an unphased copy in which each entry is masked
#      independently with probability `missingprop`.
#
# Keyword arguments (the defaults reproduce the original hard-coded run):
#   chrs        - chromosomes to process
#   n           - number of target samples to draw
#   mafs        - minor allele frequency thresholds defining the typed SNPs
#   missingprop - probability that a typed entry is masked (0.001 = 0.1%)
#
# Random draws come from the global RNG, so seed it before calling to make the
# output reproducible. Nothing is returned; all results are written to disk.
function filter_and_mask(;
        chrs = [18],
        n = 100,
        mafs = [0.0005, 0.001, 0.01, 0.1, 0.45],
        missingprop = 0.001
    )
    for chr in chrs
        # filter chromosome data for unique snps
        data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
        unique_file = "chr$chr.uniqueSNPs.vcf.gz"
        full_record_index = .!find_duplicate_marker(data)
        @time VCFTools.filter(data, full_record_index, 1:nsamples(data),
            des = unique_file)

        # summarize data
        total_snps, samples, _, _, _, maf_by_record, _ = gtstats(unique_file)
        n <= samples || throw(ArgumentError(
            "cannot select n = $n target samples from a file with $samples samples"))

        # generate target panel with all snps: flag n samples, then shuffle the
        # flags so that the chosen samples are random
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter(unique_file,
            1:total_snps, sample_idx, des = "target.chr$chr.full.vcf.gz")

        # also generate reference panel without target samples
        @time VCFTools.filter(unique_file,
            1:total_snps, .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz")

        for maf in mafs
            typed_file = "target.chr$chr.typedOnly.maf$maf.vcf.gz"

            # generate target file with the n target samples and only the
            # typed snps, i.e. those whose maf exceeds the current threshold
            my_maf = findall(x -> x > maf, maf_by_record)
            p = length(my_maf)
            record_idx = falses(total_snps)
            record_idx[my_maf] .= true
            @time VCFTools.filter(unique_file, record_idx, sample_idx,
                des = typed_file)

            # unphase and mask a fraction `missingprop` (default 0.1%) of the
            # entries in the target file; the scalar rand() loop is kept as-is
            # so that a seeded run reproduces the same mask
            masks = falses(p, n)
            for j in 1:n, i in 1:p
                rand() < missingprop && (masks[i, j] = true)
            end
            @time mask_gt(typed_file, masks,
                des = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz", unphase = true)
        end
    end
end
Random.seed!(2020)
@time filter_and_mask()

757.914216 seconds (6.51 G allocations: 615.484 GiB, 5.77% gc time)


[32mProgress: 100%|█████████████████████████████████████████| Time: 0:11:01[39m


543.267281 seconds (6.94 G allocations: 658.873 GiB, 8.94% gc time)
1133.618067 seconds (16.89 G allocations: 1.264 TiB, 10.11% gc time)
537.259210 seconds (6.94 G allocations: 658.872 GiB, 8.92% gc time)
 33.649063 seconds (270.84 M allocations: 27.799 GiB, 6.49% gc time)
529.321183 seconds (6.91 G allocations: 656.539 GiB, 9.14% gc time)
 31.127583 seconds (253.23 M allocations: 25.974 GiB, 6.39% gc time)
508.066933 seconds (6.70 G allocations: 634.398 GiB, 9.00% gc time)
 16.370128 seconds (121.43 M allocations: 12.480 GiB, 5.66% gc time)
526.043739 seconds (6.59 G allocations: 622.839 GiB, 9.08% gc time)
  8.930132 seconds (54.36 M allocations: 5.582 GiB, 5.16% gc time)
556.634232 seconds (6.51 G allocations: 614.804 GiB, 9.66% gc time)
  0.952794 seconds (5.51 M allocations: 578.815 MiB, 6.34% gc time)
6361.681938 seconds (82.06 G allocations: 7.271 TiB, 8.92% gc time)


### Missing rate

Among the typed markers, 0.1% of the entries are masked at random. The proportion of untyped markers (the "missing rate") for each MAF threshold is:

In [4]:
# Compare the number of records in each masked target file with the number of
# records in the reference panel, and report the fraction that is absent from
# the target file.
reffile = "ref.chr18.excludeTarget.vcf.gz"
@show ref_records = nrecords(reffile)
for maf in [0.0005, 0.001, 0.01, 0.1, 0.45]
    typed_count = nrecords("target.chr18.typedOnly.maf$maf.masked.vcf.gz")
    # fraction of reference records that do not appear in the target file
    untyped_fraction = 1 - typed_count / ref_records
    println("there are ", typed_count, " snps w/ maf > ", maf,
        ". Missing rate = ", untyped_fraction)
end

ref_records = nrecords(reffile) = 863592
there are 863592 snps w/ maf > 0.0005. Missing rate = 0.0
there are 807477 snps w/ maf > 0.001. Missing rate = 0.06497860100603059
there are 387193 snps w/ maf > 0.01. Missing rate = 0.5516482320354983
there are 173336 snps w/ maf > 0.1. Missing rate = 0.7992848474742702
there are 17554 snps w/ maf > 0.45. Missing rate = 0.9796732716375325


In [2]:
# Show how many threads this Julia session was started with.
Threads.nthreads()

4

# MendelImpute on untyped markers with dp

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Phase and impute the chromosome 18 target file for maf = 0.0005 with
# MendelImpute (`fast_method=false`, window width 500), time the call, and
# print the fraction of genotype entries that differ from the complete
# target file.
function run()
    chr = 18
    width = 500
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"

    # genotypes of the target samples before any snp was removed or masked
    truth = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    nrow, ncol = size(truth)

    for maf in [0.0005] # missing rate = 0.0
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, impute = true,
            width = width, fast_method = false)

        # compare the imputed genotypes entry by entry with the truth
        imputed = convert_gt(UInt8, outfile)
        mismatch_rate = sum(imputed .!= truth) / nrow / ncol
        println("error overall = $mismatch_rate \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.0005


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:24[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:40[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:07:46[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:02:39[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


995.157271 seconds (4.88 G allocations: 452.982 GiB, 14.19% gc time)
error overall = 6.0329414816255824e-6 



In [2]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# Phase and impute the chromosome 18 target files for every maf threshold with
# MendelImpute (`fast_method=false`, window width 500). Each call is timed, and
# the fraction of genotype entries that differ from the complete target file
# is printed after each run.
function run()
    chr = 18
    width = 500
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"

    # genotypes of the target samples before any snp was removed or masked
    truth = convert_gt(UInt8, "target.chr18.full.vcf.gz")
    nrow, ncol = size(truth)

    # missing rate = 0.97, 0.79, 0.55, 0.06, 0.0 (in the order listed)
    thresholds = [0.45, 0.1, 0.01, 0.001, 0.0005]
    for maf in thresholds
        println("MendelImpute with dynamic programming, width = $width, maf = $maf")
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outfile = "mendel.imputed.dp$width.maf$maf.vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, impute = true,
            width = width, fast_method = false)

        # compare the imputed genotypes entry by entry with the truth
        imputed = convert_gt(UInt8, outfile)
        mismatch_rate = sum(imputed .!= truth) / nrow / ncol
        println("error overall = $mismatch_rate \n")
    end
end
run()

MendelImpute with dynamic programming, width = 500, maf = 0.45


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:33[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:23[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:00:07[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


346.711156 seconds (4.22 G allocations: 378.095 GiB, 16.36% gc time)
error overall = 0.0773094354741591 

MendelImpute with dynamic programming, width = 500, maf = 0.1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:37[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:03:00[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:09[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


579.281690 seconds (4.28 G allocations: 405.618 GiB, 11.25% gc time)
error overall = 0.005182725175777451 

MendelImpute with dynamic programming, width = 500, maf = 0.01


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:09[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:04:40[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:05:23[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:52[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


774.723315 seconds (4.41 G allocations: 422.091 GiB, 12.05% gc time)
error overall = 0.0015744819312823648 

MendelImpute with dynamic programming, width = 500, maf = 0.001


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:19[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:07:33[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:06:49[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:49[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:12[39m


1048.067625 seconds (4.78 G allocations: 446.409 GiB, 23.29% gc time)
error overall = 0.00010440115239603887 

MendelImpute with dynamic programming, width = 500, maf = 0.0005


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:48[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:07:41[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:06:35[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:01:47[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:11[39m


1070.168690 seconds (4.85 G allocations: 451.895 GiB, 23.41% gc time)
error overall = 6.0329414816255824e-6 



# Beagle 5.0

In [3]:
# beagle 5
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Run the Beagle jar once per maf threshold on the masked chromosome 18 target
# file, using the reference panel that excludes the target samples.
# `Base.run` is written out in full because this notebook defines its own
# zero-argument `run` function in other cells.
function beagle()
    chr = 18
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    for maf in [0.0005, 0.001, 0.01, 0.1, 0.45]
        tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
        outprefix = "beagle.imputed.maf$maf"
        # each interpolated value becomes part of a single command argument
        cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=$tgtfile ref=$reffile out=$outprefix nthreads=4`
        Base.run(cmd)
    end
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 10:09 AM PDT on 08 May 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=target.chr18.typedOnly.maf0.0005.masked.vcf.gz
  ref=ref.chr18.excludeTarget.vcf.gz
  out=beagle.imputed.maf0.0005
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (18:10644-40010629)
Reference markers:     414,911
Study markers:         414,911

Burnin  iteration 1:           48 seconds
Burnin  iteration 2:           1 minute 28 seconds
Burnin  iteration 3:           1 minute 42 seconds
Burnin  iteration 4:           1 minute 46 seconds
Burnin  iteration 5:           1 minute 53 seconds
Burnin  iteration 6:           2 minutes 5 seconds

Phasing iteration 1:           1 minute 59 seconds
Phasing iteration 2:           2 minutes 1 second
Phasing iterati

DimensionMismatch: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 678557 and 863592")

In [5]:
# Fraction of genotype entries in the Beagle output that differ from the
# complete chromosome 18 target file.
for maf in [0.0005]
    chr = 18
    truth = convert_gt(UInt8, "target.chr$chr.full.vcf.gz")
    imputed = convert_gt(UInt8, "beagle.imputed.maf$maf.vcf.gz")
    nrow, ncol = size(truth)
    mismatch_rate = sum(imputed .!= truth) / nrow / ncol
    println("error overall = $mismatch_rate \n")
end

error overall = 5.453964372064586e-6 



# Eagle 2 + Minimac4

In order to use the reference panel in Eagle 2's prephase option, one must first convert it to `.bcf` format via e.g. `htslib`, which is *extremely* difficult to install. Even after going through all the hard work of obtaining the final `.bcf` reference file (see commands below), Eagle 2.4 still reports that the file is not acceptable (not bgzipped, or some processing error). Therefore, I have no choice but to prephase without the reference panel.

In [None]:
# Prephase the masked target file with Eagle 2.4, using 4 threads and the hg19
# genetic map; no reference panel is passed to this command.
# Recorded run time: 3367.79 sec on an amd-2382 machine (Eagle can only run on linux systems).
# NOTE(review): the file names refer to chr20 while the rest of this notebook
# works on chr18 - confirm which chromosome this command is meant for.
eagle --vcf=target.chr20.typedOnly.masked.vcf.gz --outPrefix=eagle.phased.chr20 --numThreads=4 --geneticMapFile=../Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz

In [None]:
# Convert the reference panel (target samples excluded) to m3vcf format with
# Minimac3, writing the result under the prefix ref.chr20.excludeTarget.
# Recorded run time: 1 hour, 46 mins, 24 seconds.
# NOTE(review): the file names refer to chr20 while the rest of this notebook
# works on chr18 - confirm which chromosome this command is meant for.
/u/home/b/biona001/haplotype_comparisons/Minimac3/bin/Minimac3 --refHaps ref.chr20.excludeTarget.vcf.gz --processReference --prefix ref.chr20.excludeTarget

In [None]:
# Impute the Eagle-prephased target with minimac4 against the m3vcf reference
# panel, using 4 cpus and writing GT output under the prefix minimac.imputed.chr20.
# Recorded run time: 2619 seconds.
# --haps points at the file written by the eagle command in this notebook, whose
# --outPrefix is eagle.phased.chr20 (the original eagle.phased.vcf.gz did not
# match that prefix).
# NOTE(review): the file names refer to chr20 while the rest of this notebook
# works on chr18 - confirm which chromosome this command is meant for.
minimac4 --refHaps ref.chr20.excludeTarget.m3vcf.gz --haps eagle.phased.chr20.vcf.gz --prefix minimac.imputed.chr20 --format GT --cpus 4

In [None]:
# minimac4 error rate: fraction of genotype entries in the minimac4 output
# that differ from the complete target file
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
X_minimac = convert_gt(Float32, "minimac.imputed.chr20.dose.vcf.gz")
n, p = size(X_complete)
println("error overall = ", sum(X_minimac .!= X_complete) / n / p, " \n")