# Test if MendelImpute can impute untyped SNPs

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Generate subset of markers for prephasing

In [9]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
"""
    filter_and_mask()

Build the target and reference VCF files used by the imputation experiment.

For each chromosome in the hard-coded list (currently only 22), in order:

1. drop the records flagged by `find_duplicate_marker` from the raw VCF file,
2. write a target file holding a seeded random subset of records and samples,
3. write a reference file with the same records and all remaining samples,
4. mask each target entry with probability 0.01 (seeded) and unphase the target,
5. run `conformgt_by_pos` on the reference and the masked target, then verify that
   the aligned target file still contains every selected record.

All file names are relative to the current working directory. An error is raised
when the aligned target file does not have the expected number of records.
"""
function filter_and_mask()
    for chr in [22]
        # every file this iteration reads or writes
        raw_vcf = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        unique_vcf = "chr$chr.uniqueSNPs.vcf.gz"
        target_vcf = "target.chr$chr.vcf.gz"
        masked_vcf = "target.chr$(chr).masked.vcf.gz"
        ref_vcf = "ref.chr$chr.excludeTarget.vcf.gz"
        aligned_prefix = "chr$chr.aligned"
        aligned_tgt_vcf = "chr$chr.aligned.tgt.vcf.gz"

        # keep only the records that are not flagged as duplicated markers
        keep_record = .!find_duplicate_marker(raw_vcf)
        @time VCFTools.filter(raw_vcf, keep_record, 1:nsamples(raw_vcf),
            des = unique_vcf)

        # pick 100k records and 250 samples at random (seeded) for the target file
        num_samples = 250
        num_snps = 100000
        Random.seed!(2020)
        snp_pick = falses(nrecords(unique_vcf))
        snp_pick[1:num_snps] .= true
        shuffle!(snp_pick)
        sample_pick = falses(nsamples(unique_vcf))
        sample_pick[1:num_samples] .= true
        shuffle!(sample_pick)
        @time VCFTools.filter(unique_vcf, snp_pick, sample_pick, des = target_vcf)

        # the reference panel has the same records but only the non-target samples
        @time VCFTools.filter(unique_vcf, snp_pick, .!sample_pick, des = ref_vcf)

        # draw one uniform number per entry, sample by sample, to decide what to mask
        mask_rate = 0.01
        masks = falses(num_snps, num_samples)
        Random.seed!(2020)
        for col in 1:num_samples
            for row in 1:num_snps
                if rand() < mask_rate
                    masks[row, col] = true
                end
            end
        end
        @time mask_gt(target_vcf, masks, des=masked_vcf, unphase=true)

        # generate the subset of the reference file that matches the target file
        @time conformgt_by_pos(ref_vcf, masked_vcf, aligned_prefix, string(chr),
            1:typemax(Int))

        # the aligned target copy is only needed for this record-count check
        nrecords(aligned_tgt_vcf) == num_snps ||
            error("target file has SNPs not matching in reference file! Shouldn't happen!")
        rm(aligned_tgt_vcf, force=true) # perfect match
    end
    return nothing
end
@time filter_and_mask()

518.674055 seconds (4.91 G allocations: 463.590 GiB, 6.91% gc time)
408.077262 seconds (4.98 G allocations: 468.249 GiB, 9.76% gc time)
468.039349 seconds (5.98 G allocations: 532.367 GiB, 10.40% gc time)
  9.026809 seconds (76.61 M allocations: 8.666 GiB, 7.27% gc time)


┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:03:39[39m


222.652131 seconds (3.53 G allocations: 319.207 GiB, 13.04% gc time)


┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239


2033.064735 seconds (24.41 G allocations: 2.204 TiB, 9.36% gc time)


In [10]:
nrecords("ref.chr$chr.excludeTarget.vcf.gz"), nrecords("target.chr$(chr).masked.vcf.gz")

(637756, 100000)

In [16]:
# Sanity check on the masked target file: count how often a record trips one of the
# conditions below (no "GT" key, chromosome other than "22", or a position outside
# 1:typemax(Int)). A single record can be counted more than once.
tgtfile = "target.chr$(chr).masked.vcf.gz"
reader_tgt = VCF.Reader(openvcf(tgtfile, "r"))
how_many_missed = 0
for record in reader_tgt
    # `=== nothing` is an identity test and cannot be affected by `==` overloads
    VCF.findgenokey(record, "GT") === nothing && (how_many_missed += 1)
    VCF.chrom(record) == "22" || (how_many_missed += 1)
    pos_tgt = VCF.pos(record)
    pos_tgt > last(1:typemax(Int)) && (how_many_missed += 1)
    pos_tgt < first(1:typemax(Int)) && (how_many_missed += 1)
end
close(reader_tgt) # release the file handle once the scan is done
how_many_missed

0

In [4]:
"""
    get_tgtmarkers(chr = 22)

Return a `Vector{Int}` holding the position (`VCF.pos`) of every record in
`target.chr<chr>.masked.vcf.gz`, in file order.

# Arguments
- `chr`: chromosome label interpolated into the file name; defaults to 22.

The file is looked up relative to the current working directory. The reader is
closed even when reading a record throws.
"""
function get_tgtmarkers(chr = 22)
    tgtfile = "target.chr$(chr).masked.vcf.gz"
    # one slot per record; `nrecords` is called before the reader is opened
    tgt_marker_pos = zeros(Int, nrecords(tgtfile))
    reader_tgt = VCF.Reader(openvcf(tgtfile, "r"))
    try
        for (i, record) in enumerate(reader_tgt)
            tgt_marker_pos[i] = VCF.pos(record)
        end
    finally
        # release the file handle even if iteration fails part-way
        close(reader_tgt)
    end
    return tgt_marker_pos
end
@time tgt_marker_pos = get_tgtmarkers()

  7.105112 seconds (78.77 M allocations: 8.777 GiB, 8.02% gc time)


100000-element Array{Int64,1}:
 16050627
 16050954
 16051477
 16051816
 16052097
 16052167
 16052394
 16052957
 16053444
 16053782
 16053814
 16053843
 16054070
        ⋮
 51234199
 51234894
 51236013
 51236137
 51237063
 51237071
 51237364
 51237712
 51238349
 51239651
 51239794
 51241101

In [7]:
"""
    get_refmarkers(chr = 22)

Return a `Vector{Int}` holding the position (`VCF.pos`) of every record in
`ref.chr<chr>.excludeTarget.vcf.gz`, in file order.

# Arguments
- `chr`: chromosome label interpolated into the file name; defaults to 22.

The file is looked up relative to the current working directory. The reader is
closed even when reading a record throws.
"""
function get_refmarkers(chr = 22)
    reffile = "ref.chr$chr.excludeTarget.vcf.gz"
    # one slot per record; `nrecords` is called before the reader is opened
    ref_marker_pos = zeros(Int, nrecords(reffile))
    reader_ref = VCF.Reader(openvcf(reffile, "r"))
    try
        for (i, record) in enumerate(reader_ref)
            ref_marker_pos[i] = VCF.pos(record)
        end
    finally
        # release the file handle even if iteration fails part-way
        close(reader_ref)
    end
    return ref_marker_pos
end
@time ref_marker_pos = get_refmarkers()

367.495032 seconds (4.33 G allocations: 413.289 GiB, 9.63% gc time)


637756-element Array{Int64,1}:
 16050115
 16050213
 16050568
 16050607
 16050627
 16050654
 16050840
 16050847
 16050922
 16050954
 16050984
 16050994
 16051075
        ⋮
 51239651
 51239652
 51239678
 51239794
 51240084
 51240820
 51241101
 51241102
 51241285
 51241386
 51244163
 51244237

In [8]:
# Count the target marker positions that do not occur among the reference positions.
# The reference positions are put into a Set once, so each membership test is a hash
# lookup instead of a linear scan over the whole reference vector.
counter = let ref_positions = Set(ref_marker_pos)
    count(pos -> !(pos in ref_positions), tgt_marker_pos)
end
counter

1089

In [8]:
nrecords("chr$chr.aligned.tgt.vcf.gz")

98911

In [9]:
nrecords("chr$chr.aligned.ref.vcf.gz")

98911

In [None]:
"ref.chr$chr.excludeTarget.vcf.gz"

# Using the complete reference panel, phase the subset of markers with dynamic programming (dp)

In [3]:
Threads.nthreads()

1

In [11]:
# Chromosome label; several cells in this notebook interpolate this global into
# their file names.
chr = 22
# Masked target file (the name `filter_and_mask` passes as `des` to `mask_gt`).
tgtfile = "target.chr$chr.masked.vcf.gz"
# Reference panel without the target samples (written by `filter_and_mask`).
reffile = "ref.chr$chr.excludeTarget.vcf.gz"
# NOTE(review): presumably the reference output of `conformgt_by_pos` for the
# prefix "chr$chr.aligned" -- confirm against VCFTools.
reffile_aligned = "chr$chr.aligned.ref.vcf.gz"
# Print (number of records, number of samples) for each of the three files.
@show nrecords(tgtfile), nsamples(tgtfile)
@show nrecords(reffile), nsamples(reffile)
@show nrecords(reffile_aligned), nsamples(reffile_aligned)

(nrecords(tgtfile), nsamples(tgtfile)) = (100000, 250)
(nrecords(reffile), nsamples(reffile)) = (100000, 2254)
(nrecords(reffile_aligned), nsamples(reffile_aligned)) = (100000, 2254)


(100000, 2254)

In [14]:
# using complete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)
"""
    run()

Phase and impute the masked chromosome 22 target file once per window width
(500, 1000, 1500, 2000) by calling `phase` with `fast_method=false`, and print the
error rate of each run.

The error rate is the fraction of entries in the imputed genotype matrix that differ
from the matrix read from `target.chr22.vcf.gz`. Each run writes its result to
`mendel.imputed.typedOnly.dp<width>.vcf.gz` in the current working directory.
Returns `nothing`.
"""
function run()
    # genotypes of the target file before masking, used as the ground truth
    X_complete = convert_gt(Float32, "target.chr22.vcf.gz")
    n, p = size(X_complete)

    # file names do not depend on the window width, so build them once
    chr = 22
    masked_target = "target.chr$chr.masked.vcf.gz"
    matched_ref = "chr$chr.conformgt.matched.ref.vcf.gz"

    for w in (500, 1000, 1500, 2000)
        println("Imputing typed SNPs only with dynamic programming, width = $w")
        imputed_file = "mendel.imputed.typedOnly.dp$(w).vcf.gz"
        # the same matched reference file serves as both panel and aligned panel
        @time phase(masked_target, matched_ref, reffile_aligned = matched_ref,
            outfile = imputed_file, width = w, fast_method=false)

        # compare the imputed genotypes entry by entry against the ground truth
        X_mendel = convert_gt(Float32, imputed_file)
        mismatches = sum(X_mendel .!= X_complete)
        println("error = $(mismatches / n / p) \n")
    end
    return nothing
end
run()

Imputing typed SNPs only with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:57[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:00:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:10[39m


180.914743 seconds (685.21 M allocations: 72.913 GiB, 4.97% gc time)
error = 6.0e-6 

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:31[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:30[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


148.181139 seconds (684.21 M allocations: 70.699 GiB, 5.99% gc time)
error = 5.6e-7 

Imputing typed SNPs only with dynamic programming, width = 1500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:05[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


119.986834 seconds (683.78 M allocations: 68.668 GiB, 6.90% gc time)
error = 1.2e-7 

Imputing typed SNPs only with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:53[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


108.554973 seconds (683.55 M allocations: 67.577 GiB, 7.45% gc time)
error = 4.0e-8 

