# Test if MendelImpute can impute untyped SNPs

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Generate subset of markers for prephasing

In [5]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Build the input files used by the imputation experiments, one set per chromosome:
#   chr$chr.uniqueSNPs.vcf.gz              raw data with duplicate markers removed
#   target.chr$chr.prephase.vcf.gz         n randomly chosen samples at p randomly chosen SNPs
#   target.chr$chr.full.vcf.gz             the same n samples at all SNPs
#   ref.chr$chr.excludeTarget.vcf.gz       all remaining samples at all SNPs
#   target.chr$chr.prephase.masked.vcf.gz  prephase target, unphased, each entry masked w.p. 1%
#   ref.chr$chr.aligned.vcf.gz             reference file matched to the target's SNPs
# Takes no arguments and returns nothing useful; all results are files written to the
# current directory. Raises an error if the aligned target does not contain exactly p records.
function filter_and_mask()
    for chr in [22]
        raw_file     = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        unique_file  = "chr$chr.uniqueSNPs.vcf.gz"
        prephase     = "target.chr$chr.prephase.vcf.gz"
        masked       = "target.chr$(chr).prephase.masked.vcf.gz"
        ref_excluded = "ref.chr$chr.excludeTarget.vcf.gz"
        aligned_tgt  = "chr$chr.aligned.tgt.vcf.gz"

        # filter chromosome data for unique snps
        full_record_index = .!find_duplicate_marker(raw_file)
        @time VCFTools.filter(raw_file, full_record_index, 1:nsamples(raw_file),
            des = unique_file)

        # generate target file with 250 samples and 100k snps
        n = 250
        p = 100000
        total_snps = nrecords(unique_file)
        Random.seed!(2020)
        # pick p records / n samples at random by shuffling a mask with that many `true`s
        record_idx = falses(total_snps)
        record_idx[1:p] .= true
        shuffle!(record_idx)
        sample_idx = falses(nsamples(unique_file))
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter(unique_file, record_idx, sample_idx,
            des = prephase)

        # generate target panel with all snps
        @time VCFTools.filter(unique_file,
            1:total_snps, sample_idx, des = "target.chr$chr.full.vcf.gz")

        # also generate reference panel without target samples
        @time VCFTools.filter(unique_file,
            1:total_snps, .!sample_idx, des = ref_excluded)

        # unphase and mask 1% entries in target file
        masks = falses(p, n)
        missingprop = 0.01
        Random.seed!(2020)
        # kept as an explicit loop so the sequence of rand() draws, and therefore the
        # mask obtained for this seed, is unchanged
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        @time mask_gt(prephase, masks,
            des = masked, unphase = true)

        # generate subset of reference file that matches target file
        @time conformgt_by_pos(ref_excluded, masked,
            "chr$chr.aligned", "$chr", 1:typemax(Int))
        if nrecords(aligned_tgt) == p
            rm(aligned_tgt, force=true) # perfect match
        else
            error("target file has SNPs not matching in reference file! Shouldn't happen!")
        end
        # rename using the loop variable (was hard-coded to chr22) so the step stays
        # correct when the chromosome list changes
        mv("chr$chr.aligned.ref.vcf.gz", "ref.chr$chr.aligned.vcf.gz")
    end
end
@time filter_and_mask()

547.645319 seconds (4.91 G allocations: 463.567 GiB, 6.91% gc time)
433.979604 seconds (4.98 G allocations: 468.237 GiB, 10.20% gc time)
463.505887 seconds (5.67 G allocations: 523.322 GiB, 10.87% gc time)
1022.502225 seconds (12.13 G allocations: 936.019 GiB, 12.45% gc time)
 13.274375 seconds (76.61 M allocations: 8.666 GiB, 6.78% gc time)


SystemError: SystemError: opening file "target.chr22.masked.vcf.gz": No such file or directory

In [7]:
# Stand-alone run of the same alignment call and record-count check that
# filter_and_mask performs, for chromosome 22.
chr = 22
p = 100000
@time conformgt_by_pos(
    "ref.chr$chr.excludeTarget.vcf.gz",
    "target.chr$(chr).prephase.masked.vcf.gz",
    "chr$chr.aligned",
    "$chr",
    1:typemax(Int))
# all p target records must be present in the aligned target; anything else is fatal
nrecords("chr$chr.aligned.tgt.vcf.gz") == p ||
    error("target file has SNPs not matching in reference file! Shouldn't happen!")
rm("chr$chr.aligned.tgt.vcf.gz", force=true) # perfect match

┌ Info: Match target POS to reference POS
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:172
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:12:23[39m


769.477247 seconds (7.23 G allocations: 672.427 GiB, 13.24% gc time)


┌ Info: 100000 records are matched
└ @ VCFTools /Users/biona001/.julia/dev/VCFTools/src/conformgt.jl:239


# MendelImpute on typed and untyped markers with dp

In [31]:
# show how many threads this Julia session was started with
Threads.nthreads()

8

In [11]:
# File names produced by filter_and_mask. Note that every variable below holds a
# file path (String), including the X_* ones; none of them is a genotype matrix.
tgtfile = "target.chr22.prephase.masked.vcf.gz"    # masked, unphased target
reffile = "ref.chr22.excludeTarget.vcf.gz"         # reference panel without target samples
reffile_aligned = "ref.chr22.aligned.vcf.gz"       # reference matched to the target's SNPs
X_prephase_complete = "target.chr22.prephase.vcf.gz" # target before masking
X_full_complete = "target.chr22.full.vcf.gz"       # target samples at all SNPs
# print (number of records, number of samples) for each file
@show nrecords(tgtfile), nsamples(tgtfile)
@show nrecords(reffile), nsamples(reffile)
@show nrecords(reffile_aligned), nsamples(reffile_aligned)
@show nrecords(X_prephase_complete), nsamples(X_prephase_complete)
@show nrecords(X_full_complete), nsamples(X_full_complete)

(nrecords(tgtfile), nsamples(tgtfile)) = (100000, 250)
(nrecords(reffile), nsamples(reffile)) = (644939, 2254)
(nrecords(reffile_aligned), nsamples(reffile_aligned)) = (100000, 2254)
(nrecords(X_prephase_complete), nsamples(X_prephase_complete)) = (100000, 250)
(nrecords(X_full_complete), nsamples(X_full_complete)) = (644939, 250)


(644939, 250)

### Error rate on typed SNPs

In [15]:
# using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` on the masked target with fast_method=false,
# then print the fraction of genotype entries in the output that differ from the
# unmasked target at the typed SNPs. Takes no arguments; results are printed and
# written to mendel.imputed.dp$width.vcf.gz.
function run()
    # truth: the target file before masking
    X_complete = convert_gt(Float32, "target.chr22.prephase.vcf.gz")
    # take the dimensions from the matrix just loaded; the global
    # X_prephase_complete is a file name (String), so size() of it throws
    n, p = size(X_complete)
    chr = 22
    for width in [1000, 2000]
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        tgtfile = "target.chr$chr.prephase.masked.vcf.gz"
        reffile = "ref.chr$chr.excludeTarget.vcf.gz"
        outfile = "mendel.imputed.dp$width.vcf.gz"
        reffile_aligned = "ref.chr$chr.aligned.vcf.gz"
        @time phase(tgtfile, reffile, reffile_aligned = reffile_aligned,
            outfile = outfile, width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        # mismatches divided by the total number of entries
        println("error on typed data = $(sum(X_mendel .!= X_complete) / n / p)")
    end
end
run()

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:28[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:25[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:05:59[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


497.316897 seconds (634.08 M allocations: 65.385 GiB, 1.54% gc time)
error on typed data = 0.00017911999999999998
Imputing typed SNPs only with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:45[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:12:44[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:10[39m


863.565807 seconds (633.44 M allocations: 62.893 GiB, 0.87% gc time)
error on typed data = 0.00027012


### Error rate on untyped + typed SNPs

In [84]:
# fast method using incomplete ref panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` with impute=true and fast_method=true on the
# masked target, then print the fraction of genotype entries in the output that
# differ from the target samples' genotypes at all SNPs (typed and untyped).
# Takes no arguments; results are printed and written to mendel.imputed.dp$width.vcf.gz.
function run()
    # truth: target samples at every SNP of the filtered data
    X_complete = convert_gt(Float32, "target.chr22.full.vcf.gz")
    n, p = size(X_complete)
    chr = 22
    for width in [1000, 2000]
        # message corrected: this cell imputes untyped SNPs too and uses the fast
        # method, not "typed SNPs only with dynamic programming"
        println("Imputing typed + untyped SNPs with fast method, width = $width")
        tgtfile = "target.chr$chr.prephase.masked.vcf.gz"
        reffile = "ref.chr$chr.excludeTarget.vcf.gz"
        outfile = "mendel.imputed.dp$width.vcf.gz"
        reffile_aligned = "ref.chr$chr.aligned.vcf.gz"
        @time phase(tgtfile, reffile, reffile_aligned = reffile_aligned, impute=true,
            outfile = outfile, width = width, fast_method=true)
        X_mendel = convert_gt(Float32, outfile)
        # mismatches divided by the total number of entries
        println("error overall = $(sum(X_mendel .!= X_complete) / n / p)")
    end
end
run()

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:32[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:49[39m
[32mMerging breakpoints...100%|█████████████████████████████| Time: 0:03:11[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:07:28[39m


1036.880082 seconds (8.67 G allocations: 803.597 GiB, 11.00% gc time)
error overall = 0.016043166873146143
Imputing typed SNPs only with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files... 48%|██████▎      |  ETA: 0:00:16[39m

InterruptException: InterruptException:

In [80]:
# dp error
# NOTE(review): `outfile` is read as a global here, but in the cells visible in this
# notebook it is only ever assigned as a local inside run() — confirm which imputed
# file this is meant to load.
X_complete = convert_gt(Float32, "target.chr22.full.vcf.gz")
X_mendel = convert_gt(Float32, outfile)
n, p = size(X_mendel)
# the comparison is against the full target panel, so this is the error over
# typed + untyped SNPs (label corrected from "error on typed data")
println("error overall = $(sum(X_mendel .!= X_complete) / n / p)")

error on typed data = 0.015489868034031126


# A complete reference panel gives a low error rate on typed SNPs

In [14]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

# For each window width, run `phase` on the masked target with fast_method=false,
# then print the fraction of genotype entries in the output that differ from the
# unmasked target at the typed SNPs. Takes no arguments; results are printed and
# written to mendel.imputed.dp$width.vcf.gz.
function run()
    # truth: the target file before masking
    X_complete = convert_gt(Float32, "target.chr22.prephase.vcf.gz")
    # take the dimensions from the matrix just loaded; the globals
    # X_prephase_complete / X_full_complete are file names (Strings), so size() of
    # them throws. The unused `nn, pp` lookup was dropped for the same reason.
    n, p = size(X_complete)
    chr = 22
    for width in [1000, 2000]
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        tgtfile = "target.chr$chr.prephase.masked.vcf.gz"
        reffile = "ref.chr$chr.excludeTarget.vcf.gz"
        outfile = "mendel.imputed.dp$width.vcf.gz"
        # NOTE(review): filter_and_mask renames this file to ref.chr$chr.aligned.vcf.gz;
        # confirm that the un-renamed name is really the file intended here.
        reffile_aligned = "chr$chr.aligned.ref.vcf.gz"
        @time phase(tgtfile, reffile, reffile_aligned = reffile_aligned,
            outfile = outfile, width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        # mismatches divided by the total number of entries
        println("error on prephase data = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
run()

Imputing typed SNPs only with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:57[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:00:06[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:10[39m


180.914743 seconds (685.21 M allocations: 72.913 GiB, 4.97% gc time)
error = 6.0e-6 

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:31[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:30[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


148.181139 seconds (684.21 M allocations: 70.699 GiB, 5.99% gc time)
error = 5.6e-7 

Imputing typed SNPs only with dynamic programming, width = 1500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:05[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


119.986834 seconds (683.78 M allocations: 68.668 GiB, 6.90% gc time)
error = 1.2e-7 

Imputing typed SNPs only with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:00:53[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


108.554973 seconds (683.55 M allocations: 67.577 GiB, 7.45% gc time)
error = 4.0e-8 

