# Test if MendelImpute can impute untyped SNPs

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling VCFTools [a620830f-fdd7-5ebc-8d26-3621ab35fbfe]
└ @ Base loading.jl:1273
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Generate subset of markers for phasing & imputation

In [None]:
# Work from the filtered-data directory: the function below uses relative paths
# (for example "../raw/..." for the raw VCF and bare file names for its outputs).
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
"""
    filter_and_mask()

Build the chromosome 22 test data set from the raw 1000 Genomes VCF file, writing
every output into the current working directory:

1. `chr22.uniqueSNPs.vcf.gz`: the raw file without the records flagged by
   `find_duplicate_marker`.
2. `target.chr22.vcf.gz`: a random subset of 250 samples and 100000 records.
3. `ref.chr22.excludeTarget.vcf.gz`: every unique record, target samples removed.
4. `target.chr22.masked.vcf.gz`: the target file, unphased, with about 1% of the
   genotype entries masked.
5. `chr22.conformgt.matched.ref.vcf.gz`: written by `conformgt_by_pos` from the
   unique-SNP file and the masked target.

Throws an error if the matched target file written by `conformgt_by_pos` does not
contain exactly the 100000 target records.

NOTE(review): later notebook cells read `target_chr22.vcf.gz` and
`target_chr22_masked.vcf.gz` (underscores), which are not the names written here.
Confirm which naming scheme is intended.
"""
function filter_and_mask()
    for chr in [22]
        # filter chromosome data for unique snps
        data = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        unique_file = "chr$chr.uniqueSNPs.vcf.gz"
        full_record_index = .!find_duplicate_marker(data)
        @time VCFTools.filter(data, full_record_index, 1:nsamples(data), des = unique_file)

        # generate phased target file with 250 samples and 100k snps
        n = 250
        p = 100000
        Random.seed!(2020)
        total_records = nrecords(unique_file)
        record_idx = falses(total_records)
        record_idx[1:p] .= true
        shuffle!(record_idx)
        sample_idx = falses(nsamples(unique_file))
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        @time VCFTools.filter(unique_file, record_idx, sample_idx, des = "target.chr$chr.vcf.gz")

        # also generate reference panel without target samples
        # `full_record_index` has one entry per record of the raw file, so it cannot
        # index the de-duplicated file; keep every record of the unique-SNP file instead.
        @time VCFTools.filter(unique_file, trues(total_records), .!sample_idx,
            des = "ref.chr$chr.excludeTarget.vcf.gz")

        # unphase and mask 1% entries in target file
        # The mask needs one column per target sample (n), matching the file written above.
        masks = falses(p, n)
        missingprop = 0.01
        Random.seed!(2020)
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        @time mask_gt("target.chr$chr.vcf.gz", masks, des="target.chr$(chr).masked.vcf.gz", unphase=true)

        # construct ref file that matches the target file
        @time conformgt_by_pos(unique_file, "target.chr$(chr).masked.vcf.gz",
            "chr$chr.conformgt.matched", "$chr", 1:typemax(Int))
        if nrecords("chr$chr.conformgt.matched.tgt.vcf.gz") == p
            rm("chr$chr.conformgt.matched.tgt.vcf.gz", force=true) # perfect match
        else
            error("target file has SNPs not matching in reference file! Shouldn't happen!")
        end
    end
end
# Run the data-preparation pipeline; it writes its VCF outputs into the current directory.
filter_and_mask()

## Phase subset of markers with dynamic programming (dp)

In [2]:
# Show how many threads this Julia session was started with.
Threads.nthreads()

4

In [4]:
# Chromosome number used to build the file names below.
chr = 22
# Masked target genotypes and the reference panel matched to them.
tgtfile = "target_chr$(chr)_masked.vcf.gz"
reffile = "chr$chr.conformgt.matched.ref.vcf.gz"
# Print (number of records, number of samples) for each file as a sanity check.
@show nrecords(tgtfile), nsamples(tgtfile)
@show nrecords(reffile), nsamples(reffile)

(nrecords(tgtfile), nsamples(tgtfile)) = (100000, 200)
(nrecords(reffile), nsamples(reffile)) = (100000, 2504)


(100000, 2504)

In [None]:
# using incomplete ref panel
# Work from the filtered-data directory so the bare file names used by `run` resolve.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
# Seed the global RNG with a fixed value before the run.
Random.seed!(2020)
"""
    run(chr::Integer = 22)

Phase and impute the masked target file of chromosome `chr` with window widths
500, 1000, 1500 and 2000. For each width, print the `@time` report of `phase`
and the fraction of genotype entries that differ from the unmasked target file.

All file names are relative to the current working directory:
- truth: `target_chr<chr>.vcf.gz`
- target: `target_chr<chr>_masked.vcf.gz`
- reference: `chr<chr>.conformgt.matched.ref.vcf.gz`
- output: `mendel_imputed_typedOnly_dp<width>.vcf.gz` (one file per width)

`chr` is an argument with a default so the function no longer depends on a global
`chr` being defined by another cell, and so the truth file always matches the
target file's chromosome.

NOTE(review): the enclosing cell is labelled "incomplete ref panel", yet the
reference file read here is the same one the "complete ref panel" cell reads.
Confirm which reference file this cell is meant to use.
"""
function run(chr::Integer = 22)
    # File names do not depend on the window width, so build them once.
    tgtfile = "target_chr$(chr)_masked.vcf.gz"
    reffile = "chr$chr.conformgt.matched.ref.vcf.gz"
    X_complete = convert_gt(Float32, "target_chr$(chr).vcf.gz")
    n, p = size(X_complete)
    for width in [500, 1000, 1500, 2000]
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        outfile = "mendel_imputed_typedOnly_dp$width.vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        # Error rate = mismatching entries / total entries.
        println("error = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
# Impute once per window width and print the timing and error rate of each run.
run()

In [8]:
# using complete ref panel can produce 0 error
# Work from the filtered-data directory so the bare file names used by `run` resolve.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
# Seed the global RNG with a fixed value before the run.
Random.seed!(2020)
"""
    run(chr::Integer = 22)

Phase and impute the masked target file of chromosome `chr` against the matched
reference panel with window widths 500, 1000, 1500 and 2000. For each width,
print the `@time` report of `phase` and the fraction of genotype entries that
differ from the unmasked target file.

All file names are relative to the current working directory:
- truth: `target_chr<chr>.vcf.gz`
- target: `target_chr<chr>_masked.vcf.gz`
- reference: `chr<chr>.conformgt.matched.ref.vcf.gz`
- output: `mendel_imputed_typedOnly_dp<width>.vcf.gz` (one file per width)

`chr` is an argument with a default so the function no longer depends on a global
`chr` being defined by another cell, and so the truth file always matches the
target file's chromosome.
"""
function run(chr::Integer = 22)
    # File names do not depend on the window width, so build them once.
    tgtfile = "target_chr$(chr)_masked.vcf.gz"
    reffile = "chr$chr.conformgt.matched.ref.vcf.gz"
    X_complete = convert_gt(Float32, "target_chr$(chr).vcf.gz")
    n, p = size(X_complete)
    for width in [500, 1000, 1500, 2000]
        println("Imputing typed SNPs only with dynamic programming, width = $width")
        outfile = "mendel_imputed_typedOnly_dp$width.vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=false)
        X_mendel = convert_gt(Float32, outfile)
        # Error rate = mismatching entries / total entries.
        println("error = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
end
# Impute once per window width and print the timing and error rate of each run.
run()

Imputing typed SNPs only with dynamic programming, width = 500
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:30[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:02:23[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:08[39m


205.627845 seconds (650.88 M allocations: 69.592 GiB, 3.70% gc time)
error = 5.55e-6 

Imputing typed SNPs only with dynamic programming, width = 1000
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:31[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:45[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:08[39m


161.765668 seconds (648.85 M allocations: 67.338 GiB, 5.38% gc time)
error = 5.0e-7 

Imputing typed SNPs only with dynamic programming, width = 1500
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:29[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:16[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:08[39m


128.432985 seconds (648.51 M allocations: 65.166 GiB, 5.61% gc time)
error = 0.0 

Imputing typed SNPs only with dynamic programming, width = 2000
Running chunk 1 / 1


[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:31[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:01:04[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:08[39m


118.403594 seconds (648.33 M allocations: 64.046 GiB, 6.27% gc time)
error = 0.0 

