# Write VCF file

Currently, writing an imputed matrix to a `.vcf.gz` file is very slow compared to other parts of MendelImpute. This notebook contains simplified code that performs this routine. The write routine mimics the [writedlm](https://github.com/JuliaLang/julia/blob/3608c84e6093594fe86923339fc315231492484c/stdlib/DelimitedFiles/src/DelimitedFiles.jl#L736) function in Julia's DelimitedFiles standard library: records are accumulated in an in-memory buffer that is flushed to the output stream whenever it grows past a size threshold.

In [1]:
using VCFTools
using Random
using ProgressMeter

In [2]:
"""
    write(outfile, X)

Writes imputed `X` into `outfile`. All genotypes in `outfile` are non-missing and unphased. 

# Notes
Here the writing routine is emulating `write_dlm` in Base at 
https://github.com/JuliaLang/julia/blob/3608c84e6093594fe86923339fc315231492484c/stdlib/DelimitedFiles/src/DelimitedFiles.jl#L736
"""
function Base.write(
    outfile::AbstractString,
    X::AbstractMatrix,
    )
    # Writes each row of `X` (one SNP) as one line of unphased genotype calls
    # to `outfile`, which is opened with `openvcf`.
    #
    # Arguments:
    #   outfile - path of the file to create/overwrite
    #   X       - genotype matrix, rows are SNPs and columns are samples;
    #             every entry must equal 0, 1 or 2 (`write_snp!` errors otherwise)
    # Returns `nothing`.
    #
    # This simplified routine emits only three `##` meta lines followed by the
    # genotype fields of each record; no `#CHROM` header line and no
    # per-record fixed columns are written.
    io = openvcf(outfile, "w")
    pb = PipeBuffer()
    try
        # write minimal meta information to the buffer
        print(pb, "##fileformat=VCFv4.2\n")
        print(pb, "##source=MendelImpute\n")
        print(pb, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")

        pmeter = Progress(size(X, 1), 5, "Writing to file...")
        for i in axes(X, 1)
            # format the ith record into the in-memory buffer
            write_snp!(pb, @view(X[i, :]))

            # Flush to the output stream once more than 1 MiB is buffered.
            # The original author identified this write as the bottleneck.
            bytesavailable(pb) > 1024 * 1024 && write(io, take!(pb))
            next!(pmeter)
        end

        # flush whatever is left in the buffer
        write(io, take!(pb))
    finally
        # Always release the stream and buffer, including when `write_snp!`
        # throws on an invalid genotype, so the file handle is not leaked.
        close(io)
        close(pb)
    end
    return nothing
end

"""
Helper function for saving a record (SNP), not tracking phase information.
"""
function write_snp!(pb::IOBuffer, X::AbstractVector)
    # Appends one record to `pb`: for every entry of `X` a tab followed by the
    # unphased genotype string ("0/0", "1/0" or "1/1" for 0, 1 or 2), then a
    # terminating newline. Returns `nothing`.
    #
    # Throws an `ErrorException` on the first entry that is not 0, 1 or 2;
    # anything printed for earlier entries stays in `pb`.
    #
    # Elements are iterated directly rather than via `@inbounds` over
    # `1:length(X)`, so vectors with non-1-based indices are handled safely
    # and each element is read only once.
    for g in X
        if g == 0
            print(pb, "\t0/0")
        elseif g == 1
            print(pb, "\t1/0")
        elseif g == 2
            print(pb, "\t1/1")
        else
            error("imputed genotypes can only be 0, 1, 2 but got $g")
        end
    end
    print(pb, "\n")
    return nothing
end

write_snp!

In [3]:
function write_threaded(
    outfile::AbstractString,
    X::AbstractMatrix,
    )
    # Multithreaded variant of `write(outfile, X)`.
    #
    # The rows (SNPs) of `X` are split into `Threads.nthreads()` contiguous
    # chunks. Chunk `t` is written to its own temporary file `tmp<t>.vcf.gz`
    # in the current directory; the temporary files are then concatenated in
    # chunk order into `outfile` with `cat` and removed.
    #
    # Arguments:
    #   outfile - path of the final file
    #   X       - genotype matrix, rows are SNPs and columns are samples;
    #             every entry must equal 0, 1 or 2 (`write_snp!` errors otherwise)
    # Returns `nothing`.
    threads = Threads.nthreads()
    snps = size(X, 1)
    len = div(snps, threads)
    files = ["tmp$i.vcf.gz" for i in 1:threads]

    try
        io = [openvcf(files[i], "w") for i in 1:threads]
        pb = [PipeBuffer() for _ in 1:threads]
        try
            # minimal meta information goes at the start of the first chunk
            print(pb[1], "##fileformat=VCFv4.2\n")
            print(pb[1], "##source=MendelImpute\n")
            print(pb[1], "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
            pmeter = Progress(snps, 5, "Writing to file...")

            # Chunk `t` covers `len` SNPs; the last chunk also takes the
            # remainder. Everything is indexed by the loop variable `t`, not by
            # `Threads.threadid()`: the thread that runs an iteration need not
            # have id `t`, and selecting buffers or ranges by thread id can
            # drop, duplicate or misorder chunks.
            Threads.@threads for t in 1:threads
                first_snp = (t - 1) * len + 1
                last_snp = t == threads ? snps : t * len
                buf, out = pb[t], io[t]
                for i in first_snp:last_snp
                    write_snp!(buf, @view(X[i, :]))
                    # flush once more than 1 MiB is buffered
                    bytesavailable(buf) > 1024 * 1024 && write(out, take!(buf))
                    next!(pmeter)
                end
                write(out, take!(buf))
            end
        finally
            # Streams must be closed before concatenation so each temporary
            # file is complete; also runs when a worker throws.
            foreach(close, io)
            foreach(close, pb)
        end

        # concatenate all files into 1 VCF file
        run(pipeline(`cat $files`, stdout=outfile))
    finally
        # delete intermediate files, even if writing or concatenation failed
        foreach(f -> rm(f, force=true), files)
    end

    return nothing
end
            

write_threaded (generic function with 1 method)

In [4]:
# Simulate a genotype matrix used to benchmark the writers below.
Random.seed!(2020) # fix the seed so the simulated matrix is reproducible
p = 100_000 # number of SNPs (in practice this can go up to 2 million)
n = 1000    # number of samples
# p x n matrix with entries drawn from 0:2 (rows are SNPs, columns are samples)
X = convert(Matrix{UInt8}, rand(0:2, p, n)); # MendelImpute uses UInt8 unless given dosage data

In [6]:
# single threaded write
# NOTE(review): if this is the first call in the session, the timing also
# includes compilation - confirm by running the cell twice.
@time write("test.vcf.gz", X)

[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:23[39m


 24.400715 seconds (3.48 M allocations: 560.516 MiB, 0.32% gc time)


In [8]:
# 8 threaded write
# The number of chunks/threads is taken from `Threads.nthreads()` inside
# `write_threaded`, so Julia must have been started with 8 threads for this run.
@time write_threaded("test.vcf.gz", X)

  3.959356 seconds (6.33 k allocations: 406.323 MiB, 5.77% gc time)


In [6]:
# 16 threaded write
# The number of chunks/threads is taken from `Threads.nthreads()` inside
# `write_threaded`, so Julia must have been started with 16 threads for this run.
@time write_threaded("test.vcf.gz", X)

  2.508199 seconds (23.03 k allocations: 431.113 MiB)


## Profile code

In [4]:
using ProfileView
# Small 10 x 10 genotype matrix with entries drawn from 0:2.
Y = convert(Matrix{UInt8}, rand(0:2, 10, 10))
# NOTE(review): presumably a warm-up run so that compilation of `write` does
# not dominate the profile of the full-size run below - confirm.
@profview write("test.vcf.gz", Y) 
# Profile the single-threaded writer on the full simulated matrix `X`.
@profview write("test.vcf.gz", X) 

[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:26[39m


Gtk.GtkWindowLeaf(name="", parent, width-request=-1, height-request=-1, visible=TRUE, sensitive=TRUE, app-paintable=FALSE, can-focus=FALSE, has-focus=FALSE, is-focus=FALSE, focus-on-click=TRUE, can-default=FALSE, has-default=FALSE, receives-default=FALSE, composite-child=FALSE, style, events=0, no-show-all=FALSE, has-tooltip=FALSE, tooltip-markup=NULL, tooltip-text=NULL, window, opacity=1.000000, double-buffered, halign=GTK_ALIGN_FILL, valign=GTK_ALIGN_FILL, margin-left, margin-right, margin-start=0, margin-end=0, margin-top=0, margin-bottom=0, margin=0, hexpand=FALSE, vexpand=FALSE, hexpand-set=FALSE, vexpand-set=FALSE, expand=FALSE, scale-factor=2, border-width=0, resize-mode, child, type=GTK_WINDOW_TOPLEVEL, title="Profile", role=NULL, resizable=TRUE, modal=FALSE, window-position=GTK_WIN_POS_NONE, default-width=800, default-height=600, destroy-with-parent=FALSE, hide-titlebar-when-maximized=FALSE, icon, icon-name=NULL, screen, type-hint=GDK_WINDOW_TYPE_HINT_NORMAL, skip-taskbar-hint