# Test JLD2 compression

In [29]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays
using JLD2, FileIO
using CodecZlib
using BSON
using HDF5
using JLSO
using BenchmarkTools
# using Plots
# using ProfileView

## Check reading speed and file size for simulated data

In [7]:
# Time `save_jld2` on the simulated reference VCF; the returned object is kept in `hapset`.
reffile, outfile = "haplo_ref.vcf.gz", "haplo_ref.jld2"
@time hapset = save_jld2(reffile, outfile; column_major=true);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:07[39m


 10.659887 seconds (78.87 M allocations: 5.672 GiB, 8.19% gc time)


In [37]:
;gzip -kf haplo_ref.jld2 # writes haplo_ref.jld2.gz; -k keeps the .jld2 that later cells read, -f replaces a stale .gz


In [10]:
;ls -al haplo_ref.vcf.gz # original size of ref file (bytes)

-rw-r--r--@ 1 biona001  staff  5449864 Apr  5 19:59 haplo_ref.vcf.gz


In [11]:
;ls -al haplo_ref.jld2 # size of .jld2 file (bytes)

-rw-r--r--  1 biona001  staff  20318574 May 27 22:50 haplo_ref.jld2


In [35]:
;ls -al haplo_ref.jld2.gz # size of .jld2.gz file (bytes)

-rw-r--r--@ 1 biona001  staff  6754174 May 27 22:50 haplo_ref.jld2.gz


In [33]:
20318574 / 5449864 # size ratio .jld2 / .vcf.gz (bytes from the ls output above): the .jld2 file is larger

3.7282717513684744

In [36]:
6754174 / 5449864 # size ratio .jld2.gz / .vcf.gz: even after gzip the .jld2 file is larger

1.2393289080241268

In [19]:
# Reading speed: parsing the compressed VCF vs. loading the saved .jld2 file
@time H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
@time @load "haplo_ref.jld2" hapset;

[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:06[39m


  7.085228 seconds (72.34 M allocations: 5.389 GiB, 9.18% gc time)
  0.065357 seconds (585.73 k allocations: 49.473 MiB, 16.52% gc time)


In [32]:
7.085228 / 0.065357 # speed-up: seconds to parse the VCF / seconds to @load the .jld2 (timings printed above)

108.40809706687884

## Check reading speed and file size for chr22 data from the 1000 Genomes Project

In [20]:
# Time `save_jld2` on the chr22 reference VCF; the returned object is kept in `hapset`.
reffile, outfile = "chr22.uniqueSNPs.vcf.gz", "chr22.uniqueSNPs.jld2"
@time hapset = save_jld2(reffile, outfile; column_major=true);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:27[39m


233.106226 seconds (3.27 G allocations: 242.340 GiB, 18.42% gc time)


In [38]:
;gzip -kf chr22.uniqueSNPs.jld2 # writes chr22.uniqueSNPs.jld2.gz; -k keeps the .jld2 that later cells read, -f replaces a stale .gz


In [27]:
;ls -al chr22.uniqueSNPs.vcf.gz # original size of ref file (bytes)

-rw-r--r--  1 biona001  staff  142889955 Apr 20 11:38 chr22.uniqueSNPs.vcf.gz


In [28]:
;ls -al chr22.uniqueSNPs.jld2 # size of .jld2 file (bytes)

-rw-r--r--  1 biona001  staff  615840218 May 27 23:06 chr22.uniqueSNPs.jld2


In [39]:
;ls -al chr22.uniqueSNPs.jld2.gz # size of .jld2.gz file (bytes)

-rw-r--r--  1 biona001  staff  187644621 May 27 23:06 chr22.uniqueSNPs.jld2.gz


In [30]:
615840218 / 142889955 # size ratio .jld2 / .vcf.gz (bytes from the ls output above): the .jld2 file is larger

4.309891608545891

In [40]:
187644621 / 142889955 # size ratio .jld2.gz / .vcf.gz: even after gzip the .jld2 file is larger

1.3132107221952727

In [26]:
# Reading speed: parsing the compressed VCF vs. loading the saved .jld2 file
@time H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
@time @load "chr22.uniqueSNPs.jld2" hapset;

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:42[39m


242.880364 seconds (3.24 G allocations: 241.675 GiB, 17.22% gc time)
  1.496514 seconds (10.41 M allocations: 1.045 GiB, 23.09% gc time)


In [31]:
242.880364 / 1.496514 # speed-up: seconds to parse the VCF / seconds to @load the .jld2 (timings printed above)

162.29742187510442

## .gz compressed jld2

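Not sure how to make this work: `jldopen` expects a file path, so it cannot read the decompressed bytes (or a decompression stream) directly.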

In [20]:
# `jldopen` only has methods taking a file path (optionally preceded by a function),
# not in-memory bytes or an arbitrary IO. The gzip archive is therefore decompressed
# to a temporary .jld2 file, and that file is opened by path instead.
reffile = "chr22.uniqueSNPs.jld2.gz"
io = GzipDecompressorStream(open(reffile, "r"))
x = read(io)   # decompressed file contents as a Vector{UInt8}
close(io)      # release the stream (and the file handle it wraps)

tmpfile = tempname() * ".jld2"
hapset = try
    write(tmpfile, x)
    # "hapset" is the name the object is stored under (cf. `@load ... hapset` earlier)
    jldopen(file -> file["hapset"], tmpfile, "r")
finally
    # always remove the decompressed copy, even if opening or reading fails
    rm(tmpfile; force=true)
end;

In [22]:
# Is the gzip decompression stream a plain IOStream?
io isa IOStream

false

## Try BSON format

In [10]:
# Parse the chr22 reference VCF and bundle the pieces returned by `convert_ht`
# into a RefHaplotypes object.
reffile = "chr22.uniqueSNPs.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
hapset = MendelImpute.RefHaplotypes(
    H, true, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt
);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:36[39m


LoadError: syntax: incomplete: premature end of input

In [25]:
# Time writing `hapset` to a BSON file under the key `hapset`.
@time bson("chr22.uniqueSNPs.bson"; hapset=hapset)

 18.113662 seconds (48.01 M allocations: 8.182 GiB, 25.31% gc time)


In [38]:
# Time reading `hapset` back from the BSON file written above.
@time BSON.@load "chr22.uniqueSNPs.bson" hapset

  6.822258 seconds (15.49 M allocations: 1.755 GiB, 40.00% gc time)


In [31]:
;ls -al chr22.uniqueSNPs.vcf.gz # original size of ref file (bytes)

-rw-r--r--  1 biona001  staff  142889955 Apr 20 11:38 chr22.uniqueSNPs.vcf.gz


In [30]:
;ls -al chr22.uniqueSNPs.bson # size of .bson file (bytes); the BSON file is also large

-rw-r--r--  1 biona001  staff  468468844 May 28 13:35 chr22.uniqueSNPs.bson


In [32]:
468468844 / 142889955 # size ratio .bson / .vcf.gz (bytes from the ls output above)

3.2785288790944054

In [33]:
;gzip -kf chr22.uniqueSNPs.bson # writes chr22.uniqueSNPs.bson.gz; -k keeps the original .bson, -f replaces a stale .gz


In [34]:
;ls -al chr22.uniqueSNPs.bson.gz # size of gzipped .bson file (bytes)

-rw-r--r--  1 biona001  staff  169768322 May 28 13:35 chr22.uniqueSNPs.bson.gz


In [36]:
468468844 / 169768322 # size ratio .bson / .bson.gz, i.e. how much gzip shrinks the BSON file

2.7594597065051985

In [35]:
pwd() # working directory; the relative file names used in this notebook resolve against it

"/Users/biona001/.julia/dev/MendelImpute/simulation"

In [39]:
# NOTE(review): `bsonopen` is not defined in this session (see the UndefVarError below).
# `x` is presumably the byte vector read from a .gz file in an earlier cell — confirm
# what input was intended before replacing this call.
hapset = bsonopen(x, "r")

UndefVarError: UndefVarError: bsonopen not defined

## Try HDF5

In [41]:
# Parse the chr22 reference VCF and bundle the pieces returned by `convert_ht`
# into a RefHaplotypes object (left without a trailing `;` so the result is displayed).
reffile = "chr22.uniqueSNPs.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
hapset = MendelImpute.RefHaplotypes(
    H, true, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt
)

MendelImpute.RefHaplotypes(Bool[0 0 … 0 0; 0 0 … 0 0; … ; 0 0 … 0 0; 0 0 … 0 0], true, ["HG00096", "HG00097", "HG00099", "HG00100", "HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"  …  "NA21128", "NA21129", "NA21130", "NA21133", "NA21135", "NA21137", "NA21141", "NA21142", "NA21143", "NA21144"], ["22", "22", "22", "22", "22", "22", "22", "22", "22", "22"  …  "22", "22", "22", "22", "22", "22", "22", "22", "22", "22"], [16050115, 16050213, 16050568, 16050607, 16050627, 16050654, 16050678, 16050739, 16050783, 16050840  …  51239678, 51239794, 51240084, 51240820, 51241101, 51241102, 51241285, 51241386, 51244163, 51244237], Array{String,1}[["22:16050115:G:A"], ["22:16050213:C:T"], ["22:16050568:C:A"], ["22:16050607:G:A"], ["22:16050627:G:T"], ["DUP_gs_CNV_22_16050654_16063474:16050654:A:<CN0>"], ["rs139377059:16050678:C:T"], ["22:16050739:TA:T"], ["22:16050783:A:G"], ["22:16050840:C:G"]  …  ["22:51239678:G:T"], ["22:51239794:C:A"], ["22:51240084:G:C"], ["rs202228854:51240820:

In [44]:
?h5write

search: h5write h5writeattr h5rewrite



No documentation found.

`HDF5.h5write` is a `Function`.

```
# 1 method for generic function "h5write":
[1] h5write(filename, name::String, data, pv...) in HDF5 at /Users/biona001/.julia/packages/HDF5/pAi1D/src/HDF5.jl:716
```


## Try JLSO

In [14]:
# Parse the simulated reference VCF and bundle the pieces returned by `convert_ht`
# into a RefHaplotypes object.
reffile = "haplo_ref.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
hapset = MendelImpute.RefHaplotypes(
    H, true, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt
);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:05[39m


In [38]:
# Time writing `hapset` to a JLSO archive with the given format and compression settings.
@time JLSO.save(
    "haplo_ref.jlso",
    :hapset => hapset;
    format=:bson,
    compression=:gzip_smallest,
)

 11.912740 seconds (2.45 M allocations: 458.037 MiB, 0.39% gc time)


In [20]:
;ls -al haplo_ref.vcf.gz # original size of ref file (bytes)

-rw-r--r--@ 1 biona001  staff  5449864 Apr  5 19:59 haplo_ref.vcf.gz


In [39]:
;ls -al haplo_ref.jlso # size of .jlso file (bytes)

-rw-r--r--  1 biona001  staff  5851597 May 28 15:35 haplo_ref.jlso


In [31]:
# filesize = 5505991, format=:julia_serialize, compression=:gzip_smallest
# NOTE(review): presumably the settings haplo_ref.jlso was re-saved with before this
# run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("haplo_ref.jlso");

  189.543 ms (502942 allocations: 35.94 MiB)


In [34]:
# filesize = 5758746, format=:julia_serialize, compression=:gzip_fastest
# NOTE(review): presumably the settings haplo_ref.jlso was re-saved with before this
# run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("haplo_ref.jlso");

  195.815 ms (502942 allocations: 36.18 MiB)


In [37]:
# filesize = 6133533, format=:bson, compression=:gzip_fastest
# NOTE(review): presumably the settings haplo_ref.jlso was re-saved with before this
# run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("haplo_ref.jlso");

  178.737 ms (1081423 allocations: 86.95 MiB)


In [40]:
# filesize = 5851597, format=:bson, compression=:gzip_smallest
# (same settings and byte count as the JLSO.save call and `ls` output recorded above)
@btime loaded = JLSO.load("haplo_ref.jlso");

  171.238 ms (1081423 allocations: 86.68 MiB)


In [42]:
;ls -al chr22.uniqueSNPs.vcf.gz # original size of ref file (bytes)

-rw-r--r--  1 biona001  staff  142889955 Apr 20 11:38 chr22.uniqueSNPs.vcf.gz


In [43]:
# Parse the chr22 reference VCF and bundle the pieces returned by `convert_ht`
# into a RefHaplotypes object.
reffile = "chr22.uniqueSNPs.vcf.gz"
H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(
    Bool,
    reffile;
    trans=true,
    save_snp_info=true,
    msg="Importing reference haplotype files...",
)
hapset = MendelImpute.RefHaplotypes(
    H, true, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt
);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:25[39m


In [64]:
# Time writing `hapset` to a JLSO archive with the given format and compression settings.
@time JLSO.save(
    "chr22.uniqueSNPs.jlso",
    :hapset => hapset;
    format=:bson,
    compression=:none,
)

 34.483278 seconds (44.19 M allocations: 16.693 GiB, 39.23% gc time)


In [65]:
;ls -al chr22.uniqueSNPs.jlso # size of .jlso file (bytes)

-rw-r--r--  1 biona001  staff  468472432 May 28 16:40 chr22.uniqueSNPs.jlso


In [47]:
# filesize = 156750465, format=:julia_serialize, compression=:gzip_smallest
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  6.701 s (9032548 allocations: 901.81 MiB)


In [51]:
# filesize = 163328088, format=:julia_serialize, compression=:gzip
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  6.717 s (9032546 allocations: 908.09 MiB)


In [54]:
# filesize = 179698900, format=:julia_serialize, compression=:gzip_fastest
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  7.173 s (9032547 allocations: 923.70 MiB)


In [57]:
# filesize = 163082396, format=:bson, compression=:gzip_smallest
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  7.189 s (19360229 allocations: 1.96 GiB)


In [60]:
# filesize = 169771730, format=:bson, compression=:gzip
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  7.469 s (19360228 allocations: 1.97 GiB)


In [63]:
# filesize = 186600622, format=:bson, compression=:gzip_fastest
# NOTE(review): presumably the settings chr22.uniqueSNPs.jlso was re-saved with before
# this run — confirm; that save call is not recorded in this notebook.
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  8.083 s (19360229 allocations: 1.99 GiB)


In [66]:
# filesize = 468472432, format=:bson, compression=:none
# (same settings and byte count as the JLSO.save call and `ls` output recorded above)
@btime loaded = JLSO.load("chr22.uniqueSNPs.jlso") seconds=30;

  5.195 s (18070248 allocations: 2.11 GiB)


In [67]:
# Load the archive and extract the object stored under :hapset
# (it is compared against the in-memory `hapset` in the next cell).
@time loaded = JLSO.load("chr22.uniqueSNPs.jlso")
hs = loaded[:hapset];

  6.562851 seconds (18.07 M allocations: 2.108 GiB, 51.79% gc time)


In [68]:
# Round-trip check: the H field loaded from the JLSO archive must equal the in-memory
# one. `==` compares the arrays directly, so no element-wise temporary is allocated,
# and a shape mismatch yields `false` rather than a DimensionMismatch error.
hs.H == hapset.H

true