# Test JLD2 compression

In [4]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays
using JLD2, FileIO
# using Plots
# using ProfileView

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


## Check reading speed and file size for simulated data

In [7]:
reffile = "haplo_ref.vcf.gz"
outfile = "haplo_ref.jld2"
@time hapset = save_jld2(reffile, outfile, column_major=true);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:07[39m


 10.659887 seconds (78.87 M allocations: 5.672 GiB, 8.19% gc time)


In [37]:
;gzip haplo_ref.jld2 haplo_ref.jld2.gz

gzip: haplo_ref.jld2.gz already exists -- skipping
gzip: haplo_ref.jld2.gz already has .gz suffix -- unchanged


In [10]:
;ls -al haplo_ref.vcf.gz # original size of ref file (bytes)

-rw-r--r--@ 1 biona001  staff  5449864 Apr  5 19:59 haplo_ref.vcf.gz


In [11]:
;ls -al haplo_ref.jld2 # size of .jld2 file (bytes)

-rw-r--r--  1 biona001  staff  20318574 May 27 22:50 haplo_ref.jld2


In [35]:
;ls -al haplo_ref.jld2.gz # size of .jld2.gz file (bytes)

-rw-r--r--@ 1 biona001  staff  6754174 May 27 22:50 haplo_ref.jld2.gz


In [33]:
20318574 / 5449864 # .jld2 is larger file size

3.7282717513684744

In [36]:
6754174 / 5449864 # .jld2.gz is larger file size

1.2393289080241268

In [19]:
#difference in reading speed
@time H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile, trans=true, save_snp_info=true, msg = "Importing reference haplotype files...")
@time @load "haplo_ref.jld2" hapset;

[32mImporting reference haplotype files...100%|█████████████| Time: 0:00:06[39m


  7.085228 seconds (72.34 M allocations: 5.389 GiB, 9.18% gc time)
  0.065357 seconds (585.73 k allocations: 49.473 MiB, 16.52% gc time)


In [32]:
7.085228 / 0.065357

108.40809706687884

## Check reading speed and file size for chr22 data in 1000 genomes

In [20]:
reffile = "chr22.uniqueSNPs.vcf.gz"
outfile = "chr22.uniqueSNPs.jld2"
@time hapset = save_jld2(reffile, outfile, column_major=true);

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:27[39m


233.106226 seconds (3.27 G allocations: 242.340 GiB, 18.42% gc time)


In [38]:
;gzip chr22.uniqueSNPs.jld2 chr22.uniqueSNPs.jld2.gz

gzip: chr22.uniqueSNPs.jld2.gz already has .gz suffix -- unchanged


In [27]:
;ls -al chr22.uniqueSNPs.vcf.gz # original size of ref file (bytes)

-rw-r--r--  1 biona001  staff  142889955 Apr 20 11:38 chr22.uniqueSNPs.vcf.gz


In [28]:
;ls -al chr22.uniqueSNPs.jld2 # size of .jld2 file (bytes)

-rw-r--r--  1 biona001  staff  615840218 May 27 23:06 chr22.uniqueSNPs.jld2


In [39]:
;ls -al chr22.uniqueSNPs.jld2.gz # size of .jld2.gz file (bytes)

-rw-r--r--  1 biona001  staff  187644621 May 27 23:06 chr22.uniqueSNPs.jld2.gz


In [30]:
615840218 / 142889955 # .jld2 is larger file size

4.309891608545891

In [40]:
187644621 / 142889955 # .jld2.gz is larger file size

1.3132107221952727

In [26]:
#difference in reading speed
@time H, H_sampleID, H_chr, H_pos, H_ids, H_ref, H_alt = convert_ht(Bool, reffile, trans=true, save_snp_info=true, msg = "Importing reference haplotype files...")
@time @load "chr22.uniqueSNPs.jld2" hapset;

[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:42[39m


242.880364 seconds (3.24 G allocations: 241.675 GiB, 17.22% gc time)
  1.496514 seconds (10.41 M allocations: 1.045 GiB, 23.09% gc time)


In [31]:
242.880364 / 1.496514

162.29742187510442