# Test optimal window width for MendelImpute on simulated data

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using SparseArrays

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108
┌ Info: Precompiling VCFTools [a620830f-fdd7-5ebc-8d26-3621ab35fbfe]
└ @ Base loading.jl:1273
┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1273


### Step 0. Install `msprime`

[msprime download Link](https://msprime.readthedocs.io/en/stable/installation.html).

Some people might need to activate the conda environment via `conda config --set auto_activate_base True`. You can turn it off once the simulation is done by executing `conda config --set auto_activate_base False`.


### Step 1. Simulate data in terminal

```
python3 msprime_script.py 4000 10000 5000000 2e-8 2e-8 2019 > full.vcf
```

Arguments: 
+ Number of haplotypes = 4000
+ Effective population size = 10000 ([source](https://www.the-scientist.com/the-nutshell/ancient-humans-more-diverse-43556))
+ Sequence length = 5 million (as passed in the command above; Beagle 5's choice was 10 million)
+ Recombination rate = 2e-8 (default)
+ mutation rate = 2e-8 (default)
+ seed = 2019

### Step 2: Convert simulated haplotypes to reference haplotypes and target genotype files (in .vcf format)

The last 1000 people (2000 haplotypes) are treated as imputation targets. The remaining haplotypes form the reference panel. 

In [16]:
# count the markers (records) and the individuals (samples) in the simulated VCF
records = nrecords("full.vcf")
samples = nsamples("full.vcf")
@show records samples;

records = 35897
samples = 2000


In [17]:
# Split the samples into imputation targets and reference panel.
# `n_target` is the number of trailing samples held out as targets; it was
# previously hard-coded as the offset `999` in the index expression.
n_target = 1000
n_target <= samples ||
    throw(ArgumentError("need at least $n_target samples, but full.vcf has only $samples"))

# compute target and reference index: the last `n_target` samples are targets,
# every other sample contributes its haplotypes to the reference panel
tgt_index = falses(samples)
tgt_index[(end - n_target + 1):end] .= true
ref_index = .!tgt_index

# save all records (no marker is filtered out)
record_index = trues(records);

In [4]:
# write the target and reference subsets of full.vcf as uncompressed VCF files,
# timing each pass separately
for (sample_mask, dest) in ((tgt_index, "target.vcf"), (ref_index, "haplo_ref.vcf"))
    @time VCFTools.filter("full.vcf", record_index, sample_mask, des = dest)
end

 24.835489 seconds (399.92 M allocations: 33.439 GiB, 13.62% gc time)
 23.296331 seconds (395.78 M allocations: 33.237 GiB, 14.09% gc time)


In [8]:
# same split as above, but written to gzip-compressed VCF files
# (these .vcf.gz files are the ones read by the following steps)
for (sample_mask, dest) in zip((tgt_index, ref_index), ("target.vcf.gz", "haplo_ref.vcf.gz"))
    @time VCFTools.filter("full.vcf", record_index, sample_mask, des = dest)
end

 29.828560 seconds (396.10 M allocations: 33.252 GiB, 12.33% gc time)
 30.136284 seconds (395.78 M allocations: 33.237 GiB, 12.28% gc time)


### Step 3: Randomly mask entries to create target file with missing entries

+ `X` is the genotype matrix (without missing entries). 
+ `Xm` is the masked genotype matrix (imputation target)

We need to transpose this matrix later because `MendelImpute` currently (11/13/2019) requires the transposed version.

In [9]:
# import full genotype matrix
@time X = convert_gt(Float32, "target.vcf.gz"; as_minorallele = false)

# each row is a sample. Transpose so that columns are samples. 
X = copy(X')

# mask 10% entries (RNG is seeded so the same entries are masked on every run)
p, n = size(X)
Random.seed!(123)
missingprop = 0.1
# the broadcast below allocates a brand-new array, so X itself is left untouched
# and no preliminary `copy(X)` is needed
Xm = ifelse.(rand(Float32, p, n) .< missingprop, missing, X)
Xm_original = copy(Xm)     # pristine masked copy, used to restore Xm after imputation
masks = ismissing.(Xm)     # true wherever an entry was masked

# save Xm to new VCF file
mask_gt("target.vcf.gz", masks, des="target_masked.vcf.gz")

  5.138160 seconds (72.66 M allocations: 6.250 GiB, 18.38% gc time)


### Step 4: Import haplotype reference panels, then transpose it

In `haplo_ref.vcf`:
+ Each column is a haplotype

We need to transpose this matrix because `MendelImpute` currently (11/13/2019) requires the transposed version.

In [10]:
# read the reference haplotypes, then materialize the transpose so that the
# first dimension indexes markers (see the printed size below)
@time H = convert_ht(Float32, "haplo_ref.vcf.gz"; as_minorallele = false)
@time H = copy(adjoint(H))
size(H)

  5.307197 seconds (72.59 M allocations: 6.348 GiB, 19.69% gc time)
  0.179460 seconds (7 allocations: 273.873 MiB)


(35897, 2000)

### Step 5: Test MendelImpute by calculating error

In [23]:
# warmup (as_minorallele = false)
# One phase + impute pass at a single window width; reports the error rate on
# the masked entries together with the time and allocation of `phase`.
width = 1200
ph = @timed phase(Xm, H, width=width)   # ph[1] = result, ph[2] = seconds, ph[3] = bytes allocated
impute2!(Xm, H, ph[1])

# compare imputed values against the truth at the masked positions only
missing_idx   = ismissing.(Xm_original)
total_missing = count(missing_idx)
actual_missing_values  = Vector{Int64}(X[missing_idx])   # true values of missing entries
imputed_missing_values = Vector{Int64}(Xm[missing_idx])  # imputed values of missing entries
error_rate = count(actual_missing_values .!= imputed_missing_values) / total_missing
println("width $width: error = $error_rate, time = $(ph[2]) sec, memory = $(ph[3]/1e6) MB")

# undo the in-place imputation so that Xm is masked again for the next run
copyto!(Xm, Xm_original);

[204, 357, 493, 549, 1118, 1205, 1278, 1288]
[381, 461, 882, 1384, 1492, 1886]
width 1200: error = 0.0037959967059601743, time = 5.792771109 sec, memory = 133.468146 MB


In [19]:
# display the most recent `@timed phase(...)` result (phasing output followed by timing/allocation figures)
ph

(HaplotypeMosaicPair[HaplotypeMosaicPair(HaplotypeMosaic(35897, [1, 6001, 8549, 22801, 26239, 27752, 29244], [549, 345, 1369, 1236, 137, 354, 986]), HaplotypeMosaic(35897, [1, 414, 6001, 8401, 19078, 19201, 21622, 25216, 26137, 27584, 27779], [381, 345, 26, 549, 1746, 549, 1488, 217, 157, 186, 1236])), HaplotypeMosaicPair(HaplotypeMosaic(35897, [1, 3085, 24302, 32107], [97, 632, 676, 350]), HaplotypeMosaic(35897, [1, 1, 3155, 4181, 15522, 16251, 30506, 32560, 33909], [381, 632, 1673, 1074, 39, 301, 92, 1269, 79])), HaplotypeMosaicPair(HaplotypeMosaic(35897, [1, 1222, 5990, 6001, 13201, 14890, 23436, 24366, 32401], [263, 684, 133, 122, 436, 1860, 69, 114, 778]), HaplotypeMosaic(35897, [1, 1, 1223, 2401, 10801, 14870], [381, 391, 902, 1189, 1860, 1717])), HaplotypeMosaicPair(HaplotypeMosaic(35897, [1, 27928], [1670, 1025]), HaplotypeMosaic(35897, [1, 1, 10801, 14401, 18001, 27844, 28801], [381, 343, 1025, 900, 1025, 175, 1670])), HaplotypeMosaicPair(HaplotypeMosaic(35897, [1], [273]), Ha

In [32]:
# warmup (as_is = false)
# Single run at one window width: phase, impute in place, then score the
# imputed values against the unmasked truth.
width = 1200
ph = @timed phase(Xm, H, width=width)
impute2!(Xm, H, ph[1])

missing_idx = ismissing.(Xm_original)      # entries that were masked
total_missing = count(missing_idx)
actual_missing_values = Vector{Int64}(X[missing_idx])     #true values of missing entries
imputed_missing_values = Vector{Int64}(Xm[missing_idx])   #imputed values of missing entries
n_wrong = count(actual_missing_values .!= imputed_missing_values)
error_rate = n_wrong / total_missing
println("width $width: error = $error_rate, time = $(ph[2]) sec, memory = $(ph[3]/1e6) MB")

# restore the masked matrix, since impute2! overwrote Xm
copyto!(Xm, Xm_original);

width 1200: error = 0.005161274875819265, time = 5.520036333 sec, memory = 110.177456 MB


### 1000 samples, 2000 haplotypes, 35897 markers

In [15]:
# as_is = true
# Sweep over window widths and report, for each one, the imputation error on
# the masked entries plus the time and allocation of `phase`.
width = [50; 200; 400; 800; 1200; 2000; 3000; 4000; 5000]

# The masked positions and their true genotypes do not depend on the window
# width, so they are computed once here rather than on every iteration.
missing_idx    = ismissing.(Xm_original)
total_missing  = sum(missing_idx)
actual_missing_values  = convert(Vector{Int64}, X[missing_idx])  #true values of missing entries

for w in width
    ph = @timed phase(Xm, H, width=w)   # ph[1] = result, ph[2] = seconds, ph[3] = bytes allocated
    impute2!(Xm, H, ph[1])
    imputed_missing_values = convert(Vector{Int64}, Xm[missing_idx]) #imputed values of missing entries
    error_rate = sum(actual_missing_values .!= imputed_missing_values) / total_missing
    println("width $w: error = $error_rate, time = $(ph[2]) sec, memory = $(ph[3]/1e6) MB")
    copyto!(Xm, Xm_original)   # re-mask Xm before trying the next width
end

width 50: error = 0.016241353563058645, time = 7.736097151 sec, memory = 1091.642784 MB
width 200: error = 0.003792655894777811, time = 4.182049198 sec, memory = 273.796128 MB
width 400: error = 0.002375038349728364, time = 4.575971305 sec, memory = 154.566112 MB
width 800: error = 0.002746703593766269, time = 5.611958913 sec, memory = 107.732064 MB
width 1200: error = 0.0037959967059601743, time = 6.009479787 sec, memory = 110.055488 MB
width 2000: error = 0.006347819647421924, time = 6.254486756 sec, memory = 142.620768 MB
width 3000: error = 0.009728442163041608, time = 6.213389195 sec, memory = 206.076176 MB
width 4000: error = 0.013991595632668902, time = 5.993645679 sec, memory = 266.464544 MB
width 5000: error = 0.015579037746155143, time = 5.994658345 sec, memory = 285.508784 MB


In [11]:
# as_is = false
# Sweep over window widths and report, for each one, the imputation error on
# the masked entries plus the time and allocation of `phase`.
width = [50; 200; 400; 800; 1200; 2000; 3000; 4000; 5000]

# The masked positions and their true genotypes do not depend on the window
# width, so they are computed once here rather than on every iteration.
missing_idx    = ismissing.(Xm_original)
total_missing  = sum(missing_idx)
actual_missing_values  = convert(Vector{Int64}, X[missing_idx])  #true values of missing entries

for w in width
    ph = @timed phase(Xm, H, width=w)   # ph[1] = result, ph[2] = seconds, ph[3] = bytes allocated
    impute2!(Xm, H, ph[1])
    imputed_missing_values = convert(Vector{Int64}, Xm[missing_idx]) #imputed values of missing entries
    error_rate = sum(actual_missing_values .!= imputed_missing_values) / total_missing
    println("width $w: error = $error_rate, time = $(ph[2]) sec, memory = $(ph[3]/1e6) MB")
    copyto!(Xm, Xm_original)   # re-mask Xm before trying the next width
end

width 50: error = 0.01892263293783697, time = 7.560888095 sec, memory = 1100.014944 MB
width 200: error = 0.005909616580668619, time = 4.511297279 sec, memory = 276.790352 MB
width 400: error = 0.004010087022563282, time = 4.535729067 sec, memory = 155.301872 MB
width 800: error = 0.0041389866540161285, time = 5.669351368 sec, memory = 107.906128 MB
width 1200: error = 0.005161274875819265, time = 6.076758724 sec, memory = 110.177104 MB
width 2000: error = 0.0076482304001567955, time = 6.310865611 sec, memory = 142.657136 MB
width 3000: error = 0.011014654468251436, time = 6.401430612 sec, memory = 207.707328 MB
width 4000: error = 0.015300080012427818, time = 6.243830048 sec, memory = 269.739312 MB
width 5000: error = 0.016841585972156566, time = 6.083279975 sec, memory = 289.624608 MB


### 200 samples, 39600 haplotypes, 45086 markers

In [11]:
# Sweep over window widths and report, for each one, the imputation error on
# the masked entries plus the time and allocation of `phase`.
width = [50; 200; 400; 800; 1200; 2000; 3000; 4000; 5000]

# The masked positions and their true genotypes do not depend on the window
# width, so they are computed once here rather than on every iteration.
missing_idx    = ismissing.(Xm_original)
total_missing  = sum(missing_idx)
actual_missing_values  = convert(Vector{Int64}, X[missing_idx])  #true values of missing entries

for w in width
    ph = @timed phase(Xm, H, width=w)   # ph[1] = result, ph[2] = seconds, ph[3] = bytes allocated
    impute2!(Xm, H, ph[1])
    imputed_missing_values = convert(Vector{Int64}, Xm[missing_idx]) #imputed values of missing entries
    error_rate = sum(actual_missing_values .!= imputed_missing_values) / total_missing
    println("width $w: error = $error_rate, time = $(ph[2]) sec, memory = $(ph[3]/1e6) MB")
    copyto!(Xm, Xm_original)   # re-mask Xm before trying the next width
end

width 50: error = 0.02108025201370305, time = 53.186390706 sec, memory = 2850.043568 MB
width 200: error = 0.007351693528979247, time = 24.983143501 sec, memory = 727.184896 MB
width 400: error = 0.004604651008043722, time = 21.781636026 sec, memory = 373.909136 MB
width 800: error = 0.003454597276025095, time = 21.660624886 sec, memory = 222.238256 MB
width 1200: error = 0.002700463681258782, time = 22.564362254 sec, memory = 190.133616 MB
width 2000: error = 0.003009880259111431, time = 23.189537012 sec, memory = 213.424624 MB
width 3000: error = 0.0026017609019437795, time = 24.424956618 sec, memory = 263.062832 MB
width 4000: error = 0.002606196981912993, time = 26.004524506 sec, memory = 422.452464 MB
width 5000: error = 0.003191759537849189, time = 27.569003756 sec, memory = 529.3919 MB
