# Systematic comparison of MendelImpute against Minimac4 and Beagle5

In [8]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using Suppressor

# Simulate data of various sizes 

## Step 0. Install `msprime`

[msprime download Link](https://msprime.readthedocs.io/en/stable/installation.html).

Some people might need to activate conda environment via `conda config --set auto_activate_base True`. You can turn it off once simulation is done by executing `conda config --set auto_activate_base False`.


## Step 1. Simulate data in terminal

```
python3 msprime_script.py 50000 10000 10000000 2e-8 2e-8 2020 > ./compare_sys/data1.vcf
python3 msprime_script.py 50000 10000 10000000 4e-8 4e-8 2020 > ./compare_sys/data2.vcf
python3 msprime_script.py 50000 10000 100000000 2e-8 2e-8 2020 > ./compare_sys/data3.vcf
python3 msprime_script.py 50000 10000 5000000 2e-8 2e-8 2020 > ./compare_sys/data4.vcf
```

###  Arguments: 
+ Number of haplotypes = 40000
+ Effective population size = 10000 ([source](https://www.the-scientist.com/the-nutshell/ancient-humans-more-diverse-43556))
+ Sequence length = 10 million (same as Beagle 5's choice)
+ Rrecombination rate = 2e-8 (default)
+ mutation rate = 2e-8 (default)
+ seed = 2019

In [36]:
@show nsamples("./compare_sys/data1.vcf.gz")
@show nrecords("./compare_sys/data1.vcf.gz")
@show nsamples("./compare_sys/data2.vcf.gz")
@show nrecords("./compare_sys/data2.vcf.gz")
@show nsamples("./compare_sys/data3.vcf.gz")
@show nrecords("./compare_sys/data3.vcf.gz")
@show nsamples("./compare_sys/data4.vcf.gz")
@show nrecords("./compare_sys/data4.vcf.gz")

nsamples("./compare_sys/data1.vcf.gz") = 25000
nrecords("./compare_sys/data1.vcf.gz") = 90127
nsamples("./compare_sys/data2.vcf.gz") = 25000
nrecords("./compare_sys/data2.vcf.gz") = 182414
nsamples("./compare_sys/data3.vcf.gz") = 25000
nrecords("./compare_sys/data3.vcf.gz") = 909708
nsamples("./compare_sys/data4.vcf") = 25000
nrecords("./compare_sys/data4.vcf") = 44492


44492

## Step 2: Compress files to .gz

In [34]:
# run these in terminal
run(`cat data1.vcf | gzip > data1.vcf.gz`)
run(`cat data2.vcf | gzip > data2.vcf.gz`)
run(`cat data3.vcf | gzip > data3.vcf.gz`)
run(`cat data4.vcf | gzip > data4.vcf.gz`)

ProcessFailedException: failed process: Process(`cat ./compare_sys/data3.vcf '|' gzip '>' ./compare_sys/data3.vcf.gz`, ProcessSignaled(2)) [0]


In [32]:
compress_vcf_to_gz("./compare_sys/data1.vcf"); rm("./compare_sys/data1.vcf", force=true)
compress_vcf_to_gz("./compare_sys/data2.vcf"); rm("./compare_sys/data2.vcf", force=true)
compress_vcf_to_gz("./compare_sys/data3.vcf"); rm("./compare_sys/data3.vcf", force=true)

[32mCreating ./compare_sys/data1.vcf.gz...100%|█████████████| Time: 0:14:14[39m
[32mCreating ./compare_sys/data2.vcf.gz...  0%|             |  ETA: 0:30:23[39m

InterruptException: InterruptException:

# Move over to hoffman now.

Put scripts below in under `/u/home/b/biona001/haplotype_comparisons/data`:

## `filter_and_mask.jl`
```Julia
using VCFTools
using MendelImpute
using Random

"""
    filter_and_mask(data::String, samples::Int)

Creates reference haplotypes and (unphased) target genotype files from `data`. 

# Inputs
`data`: The full (phased) data simulated by msprime.
`samples`: Number of samples (genotypes) desired in target file. Remaining haplotypes will become the reference panel
"""
function filter_and_mask(data::String, samples::Int)
    missingprop = 0.1
    n = nsamples(data)
    p = nrecords(data)
    samples > n && error("requested samples exceed total number of genotypes in $data.")

    # output filenames (tgt_data1.vcf.gz, ref_data1.vcf.gz, and tgt_masked_data1.vcf.gz)
    tgt = "./tgt_" * data
    ref = "./ref_" * data
    tgt_mask = "./tgt_masked_" * data
    tgt_mask_unphase = "./tgt_masked_unphased_" * data

    # compute target and reference index
    tgt_index = falses(n)
    tgt_index[1:samples] .= true
    ref_index = .!tgt_index
    record_index = 1:p # save all records (SNPs) 

    # generate masking matrix with `missingprop`% of trues (true = convert to missing)
    Random.seed!(2020)
    masks = falses(p, samples)
    for j in 1:samples, i in 1:p
        rand() < missingprop && (masks[i, j] = true)
    end

    # create outputs 
    VCFTools.filter(data, record_index, tgt_index, des = tgt)
    VCFTools.filter(data, record_index, ref_index, des = ref)
    mask_gt(tgt, masks, des=tgt_mask)

    # finally, unphase the target data
    unphase(tgt_mask, outfile=tgt_mask_unphase)
end

data = ARGS[1]
samples = parse(Int, ARGS[2])
filter_and_mask(data, samples)
```

## `mendel_fast.jl`
```Julia
using VCFTools
using MendelImpute
using GeneticVariation

function run(data::String, width::Int)
    tgtfile = "./tgt_masked_unphased_" * data
    reffile = "./ref_" * data
    outfile = "./mendel_imputed_" * data
    phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=true)
end

data  = ARGS[1]
width = parse(Int, ARGS[2])
run(data, width)
```

## `mendel_dp.jl`
```Julia
using VCFTools
using MendelImpute
using GeneticVariation

function run(data::String, width::Int)
    tgtfile = "./tgt_masked_unphased_" * data
    reffile = "./ref_" * data
    outfile = "./mendel_imputed_" * data
    phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=false)
end

data  = ARGS[1]
width = parse(Int, ARGS[2])
run(data, width)
```

## Step 1: filter files

+ `ref_data1.vcf.gz`: haplotype reference files
+ `tgt_data1.vcf.gz`: complete genotype information
+ `tgt_masked_data1.vcf.gz`: the same as `tgt_data1.vcf.gz` except some entries are masked
+ `tgt_masked_unphased_data1.vcf.gz`: the same as `tgt_data1.vcf.gz` except some entries are masked and heterzygotes are unphased. 

In [39]:
# specify simulation parameters
target_data = ["data1.vcf.gz", "data4.vcf.gz"]
memory = 47 
missingprop = 0.1
samples = 1000

1000

In [None]:
cd("/u/home/b/biona001/haplotype_comparisons/data")
for data in target_data
    # if unphased target genotype file exist already, move to next step
    if isfile(data * "./tgt_masked_unphased_" * data)
        continue 
    end

    open("filter.sh", "w") do io
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l arch=intel-X5650,exclusive,h_rt=24:00:00,h_data=$(memory)G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load R/3.5.1")
        println(io, "module load java/1.8.0_111")
        println(io)
        println(io, "# filter/mask data")
        println(io, "julia ./filter_and_mask.jl $data $samples")
        println(io)
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \JOB_ID ended on:   \" `date `")
        println(io)
    end

    # submit job
    run(`qsub filter.sh`)
    rm("filter.sh", force=true)
    sleep(2)
end

## Step 2: prephasing using beagle 4.1

In [None]:
for data in target_data
    open("prephase.sh", "w") do io
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l arch=intel-X5650,exclusive,h_rt=24:00:00,h_data=$(memory)G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load R/3.5.1")
        println(io, "module load java/1.8.0_111")
        println(io)
        println(io, "# run prephasing using beagle 4.1")
        println(io, "java -Xss5m -Xmx$(memory)g -jar beagle4.1.jar gt=./tgt_masked_unphased_$(data) ref=./ref_$(data) niterations=0 out=./tgt_masked_phased_$(data)")
        println(io)
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub prephase.sh`)
    rm("prephase.sh", force=true)
    sleep(2)
end

## Step 3: run MendelImpute 

In [None]:
widths = [400; 800; 1600]
for data in target_data, width in widths
    # fast version
    open("mendel$width.sh", "w") do io
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l arch=intel-X5650,exclusive,h_rt=24:00:00,h_data=$(memory)G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load R/3.5.1")
        println(io, "module load java/1.8.0_111")
        println(io)
        println(io, "# run MendelImpute (fast)")
        println(io, "julia mendel_fast.jl $data $width")
        println(io)
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub mendel$width.sh`)
    rm("mendel$width.sh", force=true)
    sleep(2)

    # dynamic programming version
    open("dp_mendel$width.sh", "w") do io
        println(io, "#!/bin/bash")
        println(io, "#\$ -cwd")
        println(io, "# error = Merged with joblog")
        println(io, "#\$ -o joblog.\$JOB_ID")
        println(io, "#\$ -j y")
        println(io, "#\$ -l arch=intel-X5650,exclusive,h_rt=24:00:00,h_data=$(memory)G")
        println(io, "# Email address to notify")
        println(io, "#\$ -M \$USER@mail")
        println(io, "# Notify when")
        println(io, "#\$ -m a")
        println(io)
        println(io, "echo \"Job \$JOB_ID started on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID started on:   \" `date `")
        println(io)
        println(io, "# load the job environment:")
        println(io, ". /u/local/Modules/default/init/modules.sh")
        println(io, "module load julia/1.2.0")
        println(io, "module load R/3.5.1")
        println(io, "module load java/1.8.0_111")
        println(io)
        println(io, "# run MendelImpute (dynamic programming)")
        println(io, "julia mendel_dp.jl $data $width")
        println(io)
        println(io, "echo \"Job \$JOB_ID ended on:   \" `hostname -s`")
        println(io, "echo \"Job \$JOB_ID ended on:   \" `date `")
        println(io)
    end
    # submit job
    run(`qsub dp_mendel$width.sh`)
    rm("dp_mendel$width.sh", force=true)
    sleep(2)
end