# Simulate genotype data from 1000 Genomes data

Notes:
+ Using the imputation server requires the target `vcf` file to be `bgzip`-compressed, but the software needed to do so cannot currently be installed locally. Wait for [this issue](https://github.com/samtools/bcftools/issues/1204) to be resolved.
+ Use Minimac3 if the reference panel is small (e.g. smaller than HRC), because Minimac4 uses approximations that may not give good accuracy for smaller panels.

The purpose of this document is to check whether Minimac4 on the Michigan imputation server produces roughly the same error rate as Minimac4 run locally.

In [3]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase

└ @ Revise /Users/biona001/.julia/packages/Revise/439di/src/Revise.jl:1108


## Simulate genotype data

In [10]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask(; chromosomes = [22], samples = 500, missingprop = 0.01, seed = 2020)

Build the simulated target data sets for each chromosome in `chromosomes`.

For every chromosome `chr` this
1. writes `chr<chr>.uniqueSNPs.vcf.gz`, keeping only the records of the raw VCF that
   `find_duplicate_marker` does not flag,
2. simulates `samples` genotypes from the haplotypes in that file and writes them to
   `chr<chr>_simulated.vcf.gz`,
3. writes `chr<chr>_simulated_masked.vcf.gz`, in which every entry is marked as missing
   independently with probability `missingprop`.

All paths are relative to the current working directory. The global RNG is re-seeded
with `seed` for every chromosome. The keyword defaults reproduce the settings that were
previously hard-coded.
"""
function filter_and_mask(;
    chromosomes = [22],
    samples = 500,
    missingprop = 0.01,
    seed = 2020,
)
    for chr in chromosomes
        # filter chromosome data for unique snps
        data = "../raw/ALL.chr$chr.phase3_v5.shapeit2_mvncall_integrated.noSingleton.genotypes.vcf.gz"
        uniquefile = "chr$chr.uniqueSNPs.vcf.gz"
        println("generating unique snps for chromosome $chr")
        record_index = .!find_duplicate_marker(data)
        VCFTools.filter(data, record_index, 1:nsamples(data), des = uniquefile)

        # import haplotype data
        H = convert_ht(Bool, uniquefile, trans=true)

        # simulate samples: each sample's 2 haplotypes are formed from 1~6 haplotypes from H
        Random.seed!(seed)
        println("simulating genotypes for chr $chr")
        X = simulate_genotypes(H, samples)
        p = size(X, 1) # number of markers (rows of X)

        # extract each marker's info
        println("extracting marker infos for chr $chr")
        marker_chrom = fill("$chr", p)
        marker_pos = zeros(Int, p)
        marker_ID = Vector{String}(undef, p)
        marker_REF = Vector{String}(undef, p)
        marker_ALT = Vector{String}(undef, p)
        reader = VCF.Reader(openvcf(uniquefile))
        try
            for (i, record) in enumerate(reader)
                marker_pos[i] = VCF.pos(record)
                marker_ID[i]  = VCF.id(record)[1]  # only the first ID is kept
                marker_REF[i] = VCF.ref(record)
                marker_ALT[i] = VCF.alt(record)[1] # only the first ALT allele is kept
            end
        finally
            # release the file handle even if a record fails to parse
            close(reader)
        end

        # save complete genotype file to disk
        println("saving complete genotype for chr $chr")
        make_tgtvcf_file(X, vcffilename = "chr$(chr)_simulated.vcf.gz", marker_chrom=marker_chrom,
            marker_pos=marker_pos, marker_ID=marker_ID, marker_REF=marker_REF, marker_ALT=marker_ALT)

        # generate masking matrix: each entry is true (= convert to missing) with
        # probability `missingprop`. The scalar `rand()` loop and its column-major order
        # are kept so the same seed yields the same mask as before.
        masks = falses(p, samples)
        for j in 1:samples, i in 1:p
            masks[i, j] = rand() < missingprop
        end

        # save genotype data with masked (missing) entries to disk
        println("masking entries for chr $chr")
        mask_gt("chr$(chr)_simulated.vcf.gz", masks, des="chr$(chr)_simulated_masked.vcf.gz")
        println("")
    end
end
@time filter_and_mask()

generating unique snps for chromosome 22
simulating genotypes for chr 22
extracting marker infos for chr 22
saving complete genotype for chr 22
masking entries for chr 22

1700.722180 seconds (18.92 G allocations: 1.736 TiB, 12.49% gc time)


## Run MendelImpute

In [5]:
# Show how many threads this Julia session was started with; the MendelImpute runs
# below are labelled as using 8 threads.
Threads.nthreads()

8

In [3]:
# unique happairs only method (8 threads, no bkpt search, 10% overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

"""
    run_unique_only()

Impute `chr22_simulated_masked.vcf.gz` against `chr22.uniqueSNPs.vcf.gz` by calling
`phase` with `unique_only=true`, once for each window width in `[500, 1000]`. Each run
is timed, written to `mendel_imputed_uniqonly_<width>.vcf.gz`, and its error rate (the
fraction of genotype entries that differ from `chr22_simulated.vcf.gz`) is printed.

This function was previously called `run`. A function of that name defined in `Main`
collides with `Base.run`, which the Beagle cells in this notebook use to launch external
commands, so it has been given a distinct name.
"""
function run_unique_only()
    X_complete = convert_gt(Float32, "chr22_simulated.vcf.gz")
    n, p = size(X_complete)
    for width in [500, 1000]
        println("running unique happair only, width = $width")
        tgtfile = "./chr22_simulated_masked.vcf.gz"
        reffile = "./chr22.uniqueSNPs.vcf.gz"
        outfile = "./mendel_imputed_uniqonly_$(width).vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, unique_only=true)
        # re-read the imputed file and compare it entry-wise with the complete data
        X_mendel = convert_gt(Float32, outfile)
        println("error = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
    return nothing
end
run_unique_only()

running unique happair only, width = 500
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:15[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:10:45[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:02:00[39m


1095.460113 seconds (5.57 G allocations: 522.031 GiB, 3.88% gc time)
error = 3.353805553703529e-5 

running unique happair only, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:05[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:19[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:14:32[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:57[39m


1323.292636 seconds (5.55 G allocations: 528.525 GiB, 3.27% gc time)
error = 1.3365605119243835e-5 



In [7]:
# dynamic programming method (8 threads, 10% overlapping window)
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)

"""
    test()

Impute `chr22_simulated_masked.vcf.gz` against `chr22.uniqueSNPs.vcf.gz` by calling
`phase` with `fast_method=false` (the dynamic programming method) at window width 1000.
The run is timed, written to `mendel_imputed_dp_<width>.vcf.gz`, and its error rate
(the fraction of genotype entries that differ from `chr22_simulated.vcf.gz`) is printed.
"""
function test()
    X_complete = convert_gt(Float32, "chr22_simulated.vcf.gz")
    n, p = size(X_complete)
    for width in [1000]
        # The message and output name used to say "unique happair only"/"uniqonly",
        # copied from the previous cell; that mislabelled this run and overwrote the
        # unique-happairs result for the same width.
        println("running dynamic programming method, width = $width")
        tgtfile = "./chr22_simulated_masked.vcf.gz"
        reffile = "./chr22.uniqueSNPs.vcf.gz"
        outfile = "./mendel_imputed_dp_$(width).vcf.gz"
        @time phase(tgtfile, reffile, outfile = outfile, width = width, fast_method=false)
        # re-read the imputed file and compare it entry-wise with the complete data
        X_mendel = convert_gt(Float32, outfile)
        println("error = $(sum(X_mendel .!= X_complete) / n / p) \n")
    end
    return nothing
end
test()

running unique happair only, width = 1000
Running chunk 1 / 1


[32mImporting genotype file...100%|█████████████████████████| Time: 0:01:06[39m
[32mImporting reference haplotype files...100%|█████████████| Time: 0:03:08[39m
[32mComputing optimal haplotype pairs...100%|███████████████| Time: 0:14:50[39m
[32mImputing samples...100%|████████████████████████████████| Time: 0:01:18[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:01:48[39m


1399.472408 seconds (5.56 G allocations: 530.606 GiB, 3.33% gc time)
error = 3.070057788410997e-6 



# Run Beagle5 locally

In [2]:
# Beagle 5: impute the masked chr22 genotypes against the unique-SNP reference panel
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    beagle()

Launch `beagle.28Sep18.793.jar` as an external `java` process on
`chr22_simulated_masked.vcf.gz` with reference `chr22.uniqueSNPs.vcf.gz`, using output
prefix `beagle_imputed` and 4 threads, and return the result of `run`.
"""
function beagle()
    beagle_cmd = `java -Xmx15g -jar beagle.28Sep18.793.jar gt=chr22_simulated_masked.vcf.gz ref=chr22.uniqueSNPs.vcf.gz out=beagle_imputed nthreads=4`
    return run(beagle_cmd)
end
beagle()

beagle.28Sep18.793.jar (version 5.0)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.28Sep18.793.jar" to list command line argument
Start time: 05:23 PM PDT on 16 Apr 2020

Command line: java -Xmx13653m -jar beagle.28Sep18.793.jar
  gt=chr22_simulated_masked.vcf.gz
  ref=chr22.uniqueSNPs.vcf.gz
  out=beagle_imputed
  nthreads=4

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,504
Study samples:             500

Window 1 (22:16050115-51244237)
Reference markers:     644,939
Study markers:         644,939

Burnin  iteration 1:           3 minutes 53 seconds
Burnin  iteration 2:           4 minutes 1 second
Burnin  iteration 3:           7 minutes 18 seconds
Burnin  iteration 4:           8 minutes 55 seconds
Burnin  iteration 5:           10 minutes 25 seconds
Burnin  iteration 6:           10 minutes 43 seconds

Phasing iteration 1:           10 minutes 45 seconds
Phasing iteration 2:           10 minutes 7 seconds
Phasing iteration 3:       

UndefVarError: UndefVarError: convert_gt not defined

In [4]:
# Beagle error rate: fraction of genotype entries in the imputed file that differ from
# the complete (unmasked) simulated data
X_beagle = convert_gt(Float32, "beagle_imputed.vcf.gz")
X_complete = convert_gt(Float32, "chr22_simulated.vcf.gz")
n, p = size(X_complete)
println("error = ", sum(X_complete .!= X_beagle) / n / p)

error = 4.7446347639079044e-7


# Prephase using Beagle 4.1

In [None]:
# Beagle 4.1, used only for prephasing
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    beagle()

Launch `beagle.27Jan18.7e1.jar` as an external `java` process on
`chr22_simulated_masked.vcf.gz` with reference `chr22.uniqueSNPs.vcf.gz`, passing
`niterations=0`, output prefix `beagle_prephased` and 4 threads, and return the result
of `run`.
"""
function beagle()
    prephase_cmd = `java -Xmx15g -jar beagle.27Jan18.7e1.jar gt=chr22_simulated_masked.vcf.gz ref=chr22.uniqueSNPs.vcf.gz out=beagle_prephased niterations=0 nthreads=4`
    return run(prephase_cmd)
end
beagle()

# Run Minimac4 locally

Run in a terminal:

```
minimac4 --refHaps chr22.uniqueSNPs.vcf.gz --haps chr22_simulated_masked.vcf.gz --prefix minimac_imputed_chr22 --format GT --cpus 8
```

# Submit on Michigan server (Minimac4 v1.2.4)
+ Reference panel = 1000 genome phase3 v5
+ Input files = `chr22_simulated_masked.vcf` (needed VCF v4.2 instead of 4.3)
+ Array build = GRCh37/hg19 
+ rsq filter = off
+ Phasing = Eagle 2.4
+ Population = EUR
+ Mode = QC + imputation
