# Find which SNPs in reference panels are on GWAS chip

One of the reviewers' main comments is that we should use real SNPs in our simulations. Thus, let's check whether the SNPs on the UK Biobank chip are also present in the HRC/1000 Genomes reference panels. 

In [1]:
using Revise
using VCFTools
using SnpArrays
using MendelImpute
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using Plots
# using ProfileView

# Restrict BLAS to a single thread
# (presumably to avoid oversubscribing cores alongside Julia's own threads — TODO confirm).
BLAS.set_num_threads(1)
# Display how many Julia threads this session was started with.
Threads.nthreads()

8

# 1000 genomes

In [7]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Return `true` when a VCF record's ID(s) include at least one ID in `typed_ids`.
# A record's ID field may be a single string or a collection of strings.
_matches_typed(id::AbstractString, typed_ids) = id in typed_ids
_matches_typed(ids, typed_ids) = any(in(typed_ids), ids)

"""
    filter_and_mask(; chrs=[10], n=100, missingprop=0.001, ukbb_path=...)

Build the simulation files for each chromosome in `chrs`, reading and writing relative
to the current working directory. For chromosome `N` the steps are:

1. Drop duplicate markers from `../beagle_raw/chrN.1kg.phase3.v5a.vcf.gz` and write
   `chrN.uniqueSNPs.vcf.gz`.
2. Randomly pick `n` samples as the target panel and write them with all SNPs to
   `target.chrN.full.vcf.gz`.
3. Write the remaining samples to `ref.chrN.excludeTarget.vcf.gz`.
4. Keep only SNPs whose ID appears on chromosome `N` of the UK Biobank PLINK data at
   `ukbb_path` and write the target samples to `target.chrN.typedOnly.vcf.gz`.
5. Unphase that file and mask a random `missingprop` fraction of its entries, writing
   `target.chrN.typedOnly.masked.vcf.gz`.

# Keywords
- `chrs`: chromosomes to process (default `[10]`).
- `n`: number of target samples (default `100`).
- `missingprop`: probability that each typed entry is masked (default `0.001`, i.e. 0.1%).
- `ukbb_path`: PLINK file prefix of the UK Biobank data passed to `SnpData`.

# Throws
- `ArgumentError` if `missingprop` is outside `[0, 1]` or `n` exceeds the number of
  samples in the VCF file.

Randomness comes from the global RNG (`shuffle!` and `rand()`), so seed it beforehand
for reproducible output.
"""
function filter_and_mask(;
    chrs = [10],
    n::Integer = 100,
    missingprop::Real = 0.001,
    ukbb_path::AbstractString = "/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered",
)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be in [0, 1], got $missingprop"))

    # The UK Biobank SNP table does not depend on the chromosome: load it once.
    ukbb = SnpData(ukbb_path)

    for chr in chrs
        rawfile = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
        uniquefile = "chr$chr.uniqueSNPs.vcf.gz"
        typedfile = "target.chr$chr.typedOnly.vcf.gz"

        # filter chromosome data for unique snps
        full_record_index = .!find_duplicate_marker(rawfile)
        VCFTools.filter(rawfile, full_record_index, 1:nsamples(rawfile),
            des = uniquefile, allow_multiallelic=false)

        # import VCF data with only unique SNPs (only sample IDs and record IDs are used)
        _, vcf_sampleID, _, _, vcf_record_ids, _, _ = convert_gt(Float32,
            uniquefile, save_snp_info=true, msg="importing")
        total_snps = length(vcf_record_ids)
        samples = length(vcf_sampleID)
        0 <= n <= samples || throw(ArgumentError(
            "n must be between 0 and the number of samples ($samples), got $n"))

        # generate target panel with all snps: n randomly chosen samples
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        VCFTools.filter(uniquefile, 1:total_snps, sample_idx,
            des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

        # generate reference panel without target samples
        VCFTools.filter(uniquefile, 1:total_snps, .!sample_idx,
            des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)

        # generate target file with n samples whose snps are in the UK Biobank.
        # Membership is tested record by record so that a record carrying several IDs
        # still maps to exactly one position of `record_idx`; flattening all IDs with
        # `vcat(ids...)` would shift every later index.
        on_chr = findall(==(string(chr)), ukbb.snp_info[!, :chromosome])
        ukbb_snpids = Set(ukbb.snp_info[on_chr, :snpid])
        record_idx = falses(total_snps)
        for (i, ids) in enumerate(vcf_record_ids)
            record_idx[i] = _matches_typed(ids, ukbb_snpids)
        end
        VCFTools.filter(uniquefile, record_idx, sample_idx,
            des = typedfile, allow_multiallelic=false)

        # unphase and mask a `missingprop` fraction (default 0.1%) of entries in target file.
        # One scalar rand() per entry, column by column, keeps the random stream
        # identical to the original loop.
        p = nrecords(typedfile)
        masks = falses(p, n)
        for j in 1:n, i in 1:p
            masks[i, j] = rand() < missingprop
        end
        mask_gt(typedfile, masks,
            des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)
    end
    return nothing
end
Random.seed!(2020)
@time filter_and_mask()

[32mfinding duplicate markers...100%|███████████████████████| Time: 0:15:54[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:30:00[39m
[32mimporting100%|██████████████████████████████████████████| Time: 0:13:19[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:16:39[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:29:59[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:16:48[39m


7507.164928 seconds (76.23 G allocations: 6.868 TiB, 16.87% gc time)


## MendelImpute on simulated data

In [2]:
# Work in the directory holding the filtered 1000 Genomes files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Convert the reference haplotypes from .vcf.gz to the compressed .jlso format;
# the masked target file is passed along as the second argument.
reffile, tgtfile, outfile = (
    "ref.chr10.excludeTarget.vcf.gz",
    "target.chr10.typedOnly.masked.vcf.gz",
    "ref.chr10.excludeTarget.jlso",
)
@time compress_haplotypes(reffile, tgtfile, outfile);

[32mimporting reference data...100%|████████████████████████| Time: 0:13:43[39m


1220.397483 seconds (7.36 G allocations: 677.930 GiB, 40.51% gc time)


In [3]:
# Phase and impute the masked chr10 target panel against the compressed (.jlso)
# reference panel; the result goes to `outfile`.
tgtfile, reffile, outfile = (
    "target.chr10.typedOnly.masked.vcf.gz",
    "ref.chr10.excludeTarget.jlso",
    "mendel.imputed.chr10.vcf.gz",
)
phase(tgtfile, reffile, outfile);

Number of threads = 8
Importing reference haplotype data...
Total windows = 949, averaging ~ 444 unique haplotypes per window.

Timings: 
    Data import                     = 21.6813 seconds
        import target data             = 0.462248 seconds
        import compressed haplotypes   = 21.219 seconds
    Computing haplotype pair        = 5.502 seconds
        BLAS3 mul! to get M and N      = 0.0694939 seconds per thread
        haplopair search               = 1.04007 seconds per thread
        initializing missing           = 0.00319779 seconds per thread
        allocating and viewing         = 0.00717046 seconds per thread
        index conversion               = 0.000639812 seconds per thread
    Phasing by win-win intersection = 1.6372 seconds
        Window-by-window intersection  = 0.0335358 seconds per thread
        Breakpoint search              = 0.368803 seconds per thread
        Recording result               = 0.146441 seconds per thread
    Imputation               

## Beagle 5.1 on simulated data

In [4]:
# Beagle 5.1 with 8 threads: same masked target and (uncompressed) reference panel.
`java -jar beagle.18May20.d20.jar gt=target.chr10.typedOnly.masked.vcf.gz ref=ref.chr10.excludeTarget.vcf.gz out=beagle.chr10.result nthreads=8` |> run

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 11:00 PM PST on 10 Feb 2021

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=target.chr10.typedOnly.masked.vcf.gz
  ref=ref.chr10.excludeTarget.vcf.gz
  out=beagle.chr10.result
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (10:60494-39154888)
Reference markers:     490,237
Study markers:           7,691

Burnin  iteration 1:           2 seconds
Burnin  iteration 2:           2 seconds
Burnin  iteration 3:           2 seconds
Burnin  iteration 4:           1 second
Burnin  iteration 5:           1 second
Burnin  iteration 6:           2 seconds

Phasing iteration 1:           2 seconds
Phasing iteration 2:           2 seconds
Phasing iteration 3:           2 seconds
Phasing iteration 4:           2 seconds
Phasing iteration 5: 

Process(`[4mjava[24m [4m-jar[24m [4mbeagle.18May20.d20.jar[24m [4mgt=target.chr10.typedOnly.masked.vcf.gz[24m [4mref=ref.chr10.excludeTarget.vcf.gz[24m [4mout=beagle.chr10.result[24m [4mnthreads=8[24m`, ProcessExited(0))

## Check error rate

In [7]:
# Work in the directory holding the filtered 1000 Genomes files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Load the imputed genotypes written by MendelImpute and Beagle, plus the unmasked
# full target panel that serves as the ground truth.
X_mendel = convert_gt(Float64, "mendel.imputed.chr10.vcf.gz")
X_beagle = convert_gt(Float64, "beagle.chr10.result.vcf.gz")
# X_impute = convert_gt(Float64, "impute5.chr10.result.vcf.gz")
Xtrue = convert_gt(Float64, "target.chr10.full.vcf.gz")
m, n = size(Xtrue) # matrix dimensions
# Error rate = fraction of genotype entries that differ from the truth.
# NOTE: @show prints the expression text itself, so editing these expressions
# changes the cell output.
@show mendel_error = sum(Xtrue .!= X_mendel) / m / n
@show beagle_error = sum(Xtrue .!= X_beagle) / m / n;
# @show impute5_error = sum(Xtrue .!= X_impute) / m / n;

mendel_error = (sum(Xtrue .!= X_mendel) / m) / n = 0.028378425943385303
beagle_error = (sum(Xtrue .!= X_beagle) / m) / n = 0.013444339688179194


## Impute5

On Hua's machine, Impute5's runtime = ??? seconds (with 8 threads), and its error rate is ???

# HRC experiment

In [4]:
chr = 10
# (number of records, number of samples) in the unique-SNP VCF file of this chromosome
map(count_fn -> count_fn("chr$chr.uniqueSNPs.vcf.gz"), (nrecords, nsamples))

(1920530, 27165)

In [None]:
# this script was run on Hua's computer

# cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")
# function filter_and_mask()
#     for chr in [10]
#         # filter chromosome data for unique snps
# #         data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
# #         full_record_index = .!find_duplicate_marker(data)
# #         VCFTools.filter(data, full_record_index, 1:nsamples(data), 
# #             des = "chr$chr.uniqueSNPs.vcf.gz", allow_multiallelic=false)

#         # import VCF data with only unique SNPs
#         _, vcf_sampleID, _, _, vcf_record_ids, _, _ = convert_ht(Bool, 
#             "chr$chr.uniqueSNPs.vcf.gz", save_snp_info=true, msg="importing")
#         total_snps = length(vcf_record_ids)
#         samples = length(vcf_sampleID)

#         # generate target panel with all snps
#         n = 1000
#         sample_idx = falses(samples)
#         sample_idx[1:n] .= true
#         shuffle!(sample_idx)
#         VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
#             sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

#         # generate reference panel without target samples
#         VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
#             .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)
        
#         # generate target file with 1000 samples whose snps are in the UK Biobank
#         ukbb = SnpData("/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered")
#         ukbb_snpids = ukbb.snp_info[findall(x -> x == string(chr), ukbb.snp_info[!, :chromosome]), :snpid]
#         match_idx = indexin(vcat(vcf_record_ids...), ukbb_snpids)
#         record_idx = falses(total_snps)
#         record_idx[findall(!isnothing, match_idx)] .= true
#         VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", record_idx, sample_idx, 
#             des = "target.chr$chr.typedOnly.vcf.gz", allow_multiallelic=false)

#         # unphase and mask 0.1% of entries in target file (missingprop = 0.001)
#         p = nrecords("target.chr$chr.typedOnly.vcf.gz")
#         masks = falses(p, n)
#         missingprop = 0.001
#         for j in 1:n, i in 1:p
#             rand() < missingprop && (masks[i, j] = true)
#         end
#         mask_gt("target.chr$chr.typedOnly.vcf.gz", masks, 
#             des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)
#     end
# end
# Random.seed!(2020)
# @time filter_and_mask()

## MendelImpute on simulated HRC data

In [None]:
# Work in the directory holding the HRC files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# Convert the HRC reference haplotypes from .vcf.gz to the compressed .jlso format;
# the masked target file is passed along as the second argument.
reffile, tgtfile, outfile = (
    "ref.chr10.excludeTarget.vcf.gz",
    "target.chr10.typedOnly.masked.vcf.gz",
    "ref.chr10.excludeTarget.jlso",
)
@time compress_haplotypes(reffile, tgtfile, outfile);

In [None]:
# Work in the directory holding the HRC files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# phase & impute chromosome 10.
# The file names must agree with the other HRC cells: `reffile` is the .jlso written
# by the compression cell, and `outfile` is the file read by the error-rate cell.
# (This cell previously pointed at chr20 files that no other HRC cell creates or reads.)
tgtfile = "target.chr10.typedOnly.masked.vcf.gz"
reffile = "ref.chr10.excludeTarget.jlso"
outfile = "mendel.imputed.chr10.vcf.gz"
phase(tgtfile, reffile, outfile);

## Beagle 5.1

In [None]:
# Beagle 5.1 on the HRC data: 8 threads, JVM heap capped at 15 GB (-Xmx15g).
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")
`java -Xmx15g -jar beagle.18May20.d20.jar gt=target.chr10.typedOnly.masked.vcf.gz ref=ref.chr10.excludeTarget.vcf.gz out=beagle.chr10.result nthreads=8` |> run

In [None]:
# Work in the directory holding the HRC files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# Load the imputed genotypes written by MendelImpute and Beagle, plus the unmasked
# full target panel that serves as the ground truth.
X_mendel = convert_gt(Float64, "mendel.imputed.chr10.vcf.gz")
X_beagle = convert_gt(Float64, "beagle.chr10.result.vcf.gz")
Xtrue = convert_gt(Float64, "target.chr10.full.vcf.gz")
m, n = size(Xtrue) # matrix dimensions
# Error rate = fraction of genotype entries that differ from the truth.
# NOTE: @show prints the expression text itself, so editing these expressions
# changes the cell output.
@show mendel_error = sum(Xtrue .!= X_mendel) / m / n
@show beagle_error = sum(Xtrue .!= X_beagle) / m / n;