# Find which SNPs in reference panels are on GWAS chip

One of the reviewers' main comments is that we should use real SNPs in our simulations. Thus, let's check whether the SNPs on the UK Biobank chip are also present in the HRC/1000 Genomes reference panels. 

In [6]:
using Revise
using VCFTools
using SnpArrays
using MendelImpute
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using Plots
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably to avoid oversubscription when Julia threads are used — confirm.
BLAS.set_num_threads(1)
# Display the number of Julia threads available to this session.
Threads.nthreads()

8

### UK Biobank SNPs

In [7]:
# Load the filtered UK Biobank PLINK data, then collect the SNP IDs whose
# chromosome entry in the SNP table is "10" and "20" respectively.
ukbb = SnpData("/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered")
chrom_10_snpids = ukbb.snp_info[findall(==("10"), ukbb.snp_info[!, :chromosome]), :snpid]
chrom_20_snpids = ukbb.snp_info[findall(==("20"), ukbb.snp_info[!, :chromosome]), :snpid]

# Report the total SNP count and the per-chromosome counts.
@show ukbb.snps
@show length(chrom_10_snpids)
@show length(chrom_20_snpids);

ukbb.snps = 470228
length(chrom_10_snpids) = 23580
length(chrom_20_snpids) = 12545


### 1000 genomes SNPs

In [8]:
# Import the 1000 Genomes chr20 genotypes together with per-record SNP info.
# The SNP IDs end up in `record_ids` (one vector of IDs per record).
(geno, sampleID, record_chr, record_pos, record_ids, record_ref, record_alt) = convert_gt(
    Float32,
    "/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered/chr20.uniqueSNPs.vcf.gz";
    save_snp_info = true,
    msg = "importing",
)

[32mimporting100%|██████████████████████████████████████████| Time: 0:05:00[39m


(Union{Missing, Float32}[0.0f0 0.0f0 … 0.0f0 0.0f0; 0.0f0 0.0f0 … 0.0f0 0.0f0; … ; 0.0f0 0.0f0 … 0.0f0 0.0f0; 0.0f0 0.0f0 … 0.0f0 0.0f0], ["HG00096", "HG00097", "HG00099", "HG00100", "HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"  …  "NA21128", "NA21129", "NA21130", "NA21133", "NA21135", "NA21137", "NA21141", "NA21142", "NA21143", "NA21144"], ["20", "20", "20", "20", "20", "20", "20", "20", "20", "20"  …  "20", "20", "20", "20", "20", "20", "20", "20", "20", "20"], [60479, 60522, 60571, 60828, 61098, 61138, 61270, 61279, 61388, 61651  …  62963996, 62964034, 62964109, 62964123, 62964448, 62964720, 62964968, 62964974, 62965015, 62965167], [["rs149529999"], ["rs150241001"], ["rs116145529"], ["rs187713677"], ["rs6078030"], ["rs552482647"], ["rs143291093"], ["rs189899941"], ["rs146681064"], ["rs76553454"]  …  ["rs149890816"], ["rs181883287"], ["rs541241205"], ["rs561035577"], ["rs151201761"], ["rs374676786"], ["rs557629530"], ["rs138685577"], ["rs149382639"], ["rs564138512

In [9]:
# Flatten the per-record ID vectors into one vector of IDs. `reduce(vcat, ...)`
# gives the same result as `vcat(record_ids...)` without splatting one argument
# per record into a single call.
thousand_genome_snps = reduce(vcat, record_ids)
# idx[i] is the position of reference ID i in the UK Biobank chr20 ID list,
# or `nothing` when that ID is not on the chip.
idx = indexin(thousand_genome_snps, chrom_20_snpids)
# Number of reference-panel IDs that also occur in the UK Biobank chr20 list.
count(!isnothing, idx)

12409

In [20]:
# Largest jump in reference-panel index between two consecutive matched (typed)
# SNPs; the number of unmatched markers between them is one less than this value.
# Unmatched runs before the first match or after the last match are not counted.
findall(!isnothing, idx) |> diff |> maximum

1946

**Conclusion:** The UK Biobank has 12545 SNPs on chromosome 20, of which 12409 are present in the 1000 Genomes panel. 

### HRC

In [86]:
# Load the filtered UK Biobank PLINK data, then collect the SNP IDs whose
# chromosome entry in the SNP table is "10" and "20" respectively.
ukbb = SnpData("/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered")
chrom_10_snpids = ukbb.snp_info[findall(==("10"), ukbb.snp_info[!, :chromosome]), :snpid]
chrom_20_snpids = ukbb.snp_info[findall(==("20"), ukbb.snp_info[!, :chromosome]), :snpid]

# Report the total SNP count and the per-chromosome counts.
@show ukbb.snps
@show length(chrom_10_snpids)
@show length(chrom_20_snpids);

ukbb.snps = 470228
length(chrom_10_snpids) = 23580
length(chrom_20_snpids) = 12545


In [87]:
# Import the HRC chr20 target genotypes together with per-record SNP info.
# The SNP IDs end up in `record_ids` (one vector of IDs per record).
(geno, sampleID, record_chr, record_pos, record_ids, record_ref, record_alt) = convert_gt(
    Float32,
    "/Users/biona001/.julia/dev/MendelImpute/data/HRC/target.chr20.full.vcf.gz";
    save_snp_info = true,
    msg = "importing",
)

[32mimporting100%|██████████████████████████████████████████| Time: 0:03:26[39m


(Union{Missing, Float32}[0.0f0 0.0f0 … 0.0f0 1.0f0; 0.0f0 0.0f0 … 0.0f0 1.0f0; … ; 0.0f0 0.0f0 … 0.0f0 1.0f0; 0.0f0 0.0f0 … 0.0f0 1.0f0], ["HG00127", "HG00131", "HG00267", "HG00328", "HG00338", "HG00345", "HG00583", "HG00599", "HG00640", "HG00734"  …  "EGAN00001013930", "EGAN00001013948", "EGAN00001013966", "EGAN00001013968", "EGAN00001099011", "EGAN00001099018", "EGAN00001029292", "EGAN00001029355", "EGAN00001029282", "EGAN00001375749"], ["20", "20", "20", "20", "20", "20", "20", "20", "20", "20"  …  "20", "20", "20", "20", "20", "20", "20", "20", "20", "20"], [60309, 60479, 60571, 60828, 61044, 61098, 61154, 61209, 61210, 61279  …  62963996, 62964123, 62964393, 62964448, 62964497, 62964720, 62964968, 62964974, 62965015, 62965185], [["rs574852966"], ["rs149529999"], ["rs116145529"], ["rs187713677"], ["rs535668403"], ["rs6078030"], ["."], ["."], ["."], ["rs189899941"]  …  ["rs149890816"], ["rs561035577"], ["."], ["rs151201761"], ["."], ["rs374676786"], ["rs557629530"], ["rs138685577"],

In [88]:
# Flatten the per-record ID vectors into one vector of IDs. `reduce(vcat, ...)`
# gives the same result as `vcat(record_ids...)` without splatting one argument
# per record into a single call.
HRC_snps = reduce(vcat, record_ids)
# idx[i] is the position of HRC ID i in the UK Biobank chr20 ID list,
# or `nothing` when that ID is not on the chip.
idx = indexin(HRC_snps, chrom_20_snpids)
# Number of HRC IDs that also occur in the UK Biobank chr20 list.
count(!isnothing, idx)

12354

In [89]:
# Largest jump in reference-panel index between two consecutive matched (typed)
# SNPs; the number of unmatched markers between them is one less than this value.
# Unmatched runs before the first match or after the last match are not counted.
findall(!isnothing, idx) |> diff |> maximum

2689

**Conclusion**: The UK Biobank has 12545 SNPs on chromosome 20, of which 12354 are present in the HRC reference panel.

# 1000 genomes experiment

In [79]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

"""
    filter_and_mask(; n=100, missingprop=0.001)

Create the simulation inputs for chromosome 20 from `chr20.uniqueSNPs.vcf.gz`
in the current working directory.

Files written:
- `target.chr20.full.vcf.gz`: `n` randomly selected samples, all records.
- `ref.chr20.excludeTarget.vcf.gz`: all remaining samples, all records.
- `target.chr20.typedOnly.vcf.gz`: the `n` target samples, restricted to records
  with at least one ID listed for chromosome 20 in the UK Biobank SNP table.
- `target.chr20.typedOnly.masked.vcf.gz`: the typed-only file, unphased, with
  each entry masked independently with probability `missingprop`.

# Keywords
- `n`: number of samples moved into the target panel (default `100`).
- `missingprop`: per-entry masking probability (default `0.001`, i.e. 0.1%).

# Throws
- `ArgumentError` if `n` exceeds the number of samples in the VCF or is
  negative, or if `missingprop` is outside `[0, 1]`.
"""
function filter_and_mask(; n::Integer = 100, missingprop::Real = 0.001)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be in [0, 1], got $missingprop"))

    # The chip's SNP table is the same for every chromosome, so load it once.
    ukbb = SnpData("/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered")

    for chr in [20]
        # filter chromosome data for unique snps
        # (kept for provenance: this is how chr$chr.uniqueSNPs.vcf.gz was produced)
#         data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
#         full_record_index = .!find_duplicate_marker(data)
#         VCFTools.filter(data, full_record_index, 1:nsamples(data), 
#             des = "chr$chr.uniqueSNPs.vcf.gz", allow_multiallelic=false)

        # import VCF data with only unique SNPs
        _, vcf_sampleID, _, _, vcf_record_ids, _, _ = convert_gt(Float32, 
            "chr$chr.uniqueSNPs.vcf.gz", save_snp_info=true, msg="importing")
        total_snps = length(vcf_record_ids)
        samples = length(vcf_sampleID)
        0 <= n <= samples ||
            throw(ArgumentError("n must be between 0 and $samples (samples in VCF), got $n"))

        # generate target panel with all snps: pick n samples at random
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

        # generate reference panel without target samples
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)

        # generate target file with n samples whose snps are in the UK Biobank.
        # A record is kept when any of its IDs is on the chip. Testing record by
        # record keeps record_idx aligned with the VCF even if a record carries
        # more than one ID (flattening the ID lists would shift the indices).
        ukbb_snpids = Set(ukbb.snp_info[
            findall(x -> x == string(chr), ukbb.snp_info[!, :chromosome]), :snpid])
        record_idx = falses(total_snps)
        for (i, ids) in enumerate(vcf_record_ids)
            record_idx[i] = any(in(ukbb_snpids), ids)
        end
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", record_idx, sample_idx, 
            des = "target.chr$chr.typedOnly.vcf.gz", allow_multiallelic=false)

        # unphase and mask a `missingprop` fraction (0.1% by default) of the
        # entries in the target file. Scalar draws in this loop order are kept
        # so that the sequence of rand() calls is unchanged under a fixed seed.
        p = nrecords("target.chr$chr.typedOnly.vcf.gz")
        masks = falses(p, n)
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        mask_gt("target.chr$chr.typedOnly.vcf.gz", masks, 
            des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)
    end
    return nothing
end
Random.seed!(2020)
@time filter_and_mask()

[32mimporting100%|██████████████████████████████████████████| Time: 0:05:12[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:09:27[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:23:59[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:09:57[39m


2970.249967 seconds (23.56 G allocations: 2.102 TiB, 17.23% gc time)


## MendelImpute on simulated data

In [80]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Compress the reference haplotypes from .vcf.gz into .jlso format; the masked
# typed-only target file is passed alongside the reference panel.
reffile, tgtfile, outfile = (
    "ref.chr20.excludeTarget.vcf.gz",
    "target.chr20.typedOnly.masked.vcf.gz",
    "ref.chr20.excludeTarget.jlso",
)
@time compress_haplotypes(reffile, tgtfile, outfile);

[32mimporting reference data...100%|████████████████████████| Time: 0:05:07[39m


443.177396 seconds (3.27 G allocations: 263.229 GiB, 28.87% gc time)


In [82]:
# cd explicitly so this cell does not depend on the working directory left
# behind by whichever cell ran before it (the HRC cells change it).
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# phase & impute the masked typed-only target against the compressed reference
tgtfile = "target.chr20.typedOnly.masked.vcf.gz"
reffile = "ref.chr20.excludeTarget.jlso"
outfile = "mendel.imputed.chr20.vcf.gz"
phase(tgtfile, reffile, outfile);

Number of threads = 1
Importing reference haplotype data...


[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:05[39m


Total windows = 546, averaging ~ 449 unique haplotypes per window.

Timings: 
    Data import                     = 10.6097 seconds
        import target data             = 0.161234 seconds
        import compressed haplotypes   = 10.4485 seconds
    Computing haplotype pair        = 4.17583 seconds
        BLAS3 mul! to get M and N      = 0.207207 seconds per thread
        haplopair search               = 3.93168 seconds per thread
        initializing missing           = 0.0108198 seconds per thread
        allocating and viewing         = 0.0241775 seconds per thread
        index conversion               = 0.000849231 seconds per thread
    Phasing by win-win intersection = 0.451995 seconds
        Window-by-window intersection  = 0.119888 seconds per thread
        Breakpoint search              = 0.259573 seconds per thread
        Recording result               = 0.0685291 seconds per thread
    Imputation                     = 6.03114 seconds
        Imputing missing          

## Beagle 5.1 on simulated data

In [83]:
# run beagle 5.1 (8 thread)
# cd explicitly so the relative file names below resolve to the 1000 Genomes
# files regardless of which cell ran last (the HRC cells use the same names).
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
run(`java -jar beagle.18May20.d20.jar gt=target.chr20.typedOnly.masked.vcf.gz ref=ref.chr20.excludeTarget.vcf.gz out=beagle.chr20.result nthreads=8`)

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 03:50 PM PST on 09 Feb 2021

Command line: java -Xmx3641m -jar beagle.18May20.d20.jar
  gt=target.chr20.typedOnly.masked.vcf.gz
  ref=ref.chr20.excludeTarget.vcf.gz
  out=beagle.chr20.result
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:       2,404
Study samples:             100

Window 1 (20:60479-40060263)
Reference markers:     399,602
Study markers:           6,892

Burnin  iteration 1:           2 seconds
Burnin  iteration 2:           1 second
Burnin  iteration 3:           1 second
Burnin  iteration 4:           1 second
Burnin  iteration 5:           1 second
Burnin  iteration 6:           1 second

Phasing iteration 1:           1 second
Phasing iteration 2:           1 second
Phasing iteration 3:           1 second
Phasing iteration 4:           1 second
Phasing iteration 5:        

Process(`[4mjava[24m [4m-jar[24m [4mbeagle.18May20.d20.jar[24m [4mgt=target.chr20.typedOnly.masked.vcf.gz[24m [4mref=ref.chr20.excludeTarget.vcf.gz[24m [4mout=beagle.chr20.result[24m [4mnthreads=8[24m`, ProcessExited(0))

## Check error rate

In [85]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

# Load the imputed genotypes from each program and the unmasked full target
# panel, which serves as the truth.
X_mendel = convert_gt(Float64, "mendel.imputed.chr20.vcf.gz")
X_beagle = convert_gt(Float64, "beagle.chr20.result.vcf.gz")
# X_impute = convert_gt(Float64, "impute5.chr20.result.vcf.gz")
Xtrue = convert_gt(Float64, "target.chr20.full.vcf.gz")
m, n = size(Xtrue) # matrix dimensions
# Error rate = fraction of all m*n entries of Xtrue that differ from the imputed
# matrix, so every entry counts, not only the masked or untyped ones.
# NOTE(review): assumes none of the matrices contains `missing`; a `missing`
# entry would propagate through `.!=` and make the sum `missing` — confirm.
@show mendel_error = sum(Xtrue .!= X_mendel) / m / n
@show beagle_error = sum(Xtrue .!= X_beagle) / m / n;
# @show impute5_error = sum(Xtrue .!= X_impute) / m / n;

mendel_error = (sum(Xtrue .!= X_mendel) / m) / n = 0.028955696155298534
beagle_error = (sum(Xtrue .!= X_beagle) / m) / n = 0.013908210159301599


## Impute5

On Hua's machine, Impute5's runtime = 134 seconds (with 8 threads), and its error rate is 0.014587312888160515

# HRC experiment

In [90]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")
vcffile = "chr20.uniqueSNPs.vcf.gz"
# (number of records, number of samples) in the HRC chr20 VCF
(nrecords(vcffile), nsamples(vcffile))

(882742, 27165)

In [3]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

"""
    filter_and_mask(; n=1000, missingprop=0.001)

Create the simulation inputs for chromosome 20 from `chr20.uniqueSNPs.vcf.gz`
in the current working directory.

Files written:
- `target.chr20.full.vcf.gz`: `n` randomly selected samples, all records.
- `ref.chr20.excludeTarget.vcf.gz`: all remaining samples, all records.
- `target.chr20.typedOnly.vcf.gz`: the `n` target samples, restricted to records
  with at least one ID listed for chromosome 20 in the UK Biobank SNP table.
- `target.chr20.typedOnly.masked.vcf.gz`: the typed-only file, unphased, with
  each entry masked independently with probability `missingprop`.

# Keywords
- `n`: number of samples moved into the target panel (default `1000`).
- `missingprop`: per-entry masking probability (default `0.001`, i.e. 0.1%).

# Throws
- `ArgumentError` if `n` exceeds the number of samples in the VCF or is
  negative, or if `missingprop` is outside `[0, 1]`.
"""
function filter_and_mask(; n::Integer = 1000, missingprop::Real = 0.001)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be in [0, 1], got $missingprop"))

    # The chip's SNP table is the same for every chromosome, so load it once.
    ukbb = SnpData("/Users/biona001/.julia/dev/MendelImpute/data/ukbb/ukb.plink.filtered")

    for chr in [20]
        # filter chromosome data for unique snps
        # NOTE(review): this commented-out step still points at the 1000 Genomes
        # raw file and looks copied from the 1000 Genomes cell — confirm how the
        # HRC chr$chr.uniqueSNPs.vcf.gz was actually produced.
#         data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
#         full_record_index = .!find_duplicate_marker(data)
#         VCFTools.filter(data, full_record_index, 1:nsamples(data), 
#             des = "chr$chr.uniqueSNPs.vcf.gz", allow_multiallelic=false)

        # import VCF data with only unique SNPs
        _, vcf_sampleID, _, _, vcf_record_ids, _, _ = convert_gt(Float32, 
            "chr$chr.uniqueSNPs.vcf.gz", save_snp_info=true, msg="importing")
        total_snps = length(vcf_record_ids)
        samples = length(vcf_sampleID)
        0 <= n <= samples ||
            throw(ArgumentError("n must be between 0 and $samples (samples in VCF), got $n"))

        # generate target panel with all snps: pick n samples at random
        sample_idx = falses(samples)
        sample_idx[1:n] .= true
        shuffle!(sample_idx)
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

        # generate reference panel without target samples
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", 1:total_snps, 
            .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)

        # generate target file with n samples whose snps are in the UK Biobank.
        # A record is kept when any of its IDs is on the chip. Testing record by
        # record keeps record_idx aligned with the VCF even if a record carries
        # more than one ID (flattening the ID lists would shift the indices).
        ukbb_snpids = Set(ukbb.snp_info[
            findall(x -> x == string(chr), ukbb.snp_info[!, :chromosome]), :snpid])
        record_idx = falses(total_snps)
        for (i, ids) in enumerate(vcf_record_ids)
            record_idx[i] = any(in(ukbb_snpids), ids)
        end
        VCFTools.filter("chr$chr.uniqueSNPs.vcf.gz", record_idx, sample_idx, 
            des = "target.chr$chr.typedOnly.vcf.gz", allow_multiallelic=false)

        # unphase and mask a `missingprop` fraction (0.1% by default) of the
        # entries in the target file. Scalar draws in this loop order are kept
        # so that the sequence of rand() calls is unchanged under a fixed seed.
        p = nrecords("target.chr$chr.typedOnly.vcf.gz")
        masks = falses(p, n)
        for j in 1:n, i in 1:p
            rand() < missingprop && (masks[i, j] = true)
        end
        mask_gt("target.chr$chr.typedOnly.vcf.gz", masks, 
            des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)
    end
    return nothing
end
Random.seed!(2020)
@time filter_and_mask()

[32mimporting100%|██████████████████████████████████████████| Time: 1:20:54[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 2:04:47[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 4:34:37[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 1:49:03[39m


36020.065104 seconds (335.89 G allocations: 30.977 TiB, 16.56% gc time)


## MendelImpute on simulated HRC data

In [4]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# Compress the reference haplotypes from .vcf.gz into .jlso format; the masked
# typed-only target file is passed alongside the reference panel.
reffile, tgtfile, outfile = (
    "ref.chr20.excludeTarget.vcf.gz",
    "target.chr20.typedOnly.masked.vcf.gz",
    "ref.chr20.excludeTarget.jlso",
)
@time compress_haplotypes(reffile, tgtfile, outfile);

[32mimporting reference data...100%|████████████████████████| Time: 1:36:14[39m


7052.777154 seconds (46.27 G allocations: 3.441 TiB, 24.75% gc time)


In [10]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# Phase and impute the masked typed-only target against the compressed reference.
tgtfile, reffile, outfile = (
    "target.chr20.typedOnly.masked.vcf.gz",
    "ref.chr20.excludeTarget.jlso",
    "mendel.imputed.chr20.vcf.gz",
)
phase(tgtfile, reffile, outfile);

Number of threads = 8
Importing reference haplotype data...


[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:00:08[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:08[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:10[39m


Total windows = 823, averaging ~ 314 unique haplotypes per window.

Timings: 
    Data import                     = 26.2951 seconds
        import target data             = 3.05574 seconds
        import compressed haplotypes   = 23.2393 seconds
    Computing haplotype pair        = 8.12376 seconds
        BLAS3 mul! to get M and N      = 0.0771965 seconds per thread
        haplopair search               = 5.58686 seconds per thread
        initializing missing           = 0.0426967 seconds per thread
        allocating and viewing         = 0.00691445 seconds per thread
        index conversion               = 0.0117964 seconds per thread
    Phasing by win-win intersection = 8.54653 seconds
        Window-by-window intersection  = 7.63571 seconds per thread
        Breakpoint search              = 0.0981065 seconds per thread
        Recording result               = 0.456344 seconds per thread
    Imputation                     = 15.521 seconds
        Imputing missing              

## Beagle 5.1

In [5]:
# Run Beagle 5.1 with 8 threads and a 15g Java heap limit on the HRC files.
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")
run(Cmd([
    "java", "-Xmx15g", "-jar", "beagle.18May20.d20.jar",
    "gt=target.chr20.typedOnly.masked.vcf.gz",
    "ref=ref.chr20.excludeTarget.vcf.gz",
    "out=beagle.chr20.result",
    "nthreads=8",
]))

beagle.18May20.d20.jar (version 5.1)
Copyright (C) 2014-2018 Brian L. Browning
Enter "java -jar beagle.18May20.d20.jar" to list command line argument
Start time: 01:00 PM PST on 10 Feb 2021

Command line: java -Xmx13653m -jar beagle.18May20.d20.jar
  gt=target.chr20.typedOnly.masked.vcf.gz
  ref=ref.chr20.excludeTarget.vcf.gz
  out=beagle.chr20.result
  nthreads=8

No genetic map is specified: using 1 cM = 1 Mb

Reference samples:      26,165
Study samples:           1,000

Window 1 (20:60309-40060275)
Reference markers:     528,057
Study markers:           6,899

Burnin  iteration 1:           18 seconds
Burnin  iteration 2:           17 seconds
Burnin  iteration 3:           16 seconds
Burnin  iteration 4:           17 seconds
Burnin  iteration 5:           17 seconds
Burnin  iteration 6:           20 seconds

Phasing iteration 1:           22 seconds
Phasing iteration 2:           19 seconds
Phasing iteration 3:           18 seconds
Phasing iteration 4:           12 seconds
Phasing 

Process(`[4mjava[24m [4m-Xmx15g[24m [4m-jar[24m [4mbeagle.18May20.d20.jar[24m [4mgt=target.chr20.typedOnly.masked.vcf.gz[24m [4mref=ref.chr20.excludeTarget.vcf.gz[24m [4mout=beagle.chr20.result[24m [4mnthreads=8[24m`, ProcessExited(0))

In [7]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/HRC")

# Load the imputed genotypes from each program and the unmasked full target
# panel, which serves as the truth.
X_mendel = convert_gt(Float64, "mendel.imputed.chr20.vcf.gz")
X_beagle = convert_gt(Float64, "beagle.chr20.result.vcf.gz")
Xtrue = convert_gt(Float64, "target.chr20.full.vcf.gz")
m, n = size(Xtrue) # matrix dimensions
# Error rate = fraction of all m*n entries of Xtrue that differ from the imputed
# matrix, so every entry counts, not only the masked or untyped ones.
# NOTE(review): assumes none of the matrices contains `missing`; a `missing`
# entry would propagate through `.!=` and make the sum `missing` — confirm.
@show mendel_error = sum(Xtrue .!= X_mendel) / m / n
@show beagle_error = sum(Xtrue .!= X_beagle) / m / n;

mendel_error = (sum(Xtrue .!= X_mendel) / m) / n = 0.008185619354239403
beagle_error = (sum(Xtrue .!= X_beagle) / m) / n = 0.002821281869447698


In [8]:
# Ratio of MendelImpute's error rate to Beagle's, taken from the variables
# computed in the error-rate cell rather than hand-copied literals, so the
# ratio cannot go stale when that cell is re-run.
mendel_error / beagle_error

2.901383035450422