# Find which SNPs in the reference panels are on GWAS chips

One of the reviewers' main comments is that we should use real SNPs in our simulations. Thus, let's check whether the SNPs on the Illumina chips are also present in the HRC and 1000 Genomes reference panels. 

The SNP lists for the Illumina chips were downloaded as `.bed` files from:
+ https://support.illumina.com/array/array_kits/humanomni2_5-8_beadchip_kit/downloads.html
+ https://support.illumina.com/array/array_kits/humanomni5-4-beadchip-kit/downloads.html

In [2]:
using Revise
using VCFTools
using SnpArrays
using MendelImpute
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using CSV
using DataFrames
# using ProfileView

# Restrict BLAS to a single thread, then display how many Julia threads this session has
BLAS.set_num_threads(1)
Threads.nthreads()

1

# 1000 genomes reference panel

In [40]:
# Import the chr20 1000 Genomes VCF (unique SNPs only) with Float32 genotypes.
# save_snp_info=true makes convert_gt also return the sample and per-record annotations
# unpacked below; record_pos is what later cells match against the chip positions.
# SNP IDs are in record_ids
geno, sampleID, record_chr, record_pos, record_ids, record_ref, record_alt = convert_gt(Float32, 
    "/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered/chr20.uniqueSNPs.vcf.gz",
    save_snp_info=true, msg="importing");

importing 100%|██████████████████████████████████████████| Time: 0:05:27


### Chr20 SNPs on Illumina Omni2.5

In [43]:
# Load the Omni2.5 chip manifest; skipto = 2 skips the first line of the file.
# The DataFrame sink is passed explicitly because current CSV.jl releases require one.
bed = CSV.read("/Users/biona001/.julia/dev/MendelImpute/data/omni_chips/InfiniumOmni2-5-8v1-5_A1.bed", 
    DataFrame; skipto = 2, header=false, delim='\t')
# column 1 holds the chromosome label, column 3 is used as the SNP position
chrom_20_snppos = bed[findall(==("chr20"), bed[!, 1]), 3]

@show size(bed, 1)
@show length(chrom_20_snppos)

# indexin yields `nothing` for every chip position absent from record_pos, so the count
# below is the number of chr20 chip positions that also occur in the reference panel
idx = indexin(chrom_20_snppos, record_pos)
println("Number of SNPs shared by Omni2.5 and 1000 genomes on chr 20 = ", count(!isnothing, idx))

size(bed, 1) = 2379684
length(chrom_20_snppos) = 56845
Number of SNPs shared by Omni2.5 and 1000 genomes on chr 20 = 52978


### Chr20 SNPs on Illumina Omni5.4

In [44]:
# Load the Omni5.4 chip manifest; skipto = 2 skips the first line of the file.
# The DataFrame sink is passed explicitly because current CSV.jl releases require one.
bed = CSV.read("/Users/biona001/.julia/dev/MendelImpute/data/omni_chips/InfiniumOmni5-4v1-2_A1.bed", 
    DataFrame; skipto = 2, header=false, delim='\t')
# column 1 holds the chromosome label, column 3 is used as the SNP position
chrom_20_snppos = bed[findall(==("chr20"), bed[!, 1]), 3]

@show size(bed, 1)
@show length(chrom_20_snppos)

# indexin yields `nothing` for every chip position absent from record_pos, so the count
# below is the number of chr20 chip positions that also occur in the reference panel
idx = indexin(chrom_20_snppos, record_pos)
println("Number of SNPs shared by Omni5.4 and 1000 genomes on chr 20 = ", count(!isnothing, idx))

size(bed, 1) = 4320325
length(chrom_20_snppos) = 103318
Number of SNPs shared by Omni5.4 and 1000 genomes on chr 20 = 92652


# HRC reference panel

In [20]:
# SNP IDs are in record_ids
geno, sampleID, record_chr, record_pos, record_ids, record_ref, record_alt = convert_gt(Float32, 
    "/Users/biona001/.julia/dev/MendelImpute/data/HRC/target.chr10.full.vcf.gz",
    save_snp_info=true, msg="importing");

importing 100%|██████████████████████████████████████████| Time: 0:12:17


### Chr10 SNPs on Illumina Omni2.5

In [39]:
# Load the Omni2.5 chip manifest; skipto = 2 skips the first line of the file.
# The DataFrame sink is passed explicitly because current CSV.jl releases require one.
bed = CSV.read("/Users/biona001/.julia/dev/MendelImpute/data/omni_chips/InfiniumOmni2-5-8v1-5_A1.bed", 
    DataFrame; skipto = 2, header=false, delim='\t')
# column 1 holds the chromosome label, column 3 is used as the SNP position
chrom_10_snppos = bed[findall(==("chr10"), bed[!, 1]), 3]

@show size(bed, 1)
@show length(chrom_10_snppos)

# indexin yields `nothing` for every panel position absent from the chip, so the count
# below is the number of reference-panel positions that also occur on the chip for chr10
idx = indexin(record_pos, chrom_10_snppos)
println("Number of SNPs shared by Omni2.5 and HRC on chr 10 = ", count(!isnothing, idx))

size(bed, 1) = 2379684
length(chrom_10_snppos) = 119731
Number of SNPs shared by Omni2.5 and HRC on chr 10 = 110887


### Chr10 SNPs on Illumina Omni5.4

In [37]:
# Load the Omni5.4 chip manifest; skipto = 2 skips the first line of the file.
# The DataFrame sink is passed explicitly because current CSV.jl releases require one.
bed = CSV.read("/Users/biona001/.julia/dev/MendelImpute/data/omni_chips/InfiniumOmni5-4v1-2_A1.bed", 
    DataFrame; skipto = 2, header=false, delim='\t')
# column 1 holds the chromosome label, column 3 is used as the SNP position
chrom_10_snppos = bed[findall(==("chr10"), bed[!, 1]), 3]

@show size(bed, 1)
@show length(chrom_10_snppos)

# indexin yields `nothing` for every panel position absent from the chip, so the count
# below is the number of reference-panel positions that also occur on the chip for chr10
idx = indexin(record_pos, chrom_10_snppos)
println("Number of SNPs shared by Omni5.4 and HRC on chr 10 = ", count(!isnothing, idx))

size(bed, 1) = 4320325
length(chrom_10_snppos) = 206133
Number of SNPs shared by Omni5.4 and HRC on chr 10 = 191210


# 1000 genomes experiment

In [79]:
# filter_and_mask reads and writes its VCF files through relative paths, so switch to the
# directory holding the filtered 1000 Genomes data first
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
"""
    filter_and_mask(chr::Int; n=100, missingprop=0.001, chip_bed=...)

Build the target/reference files for one chromosome of the imputation experiment.

Starting from `chr\$chr.uniqueSNPs.vcf.gz` in the current working directory, write
(also into the current working directory):

- `target.chr\$chr.full.vcf.gz`: `n` randomly chosen samples, all SNPs
- `ref.chr\$chr.excludeTarget.vcf.gz`: the remaining samples, all SNPs
- `target.chr\$chr.typedOnly.vcf.gz`: the `n` target samples restricted to SNPs whose
  position appears on the genotyping chip listed in `chip_bed`
- `target.chr\$chr.typedOnly.masked.vcf.gz`: the typed-only target, unphased and with a
  random fraction `missingprop` of its entries masked

# Keywords
- `n`: number of target samples to draw (default 100).
- `missingprop`: probability that any given target entry is masked (default 0.001,
  i.e. 0.1%).
- `chip_bed`: tab-delimited BED file of chip SNPs; column 1 is matched against
  `"chr\$chr"` and column 3 is used as the SNP position.

Returns whatever `mask_gt` returns.

# Throws
- `ArgumentError` if `n` is not between 1 and the number of samples in the input VCF, or
  if `missingprop` is outside `[0, 1]`.

Sample selection and masking draw from the global RNG; call `Random.seed!` beforehand
for reproducible output.
"""
function filter_and_mask(
    chr::Int;
    n::Int = 100,
    missingprop::Real = 0.001,
    chip_bed::AbstractString = "/home/biona001/omni_chips/InfiniumOmni5-4v1-2_A1.bed",
)
    # The unique-SNP input file was produced beforehand with (kept for reference):
    #     data = "../beagle_raw/chr$chr.1kg.phase3.v5a.vcf.gz"
    #     full_record_index = .!find_duplicate_marker(data)
    #     VCFTools.filter(data, full_record_index, 1:nsamples(data), 
    #         des = "chr$chr.uniqueSNPs.vcf.gz", allow_multiallelic=false)
    unique_vcf = "chr$chr.uniqueSNPs.vcf.gz"
    typed_vcf = "target.chr$chr.typedOnly.vcf.gz"

    # import VCF data with only unique SNPs (only sample IDs and positions are needed)
    _, vcf_sampleID, _, vcf_record_pos, _, _, _ = convert_gt(UInt8, 
        unique_vcf, save_snp_info=true, msg="importing")
    total_snps = length(vcf_record_pos)
    samples = length(vcf_sampleID)

    # validate before any output file is written
    1 <= n <= samples || throw(ArgumentError(
        "n must be between 1 and the number of samples ($samples), got $n"))
    0 <= missingprop <= 1 || throw(ArgumentError(
        "missingprop must lie in [0, 1], got $missingprop"))

    # pick n target samples at random: mark the first n, then shuffle the mask
    sample_idx = falses(samples)
    sample_idx[1:n] .= true
    shuffle!(sample_idx)

    # generate target panel with all snps
    VCFTools.filter(unique_vcf, 1:total_snps, 
        sample_idx, des = "target.chr$chr.full.vcf.gz", allow_multiallelic=false)

    # generate reference panel without target samples
    VCFTools.filter(unique_vcf, 1:total_snps, 
        .!sample_idx, des = "ref.chr$chr.excludeTarget.vcf.gz", allow_multiallelic=false)

    # generate target file with the n target samples, keeping only SNPs whose position
    # is listed on the genotyping chip
    bed = CSV.read(chip_bed, DataFrame; skipto = 2, header=false, delim='\t')
    typed_snppos = bed[findall(==("chr$chr"), bed[!, 1]), 3]
    # indexin gives `nothing` for VCF positions that are not on the chip
    record_idx = .!isnothing.(indexin(vcf_record_pos, typed_snppos))
    VCFTools.filter(unique_vcf, record_idx, sample_idx, 
        des = typed_vcf, allow_multiallelic=false)

    # unphase and mask a fraction `missingprop` of the entries in the target file.
    # One scalar rand() per entry, sample by sample, so a given seed always produces
    # the same mask.
    p = nrecords(typed_vcf)
    masks = falses(p, n)
    for j in 1:n, i in 1:p
        masks[i, j] = rand() < missingprop
    end
    return mask_gt(typed_vcf, masks, 
        des="target.chr$chr.typedOnly.masked.vcf.gz", unphase=true)
end 
# Seed the global RNG so the shuffle! and rand() calls inside filter_and_mask give the
# same target samples and masked entries on every run, then time the chr20 pipeline
Random.seed!(2020)
@time filter_and_mask(20)

importing 100%|██████████████████████████████████████████| Time: 0:05:12
filtering vcf file... 100%|██████████████████████████████| Time: 0:09:27
filtering vcf file... 100%|██████████████████████████████| Time: 0:23:59
filtering vcf file... 100%|██████████████████████████████| Time: 0:09:57


2970.249967 seconds (23.56 G allocations: 2.102 TiB, 17.23% gc time)
