# Chromosome Painting

In [2]:
using Revise
using VCFTools
using MendelImpute
using VariantCallFormat
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using DataFrames
using Plots
using DelimitedFiles
using JLSO
using StatsBase
# using ProfileView

# Restrict BLAS to a single thread
# (presumably to avoid oversubscription with Julia-level threading — TODO confirm)
BLAS.set_num_threads(1)

# Download population data

Download the [population codes](ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/) for each 1000 Genomes sample. The population codes are explained [here](https://www.internationalgenome.org/category/population/), and their inclusion criteria are explained [here](https://www.coriell.org/1/NHGRI/About/Guidelines-for-Referring-to-Populations).

In [2]:
# run this code in terminal
# wget -r -l3 -N --no-parent ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/

# Get each sample's population and super-population origin

In [3]:
# Lookup table from 1000 Genomes sample ID to its population code (e.g. "HG01791" => "GBR")
refID_to_population = thousand_genome_samples_to_population()

Dict{String, String} with 2504 entries:
  "HG01791" => "GBR"
  "HG02736" => "PJL"
  "HG00182" => "FIN"
  "HG03914" => "BEB"
  "HG00149" => "GBR"
  "NA12156" => "CEU"
  "HG02642" => "GWD"
  "HG02851" => "GWD"
  "NA19835" => "ASW"
  "NA19019" => "LWK"
  "HG01131" => "CLM"
  "HG03578" => "MSL"
  "NA18550" => "CHB"
  "HG02401" => "CDX"
  "HG01350" => "CLM"
  "HG03973" => "ITU"
  "NA07000" => "CEU"
  "HG01709" => "IBS"
  "HG01395" => "PUR"
  "HG01980" => "PEL"
  "HG01979" => "PEL"
  "HG01122" => "CLM"
  "HG03869" => "ITU"
  "HG03729" => "ITU"
  "NA19920" => "ASW"
  ⋮         => ⋮

# Separate admixed populations

We will use samples from:

- MXL: Mexican Ancestry in Los Angeles
- PUR: Puerto Rican in Puerto Rico
- ASW: African Ancestry in SW USA
- CLM: Colombian in Medellin, Colombia
- PEL: Peruvian in Lima, Peru
- ACB: African Caribbean in Barbados
- CHD: Chinese in Metropolitan Denver, Colorado, USA (actually NOT present in our data)
- GIH: Gujarati Indians in Houston, Texas, USA
- ITU: Indian Telugu in the UK
- STU: Sri Lankan Tamil in the UK

as targets to infer ancestry. These populations show high degrees of recent admixture and are also highly heterogeneous in their admixture. The remaining populations will be used as the reference panel.

We may wish to also exclude:

Detailed population descriptions are found [here](https://www.coriell.org/1/NHGRI/About/Guidelines-for-Referring-to-Populations).

In [6]:
# need full sample ID list
# NOTE: absolute path to a local copy of the chr18 VCF; adjust for your machine
data = "/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/beagle_raw/chr18.1kg.phase3.v5a.vcf.gz"
# positions in this vector are used later as sample indices when filtering the VCF
sampleIDs = sampleID(data)

2504-element Vector{String}:
 "HG00096"
 "HG00097"
 "HG00099"
 "HG00100"
 "HG00101"
 "HG00102"
 "HG00103"
 "HG00105"
 "HG00106"
 "HG00107"
 "HG00108"
 "HG00109"
 "HG00110"
 ⋮
 "NA21126"
 "NA21127"
 "NA21128"
 "NA21129"
 "NA21130"
 "NA21133"
 "NA21135"
 "NA21137"
 "NA21141"
 "NA21142"
 "NA21143"
 "NA21144"

In [7]:
# Partition samples by position: members of the admixed populations become
# imputation targets, everyone else stays in the reference panel.
admixed = ["MXL", "PUR", "CLM", "PEL", "ASW", "ACB", "GIH", "ITU", "STU"]
sample_idx = findall(id -> refID_to_population[id] in admixed, sampleIDs)
ref_idx = setdiff(eachindex(sampleIDs), sample_idx)
@show length(ref_idx)
@show length(sample_idx);

length(ref_idx) = 1693
length(sample_idx) = 811


In [6]:
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/country_origin2")

"""
    filter_and_mask(ref_idx, sample_idx, refID_to_population;
                    p = 50000, missingprop = 0.001)

Split `chr18.uniqueSNPs.vcf.gz` (read from the current working directory) into a
target panel and a reference panel, keep the `p` best-ranked ancestry informative
markers (AIMs) as the typed SNPs of the target, and finally unphase and randomly
mask the typed target file.

# Arguments
- `ref_idx`: sample indices written to the reference panel.
- `sample_idx`: sample indices written to the target panel.
- `refID_to_population`: forwarded to `VCFTools.aim_select` to rank the markers.

# Keywords
- `p`: number of top-ranked markers kept as typed SNPs (default `50000`).
- `missingprop`: probability with which each typed genotype entry is masked
  (default `0.001`, i.e. 0.1%).

# Files written (current working directory)
- `target.chr18.full.vcf.gz`: target samples, all SNPs.
- `ref.chr18.vcf.gz`: reference samples, all SNPs.
- `target.chr18.typedOnly.aim.vcf.gz`: target samples, selected markers only.
- `target.chr18.typedOnly.aim.masked.vcf.gz`: the previous file, unphased and masked.

# Returns
Whatever `mask_gt` returns.

# Throws
- `ArgumentError` if `p` is not in `1:total_snps` or `missingprop` is outside `[0, 1]`.
"""
function filter_and_mask(ref_idx, sample_idx, refID_to_population;
        p::Integer = 50000, missingprop::Real = 0.001)
    0 <= missingprop <= 1 ||
        throw(ArgumentError("missingprop must be in [0, 1], got $missingprop"))

    # filter chromosome data for unique snps
#     data = "../beagle_raw/chr18.1kg.phase3.v5a.vcf.gz"
#     full_record_index = .!find_duplicate_marker(data)
#     VCFTools.filter(data, full_record_index, 1:nsamples(data), 
#         des = "chr18.uniqueSNPs.vcf.gz")

    # summarize data; only the record count (first returned value) is needed
    total_snps = first(gtstats("chr18.uniqueSNPs.vcf.gz"))

    # fail fast: checking here avoids running the long filtering steps below
    # only to hit an out-of-bounds index when selecting the top markers
    1 <= p <= total_snps ||
        throw(ArgumentError("p must be in 1:$total_snps, got $p"))

    # generate target panel with all snps
    VCFTools.filter("chr18.uniqueSNPs.vcf.gz", 1:total_snps, 
        sample_idx, des = "target.chr18.full.vcf.gz", allow_multiallelic=false)

    # also generate reference panel
    VCFTools.filter("chr18.uniqueSNPs.vcf.gz", 1:total_snps, 
        ref_idx, des = "ref.chr18.vcf.gz", allow_multiallelic=false)

    # set the p best-ranked ancestry informative markers as typed SNPs
    # (markers are ranked by ascending value returned from aim_select)
    aim_pvals = VCFTools.aim_select("chr18.uniqueSNPs.vcf.gz", refID_to_population)
    aim_rank = sortperm(aim_pvals)
    record_idx = falses(total_snps)
    record_idx[view(aim_rank, 1:p)] .= true
    VCFTools.filter("chr18.uniqueSNPs.vcf.gz", record_idx, sample_idx, 
        des = "target.chr18.typedOnly.aim.vcf.gz", allow_multiallelic=false)

    # unphase and mask a `missingprop` fraction (0.1% by default) of the target entries
    # NOTE(review): the p-by-n mask assumes the typed file holds exactly p records,
    # i.e. no selected marker was dropped by allow_multiallelic=false — TODO confirm
    n = length(sample_idx)
    masks = falses(p, n)
    # one scalar rand() per entry, visited column by column, so a given seed
    # reproduces the same mask as before
    for j in 1:n, i in 1:p
        masks[i, j] = rand() < missingprop
    end
    return mask_gt("target.chr18.typedOnly.aim.vcf.gz", masks, 
        des="target.chr18.typedOnly.aim.masked.vcf.gz", unphase=true)
end
Random.seed!(2020)
@time filter_and_mask(ref_idx, sample_idx, refID_to_population)

[32mProgress: 100%|█████████████████████████████████████████| Time: 0:09:25[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:11:57[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:16:41[39m
[32mProgress: 100%|█████████████████████████████████████████| Time: 0:10:11[39m
[32mfiltering vcf file...100%|██████████████████████████████| Time: 0:09:28[39m
[32mmasking vcf file...100%|████████████████████████████████| Time: 0:00:09[39m


3538.497681 seconds (38.75 G allocations: 3.560 TiB, 12.05% gc time)


In [7]:
# Compress the reference panel; the compressed output is written to "ref.chr18.jlso"
# d is forwarded as the last argument of compress_haplotypes
# (presumably the max number of unique haplotypes kept per window — TODO confirm)
d = 1000
@time compress_haplotypes("ref.chr18.vcf.gz", "target.chr18.typedOnly.aim.masked.vcf.gz",
    "ref.chr18.jlso", d)

[32mimporting reference data...100%|████████████████████████| Time: 0:04:37[39m
[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:06[39m


470.035680 seconds (3.00 G allocations: 274.495 GiB, 25.55% gc time)
