# Chromosome Painting

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using DataFrames
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably to avoid oversubscription with Julia-level threading inside
# MendelImpute (its timing report is given "per thread") — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# Download population data

Download the [population code](ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/) for each 1000 Genomes sample. The population codes are explained [here](https://www.internationalgenome.org/category/population/).

In [2]:
# Run this command in a terminal (it is a shell command, not Julia code):
# wget -r -l3 -N --no-parent ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/

# Get each sample's population origin

In [3]:
# Build a sample => population lookup from the downloaded directory layout
# `data/<population>/<sample>`. Every (sample, population) pair is also appended
# as a row of `df`.
df = DataFrame(sample = String[], population = String[])
sample_to_population = Dict{Symbol, Symbol}()
for population in readdir("data/")
    population_dir = joinpath("data", population)
    # Skip stray regular files directly under `data/` (e.g. a macOS `.DS_Store`):
    # calling `readdir` on a non-directory would throw.
    isdir(population_dir) || continue
    for sample in readdir(population_dir)
        # Finder metadata file, not a sample.
        sample == ".DS_Store" && continue
        push!(df, (sample, population))
        sample_to_population[Symbol(sample)] = Symbol(population)
    end
end
# Last expression of the cell: displays the resulting dictionary.
sample_to_population
    

Dict{Symbol,Symbol} with 2709 entries:
  :HG02728 => :PJL
  :HG00731 => :PUR
  :NA20902 => :GIH
  :HG02282 => :ACB
  :HG02156 => :CDX
  :HG00684 => :CHS
  :HG01709 => :IBS
  :HG01785 => :IBS
  :HG03055 => :MSL
  :HG00406 => :CHS
  :HG00654 => :CHS
  :HG03642 => :STU
  :HG03886 => :STU
  :HG02025 => :KHV
  :NA19909 => :ASW
  :NA06984 => :CEU
  :NA19431 => :LWK
  :HG00187 => :FIN
  :HG03085 => :MSL
  :NA19773 => :MXL
  :HG03636 => :PJL
  :HG01790 => :GBR
  :HG00478 => :CHS
  :HG03782 => :ITU
  :HG00607 => :CHS
  ⋮        => ⋮

# Compute sample composition in 1000 genomes

In [5]:
# Compute each person's phase information: run `phase` on the masked target file
# against the compressed reference panel, with imputation enabled, and time the call.
# NOTE(review): the working directory below is a machine-specific absolute path;
# point it at the folder holding the filtered 1000 Genomes files before running.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")
Random.seed!(2020)  # seed the global RNG
chr = 18    # chromosome number, interpolated into both input file names
maf = 0.1   # interpolated into both input file names
d = 1000    # interpolated into the reference file name and passed to `phase` as `max_d`
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
# Interpolate `chr` here as well (it was hard-coded as 18), so that changing `chr`
# cannot silently pair a target file with a reference panel from another chromosome.
reffile = "ref.chr$chr.maxd$d.maf$maf.excludeTarget.jlso"
outfile = "mendel.imputed.vcf.gz"
@time ph = phase(tgtfile, reffile, outfile=outfile, impute=true, max_d = d,
    phase = true);

Importing reference haplotype data...


Computing optimal haplotypes...100%|████████████████████| Time: 0:00:07


Total windows = 2367, averaging ~ 527 unique haplotypes per window.

Timings: 
    Data import                     = 17.9653 seconds
        import target data             = 4.6349 seconds
        import compressed haplotypes   = 13.3304 seconds
    Computing haplotype pair        = 7.79536 seconds
        BLAS3 mul! to get M and N      = 0.30176 seconds per thread
        haplopair search               = 3.5918 seconds per thread
        initializing missing           = 0.0245673 seconds per thread
        allocating and viewing         = 0.0672008 seconds per thread
        index conversion               = 0.00140828 seconds per thread
    Phasing by win-win intersection = 0.749241 seconds
        Window-by-window intersection  = 0.0983918 seconds per thread
        Breakpoint search              = 0.14624 seconds per thread
        Recording result               = 0.0127038 seconds per thread
    Imputation                     = 2.24531 seconds
        Imputing missing              

In [7]:
# Read the JLSO reference file and keep the object stored under the
# `:compressed_Hunique` key, i.e. the compressed reference panel.
compressed_Hunique = getindex(JLSO.load(reffile), :compressed_Hunique);

In [12]:
# Time `paint` on the first element of `ph`, passing the sample => population lookup
# and the compressed reference panel; the result is stored in `comp`.
# NOTE(review): `comp` presumably holds one entry per population — confirm the meaning
# and ordering of the entries against `paint` in MendelImpute.
@time comp = paint(ph[1], sample_to_population, compressed_Hunique)

1.705204e6
852602
  0.020345 seconds (15.39 k allocations: 775.396 KiB)


26-element Array{Float64,1}:
  23816.0
  64497.0
   2414.0
   5607.0
   5428.0
  57546.0
  42934.0
    272.0
  11913.0
   7860.0
   1142.0
  20545.0
    234.0
 178658.0
   2250.0
      1.199728e6
   4262.0
   3991.0
      0.0
  11341.0
   5647.0
   6314.0
   1024.0
  35201.0
   2954.0
   9626.0