# Chromosome Painting

In [None]:
using GeneticVariation
using VCFTools
using DataFrames
using CSV
using MendelImpute

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# Download population data

Download [population code](ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/) for each 1000 genomes sample. Different population code is explained [here](https://www.internationalgenome.org/category/population/). 

In [None]:
;wget -r -l3 -N --no-parent ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/data/

# Compile each sample's population origin

In [26]:
df = DataFrame(sample = String[], population = String[])
sample_to_population = Dict{Symbol, Symbol}()
for population in readdir("data/")
    for sample in readdir("data/" * population)
        sample == ".DS_Store" && continue
        push!(df, (sample, population))
        sample_to_population[Symbol(sample)] = Symbol(population)
    end
end
sample_to_population
    

Dict{Symbol,Symbol} with 2709 entries:
  :HG02728 => :PJL
  :HG00731 => :PUR
  :NA20902 => :GIH
  :HG02282 => :ACB
  :HG02156 => :CDX
  :HG00684 => :CHS
  :HG01709 => :IBS
  :HG01785 => :IBS
  :HG03055 => :MSL
  :HG00406 => :CHS
  :HG00654 => :CHS
  :HG03642 => :STU
  :HG03886 => :STU
  :HG02025 => :KHV
  :NA19909 => :ASW
  :NA06984 => :CEU
  :NA19431 => :LWK
  :HG00187 => :FIN
  :HG03085 => :MSL
  :NA19773 => :MXL
  :HG03636 => :PJL
  :HG01790 => :GBR
  :HG00478 => :CHS
  :HG03782 => :ITU
  :HG00607 => :CHS
  ⋮        => ⋮

In [6]:
# get sampleID listed in VCF file
vcffile = "../beagle_raw/chr22.1kg.phase3.v5a.vcf.gz"
reader = VCF.Reader(openvcf(vcffile, "r"))
sampleID = VCF.header(reader).sampleID
close(reader)

# Compute sample composition

In [29]:
function paint(
    sample_phase::HaplotypeMosaicPair,
    sample_to_population::Dict{Symbol, Symbol},
    compressed_Hunique::CompressedHaplotypes;
    populations::Vector{Symbol} = unique_populations(sample_to_population),
    composition::AbstractVector = zeros(length(populations))
    )
    snps = sample_phase.strand1.length
    l = length(sample_phase.strand1.haplotypelabel)
    ref_sampleIDs = compressed_Hunique.sampleID

    # strand1 
    for i in 1:l
        h1 = sample_phase.strand1.haplotypelabel[i] # unique haplotype index
        w1 = sample_phase.strand1.window[i]
        snps = (i == l ? 
            (sample_phase.strand1.start[i]:sample_phase.strand1.length) : 
            (sample_phase.strand1.start[i]:(sample_phase.strand1.start[i+1] 
            - 1)))

        # convert unique haplotype idx to sampleID
        H1 = unique_all_idx_to_complete_idx(h1, w1, 
            compressed_Hunique) # complete haplotype idx
        sample_idx = div(H1, 2, RoundUp)
        id = ref_sampleIDs[sample_idx]
        population = Symbol(sample_to_population[id])

        # update composition
        idx = findfirst(x -> x == population, populations)
        composition[idx] += length(snps)
    end

    # strand2
    for i in 1:l
        h2 = sample_phase.strand2.haplotypelabel[i] # unique haplotype index
        w2 = sample_phase.strand2.window[i]
        snps = (i == l ? 
            (sample_phase.strand2.start[i]:sample_phase.strand2.length) : 
            (sample_phase.strand2.start[i]:(sample_phase.strand2.start[i+1] 
            - 1)))

        # convert unique haplotype idx to sampleID
        H1 = unique_all_idx_to_complete_idx(h2, w2, 
            compressed_Hunique) # complete haplotype idx
        sample_idx = div(H1, 2, RoundUp)
        id = ref_sampleIDs[sample_idx]
        population = Symbol(sample_to_population[id])

        # update composition
        idx = findfirst(x -> x == population, populations)
        composition[idx] += length(snps)
    end
    
    return composition
end

"""
unique_populations(x::Dict{Symbol, Symbol})

Computes the unique list of populations. `x` is a `Dict`
where each sample is a key and populations are values. 
"""
function unique_populations(x::Dict{Symbol, Symbol})
    populations = Symbol[]
    for (key, val) in x
        if val ∉ populations 
            num_unique += 1
            push!(populations, val)
        end
    end
    return populations
end

LoadError: UndefVarError: HaplotypeMosaicPair not defined