# Post imputation quality control

Here we use chromosome 18 data from the 1000 Genomes Project.

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using UnicodePlots
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably this avoids oversubscribing cores, since the phasing
# run below reports "Number of threads = 8" on the Julia side — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


In [2]:
# Switch to the directory holding the filtered 1000 Genomes files; every file
# name used in the cells below is relative to this directory.
# NOTE: this is a machine-specific absolute path — adjust it before running elsewhere.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

In [12]:
# Phase/impute the masked target panel and measure the overall genotype error.
# The output recorded below was produced with 8 threads and d = 1000.
Random.seed!(2020)
chr = 18
maf = 0.1
d = 1000
overlap = 0.0

# Every file name is derived from `chr`, so changing the chromosome cannot
# silently pair a target from one chromosome with a reference panel or truth
# set from another (the reference and truth names used to hard-code "chr18").
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.maxd$d.overlap$overlap.maf$maf.excludeTarget.jlso"
truthfile = "target.chr$chr.full.vcf.gz"
outfile = "mendel.imputed.vcf.gz"

# `phase` writes the imputed genotypes to `outfile`; `haploscore` is reused by
# the quality-control cells further down.
@time ph, haploscore = phase(tgtfile, reffile, outfile=outfile);

# Load the unmasked truth and the imputed result in the same orientation.
# Judging by the names, rows are the p SNPs and columns are the n samples
# when `trans=true` — TODO confirm against the VCFTools documentation.
X_complete = convert_gt(Float64, truthfile, trans=true)
p, n = size(X_complete)
X_mendel = convert_gt(Float64, outfile, trans=true)

# Fraction of genotype entries that differ from the truth.
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

Number of threads = 8
Importing reference haplotype data...


Importing genotype file...100%|█████████████████████████| Time: 0:00:05


Total windows = 2367, averaging ~ 527 unique haplotypes per window.

Timings: 
    Data import                     = 23.7672 seconds
        import target data             = 5.33822 seconds
        import compressed haplotypes   = 18.4289 seconds
    Computing haplotype pair        = 4.87321 seconds
        BLAS3 mul! to get M and N      = 0.363539 seconds per thread
        haplopair search               = 3.99455 seconds per thread
        initializing missing           = 0.0257177 seconds per thread
        allocating and viewing         = 0.0701098 seconds per thread
        index conversion               = 0.00365273 seconds per thread
    Phasing by win-win intersection = 1.64581 seconds
        Window-by-window intersection  = 0.10344 seconds per thread
        Breakpoint search              = 1.24049 seconds per thread
        Recording result               = 0.0483477 seconds per thread
    Imputation                     = 1.96674 seconds
        Imputing missing              

## Plot least-squares error for each sample

In [13]:
# Per-sample score: add up each sample's haplotype scores and divide by the
# number of SNPs p.
tot_hapscore = map(sum, haploscore) ./ p
@show histogram(tot_hapscore)

# Keep the samples whose normalized score is at most 0.001, then recompute the
# genotype error rate on that subset only.
keep_idx = findall(tot_hapscore .≤ 0.001)
npruned = length(keep_idx)
# Views avoid copying the selected columns; the comparison itself is unchanged.
mismatches = @views sum(X_mendel[:, keep_idx] .!= X_complete[:, keep_idx])
println("error pruned = $(mismatches / npruned / p) \n")

histogram(tot_hapscore) =                       ┌                                        ┐ 
   [0.0002, 0.0004  ) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 48   
   [0.0004, 0.0006  ) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 29                 
   [0.0006, 0.0008  ) ┤▇▇▇▇▇ 6                                   
   [0.0008, 0.001   ) ┤▇▇▇▇▇▇ 8                                  
   [0.001 , 0.0012  ) ┤▇▇▇▇ 5                                    
   [0.0012, 0.0014  ) ┤▇▇ 3                                      
   [0.0014, 0.0016  ) ┤▇ 1                                       
                      └                                        ┘ 
                                      Frequency
error pruned = 0.0049533566168525976 



In [8]:
# Distribution of the individual haplotype scores of one sample (index 20).
# NOTE(review): presumably there is one score per window — confirm against `phase`.
histogram(haploscore[20])

[90m                  ┌                                        ┐[39m 
   [0m[90m[[0m 0.0[90m, [0m 2.0  [90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 2309 [90m [39m 
   [0m[90m[[0m 2.0[90m, [0m 4.0  [90m)[0m[90m ┤[39m[0m 33                                     [90m [39m 
   [0m[90m[[0m 4.0[90m, [0m 6.0  [90m)[0m[90m ┤[39m[0m 17                                     [90m [39m 
   [0m[90m[[0m 6.0[90m, [0m 8.0  [90m)[0m[90m ┤[39m[0m 5                                      [90m [39m 
   [0m[90m[[0m 8.0[90m, [0m10.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m10.0[90m, [0m12.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m12.0[90m, [0m14.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m14.0[90m, [0m16.0  [90m)[0m[90m ┤[39m[0m 0                                 

**Conclusion:** For each sample, most windows are imputed well, but a few are not (e.g. hapscore $> 20$). 

## Try calculating $r^2$

According to [minimac 3 documentation](https://genome.sph.umich.edu/wiki/Minimac3_Info_File#Rsq), for each SNP we can calculate 

$$r^2 = \frac{\frac{1}{2n} \sum_{i=1}^{2n}(D_i - \hat{p})^2}{\hat{p}(1 - \hat{p})}$$

where $\hat{p}$ is the alternate allele frequency of the SNP (in your GWAS data), $D_i$ is the imputed alternate allele probability at the $i$th haplotype, and $n$ is the number of GWAS samples, so the sum runs over all $2n$ haplotypes.

In [28]:
# Import the imputed haplotypes from the VCF written by the phasing step.
# NOTE(review): the r² loop below indexes H[j, i] with j in 1:p and i in 1:2n,
# so with `trans=true` H is presumably SNPs × haplotypes — confirm.
H = convert_ht(Float64, outfile, trans=true);

In [41]:
"""
    minimac_rsq(H::AbstractMatrix)

Compute the Minimac3-style ``r^2`` of every SNP from the haplotype matrix `H`,
which must have one row per SNP and one column per haplotype.

Returns the tuple `(p̂, r²)`:
- `p̂[j]` is the mean of row `j`, i.e. the alternate allele frequency of SNP `j`;
- `r²[j]` is the mean squared deviation of row `j` from `p̂[j]`, divided by
  `p̂[j] * (1 - p̂[j])`. SNPs with `p̂[j]` equal to 0 or 1 are assigned `r² = 1`
  so that the ratio is never `0 / 0`.

Note: when every entry of `H` is exactly 0 or 1 (hard-called haplotypes), the
numerator equals `p̂ * (1 - p̂)` algebraically, so `r²` is 1 up to rounding for
every SNP. The statistic is only informative when `H` holds allele
probabilities (dosages).
"""
function minimac_rsq(H::AbstractMatrix)
    nhaps = size(H, 2)
    p̂ = vec(sum(H, dims=2)) ./ nhaps
    r² = zeros(size(H, 1))

    # Julia arrays are column-major, so walk haplotypes (columns) in the outer
    # loop and SNPs (rows) in the inner one. For each SNP the squared
    # deviations are still accumulated in haplotype order.
    for i in axes(H, 2), j in axes(H, 1)
        r²[j] += abs2(H[j, i] - p̂[j])
    end

    for j in eachindex(r², p̂)
        pⱼ = p̂[j]
        if iszero(pⱼ) || isone(pⱼ)
            # Monomorphic SNP: the denominator is zero, define r² as 1.
            r²[j] = 1
        else
            r²[j] = r²[j] / nhaps / (pⱼ * (1 - pⱼ))
        end
    end
    return p̂, r²
end

# Run inside a function rather than at global scope so the loop over all
# SNP × haplotype entries is compiled for the concrete type of H.
p̂, r² = minimac_rsq(H)

In [42]:
histogram(p̂)

[90m                ┌                                        ┐[39m 
   [0m[90m[[0m0.0 [90m, [0m0.05[90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 620983 [90m [39m 
   [0m[90m[[0m0.05[90m, [0m0.1 [90m)[0m[90m ┤[39m[32m▇▇[39m[0m 48182                                [90m [39m 
   [0m[90m[[0m0.1 [90m, [0m0.15[90m)[0m[90m ┤[39m[32m▇[39m[0m 28152                                 [90m [39m 
   [0m[90m[[0m0.15[90m, [0m0.2 [90m)[0m[90m ┤[39m[32m▇[39m[0m 20780                                 [90m [39m 
   [0m[90m[[0m0.2 [90m, [0m0.25[90m)[0m[90m ┤[39m[32m▇[39m[0m 17265                                 [90m [39m 
   [0m[90m[[0m0.25[90m, [0m0.3 [90m)[0m[90m ┤[39m[32m▇[39m[0m 14750                                 [90m [39m 
   [0m[90m[[0m0.3 [90m, [0m0.35[90m)[0m[90m ┤[39m[32m▇[39m[0m 12579                                 [90m [39m 
   [0m[90m[[0m0.35[90m, [0m0.4 [90m)[0m[90m ┤[

In [44]:
histogram(r²)

[90m                                            ┌                                        ┐[39m 
   [0m[90m[[0m0.999999999999988 [90m, [0m0.999999999999989 [90m)[0m[90m ┤[39m[0m 4                                      [90m [39m 
   [0m[90m[[0m0.999999999999989 [90m, [0m0.99999999999999  [90m)[0m[90m ┤[39m[0m 18                                     [90m [39m 
   [0m[90m[[0m0.99999999999999  [90m, [0m0.999999999999991 [90m)[0m[90m ┤[39m[0m 29                                     [90m [39m 
   [0m[90m[[0m0.999999999999991 [90m, [0m0.999999999999992 [90m)[0m[90m ┤[39m[0m 101                                    [90m [39m 
   [0m[90m[[0m0.999999999999992 [90m, [0m0.999999999999993 [90m)[0m[90m ┤[39m[0m 670                                    [90m [39m 
   [0m[90m[[0m0.999999999999993 [90m, [0m0.999999999999994 [90m)[0m[90m ┤[39m[0m 1035                                   [90m [39m 
   [0m[90m[[0m0.999999999999994 [90m, [