# Post imputation quality control

Here we use chromosome 18 data from the 1000 Genomes Project.

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLSO
using BenchmarkTools
using GroupSlices
using LinearAlgebra
using UnicodePlots
# using ProfileView

# Restrict BLAS to a single thread.
# NOTE(review): presumably so BLAS does not oversubscribe cores while `phase` runs on
# several Julia threads (the run logs below report "Number of threads = 8") — confirm.
BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


In [2]:
# Work inside the directory holding the filtered 1000 Genomes phase 3 (v5) files; every
# file name used in the cells below is relative to it. The path is machine-specific.
cd("/Users/biona001/.julia/dev/MendelImpute/data/1000_genome_phase3_v5/filtered")

In [12]:
# Phase/impute the masked target against the compressed reference haplotypes
# (8 threads, max d = 1000), then report the overall genotype error rate.
Random.seed!(2020)
chr = 18
maf = 0.1
d = 1000
overlap = 0.0
# Every file name is derived from `chr`, so changing it above switches chromosomes
# consistently (previously the reference and truth files hard-coded "chr18").
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.maxd$d.overlap$overlap.maf$maf.excludeTarget.jlso"
outfile = "mendel.imputed.vcf.gz"
@time ph, haploscore = phase(tgtfile, reffile, outfile=outfile);

# Compare the imputed genotypes with the complete genotype file. With `trans=true`
# the matrices are SNP × sample, which is what the `p, n` destructuring relies on.
X_complete = convert_gt(Float64, "target.chr$chr.full.vcf.gz", trans=true)
p, n = size(X_complete)
X_mendel = convert_gt(Float64, outfile, trans=true)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

Number of threads = 8
Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:05[39m


Total windows = 2367, averaging ~ 527 unique haplotypes per window.

Timings: 
    Data import                     = 23.7672 seconds
        import target data             = 5.33822 seconds
        import compressed haplotypes   = 18.4289 seconds
    Computing haplotype pair        = 4.87321 seconds
        BLAS3 mul! to get M and N      = 0.363539 seconds per thread
        haplopair search               = 3.99455 seconds per thread
        initializing missing           = 0.0257177 seconds per thread
        allocating and viewing         = 0.0701098 seconds per thread
        index conversion               = 0.00365273 seconds per thread
    Phasing by win-win intersection = 1.64581 seconds
        Window-by-window intersection  = 0.10344 seconds per thread
        Breakpoint search              = 1.24049 seconds per thread
        Recording result               = 0.0483477 seconds per thread
    Imputation                     = 1.96674 seconds
        Imputing missing              

In [74]:
# Same run as before, but with the variant that considers all haplotypes whenever a
# window's least squares error is too high.
Random.seed!(2020)
chr = 18
maf = 0.1
d = 1000
overlap = 0.0
# Every file name is derived from `chr`, so changing it above switches chromosomes
# consistently (previously the reference file hard-coded "chr18").
tgtfile = "target.chr$chr.typedOnly.maf$maf.masked.vcf.gz"
reffile = "ref.chr$chr.maxd$d.overlap$overlap.maf$maf.excludeTarget.jlso"
outfile = "mendel.imputed.vcf.gz"
@time ph, haploscore = phase(tgtfile, reffile, outfile=outfile);

# `X_complete` (the complete genotypes read from "target.chr$chr.full.vcf.gz") is reused
# from an earlier cell rather than re-imported; it must already be defined.
p, n = size(X_complete)
X_mendel = convert_gt(Float64, outfile, trans=true)
println("error overall = $(sum(X_mendel .!= X_complete) / n / p) \n")

Number of threads = 8
Importing reference haplotype data...
reached here
Total windows = 2367, averaging ~ 527 unique haplotypes per window.

Timings: 
    Data import                     = 22.3206 seconds
        import target data             = 4.36715 seconds
        import compressed haplotypes   = 17.9534 seconds
    Computing haplotype pair        = 4.29841 seconds
        BLAS3 mul! to get M and N      = 0.311395 seconds per thread
        haplopair search               = 3.57343 seconds per thread
        initializing missing           = 0.022812 seconds per thread
        allocating and viewing         = 0.0631998 seconds per thread
        index conversion               = 0.00193417 seconds per thread
    Phasing by win-win intersection = 1.79499 seconds
        Window-by-window intersection  = 0.0982888 seconds per thread
        Breakpoint search              = 1.14894 seconds per thread
        Recording result               = 0.0321271 seconds per thread
    Imputation   

## Check least square error for each sample

In [61]:
# total least squares error of each of the 100 samples (summed over its windows)
map(sum, haploscore)

100-element Array{Float32,1}:
 223.17535
 206.97342
 179.98242
 338.5188
 309.95145
 288.35715
 319.97934
 352.46643
 296.29828
 399.41782
 330.4275
 256.24335
 302.00192
   ⋮
 366.39935
 920.719
 774.6061
 345.9853
 296.6286
 252.53487
 301.99554
 346.68875
 330.2639
 191.65004
 313.05994
 249.30289

Let's normalize the least squares error by the number of typed SNPs and plot a histogram.

In [66]:
# Average least squares error per typed SNP for every sample.
typed_snps = nrecords(tgtfile)
tot_hapscore = sum.(haploscore) ./ typed_snps
@show histogram(tot_hapscore)

# Keep only the samples whose average error is ≤ 0.005 and recompute the genotype
# error rate on that subset.
keep_idx = findall(x -> x ≤ 0.005, tot_hapscore)
npruned = length(keep_idx)
# `@views` avoids copying the selected columns of both SNP × sample matrices just to
# count mismatches; the count itself is unchanged.
mismatches = @views sum(X_mendel[:, keep_idx] .!= X_complete[:, keep_idx])
println("error pruned = $(mismatches / npruned / p) \n")

histogram(tot_hapscore) =                     ┌                                        ┐ 
   [0.001, 0.002  ) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 47   
   [0.002, 0.003  ) ┤▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ 30                
   [0.003, 0.004  ) ┤▇▇▇▇▇ 6                                   
   [0.004, 0.005  ) ┤▇▇▇▇▇▇ 8                                  
   [0.005, 0.006  ) ┤▇▇▇▇ 5                                    
   [0.006, 0.007  ) ┤▇▇ 3                                      
   [0.007, 0.008  ) ┤▇ 1                                       
                    └                                        ┘ 
                                    Frequency
error pruned = 0.0049533566168525976 



For each sample, we can also check the imputation quality of each window.

In [65]:
histogram(haploscore[20]) # distribution of the per-window least squares errors of sample 20

[90m                  ┌                                        ┐[39m 
   [0m[90m[[0m 0.0[90m, [0m 2.0  [90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 2309 [90m [39m 
   [0m[90m[[0m 2.0[90m, [0m 4.0  [90m)[0m[90m ┤[39m[0m 33                                     [90m [39m 
   [0m[90m[[0m 4.0[90m, [0m 6.0  [90m)[0m[90m ┤[39m[0m 17                                     [90m [39m 
   [0m[90m[[0m 6.0[90m, [0m 8.0  [90m)[0m[90m ┤[39m[0m 5                                      [90m [39m 
   [0m[90m[[0m 8.0[90m, [0m10.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m10.0[90m, [0m12.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m12.0[90m, [0m14.0  [90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m14.0[90m, [0m16.0  [90m)[0m[90m ┤[39m[0m 0                                 

**Conclusion:** For each sample, most windows are imputed well ($0 \le \text{error} \le 2$), but a few are not (e.g. $\text{hapscore} > 20$). 

## Try calculating $r^2$

According to [minimac 3 documentation](https://genome.sph.umich.edu/wiki/Minimac3_Info_File#Rsq), for each SNP we can calculate 

$$r^2 = \frac{\frac{1}{2n} \sum_{i=1}^{2n}(D_i - \hat{p})^2}{\hat{p}(1 - \hat{p})}$$

where $\hat{p}$ is the alternate allele frequency (of the imputed data), $D_i$ is the imputed alternate allele probability at the $i$th haplotype, and $n$ is the number of GWAS samples. 

In MendelImpute.jl, we do not output a posterior probability; for each locus, each haplotype is imputed with 0 or 1. That is, $D_i \in \{0, 1\}$. Thus, using the following relation:

$$\hat{p} = \frac{1}{2n}\sum_{i = 1}^{2n}D_i$$

we have

\begin{align*}
    r^2 
    &= \frac{\frac{1}{2n} \sum_{i=1}^{2n}(D_i^2 - 2D_i\hat{p} + \hat{p}^2)}{\hat{p}(1 - \hat{p})}\\
    &= \frac{\frac{1}{2n} \left[\sum_{i=1}^{2n}D_i^2 - 2\hat{p}\sum_{i=1}^{2n}D_i + 2n\hat{p}^2\right]}{\hat{p}(1 - \hat{p})}\\
    &= \frac{\frac{1}{2n} \left[\sum_{i=1}^{2n}D_i - 4n\hat{p}^2 + 2n\hat{p}^2\right]}{\hat{p}(1 - \hat{p})}\\
    &= \frac{\frac{1}{2n} \left[2n\hat{p} - 2n\hat{p}^2\right]}{\hat{p}(1 - \hat{p})}\\
    &= 1
\end{align*}

We can confirm this behavior using the code below:

In [28]:
# Import the imputed haplotypes from the output VCF. With `trans=true`, H is indexed as
# H[j, i] = allele of SNP j on haplotype i (this is how the r² loop below reads it).
H = convert_ht(Float64, outfile, trans=true);

In [46]:
"""
    haplotype_rsq(H, n, p)

Compute the Minimac3-style r² of each of the first `p` SNPs (rows of `H`) from the
first `2n` haplotypes (columns of `H`).

Returns `(p̂, r²)`:
- `p̂[j]` is the sum of row `j` of `H` divided by `2n`, i.e. the alternate allele
  frequency of SNP `j` when `H` has exactly `2n` columns of 0/1 alleles.
- `r²[j]` is `(1/2n) * Σᵢ (H[j, i] - p̂[j])^2 / (p̂[j] * (1 - p̂[j]))`. For monomorphic
  SNPs (`p̂[j]` equal to 0 or 1) the ratio is 0/0, so `r²[j]` is set to 1 by convention.
"""
function haplotype_rsq(H::AbstractMatrix, n::Integer, p::Integer)
    p̂ = vec(sum(H, dims=2)) ./ n ./ 2
    r² = zeros(p)
    # Accumulate the squared deviations with the haplotype (column) index outermost so
    # the column-major matrix is read contiguously. Each r²[j] still receives its terms
    # in the order i = 1, 2, …, 2n, so the floating-point result matches a per-SNP loop.
    for i in 1:2n, j in 1:p
        r²[j] += (H[j, i] - p̂[j])^2
    end
    for j in 1:p
        pⱼ = p̂[j]
        if pⱼ == 0.0 || pⱼ == 1.0
            # monomorphic SNP: no variation to explain
            r²[j] = 1.0
        else
            r²[j] = r²[j] / 2n / (pⱼ * (1 - pⱼ))
        end
    end
    return p̂, r²
end

# Wrapping the loops in a function lets Julia specialize them on the argument types
# instead of operating on untyped globals.
p̂, r² = haplotype_rsq(H, n, p)

In [62]:
histogram(r²) # distribution of the per-SNP r² values

[90m                                            ┌                                        ┐[39m 
   [0m[90m[[0m0.999999999999988 [90m, [0m0.999999999999989 [90m)[0m[90m ┤[39m[0m 4                                      [90m [39m 
   [0m[90m[[0m0.999999999999989 [90m, [0m0.99999999999999  [90m)[0m[90m ┤[39m[0m 18                                     [90m [39m 
   [0m[90m[[0m0.99999999999999  [90m, [0m0.999999999999991 [90m)[0m[90m ┤[39m[0m 29                                     [90m [39m 
   [0m[90m[[0m0.999999999999991 [90m, [0m0.999999999999992 [90m)[0m[90m ┤[39m[0m 101                                    [90m [39m 
   [0m[90m[[0m0.999999999999992 [90m, [0m0.999999999999993 [90m)[0m[90m ┤[39m[0m 670                                    [90m [39m 
   [0m[90m[[0m0.999999999999993 [90m, [0m0.999999999999994 [90m)[0m[90m ┤[39m[0m 1035                                   [90m [39m 
   [0m[90m[[0m0.999999999999994 [90m, [

In [67]:
histogram(p̂) # distribution of the per-SNP alternate allele frequency p̂

[90m                ┌                                        ┐[39m 
   [0m[90m[[0m0.0 [90m, [0m0.05[90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 620983 [90m [39m 
   [0m[90m[[0m0.05[90m, [0m0.1 [90m)[0m[90m ┤[39m[32m▇▇[39m[0m 48182                                [90m [39m 
   [0m[90m[[0m0.1 [90m, [0m0.15[90m)[0m[90m ┤[39m[32m▇[39m[0m 28152                                 [90m [39m 
   [0m[90m[[0m0.15[90m, [0m0.2 [90m)[0m[90m ┤[39m[32m▇[39m[0m 20780                                 [90m [39m 
   [0m[90m[[0m0.2 [90m, [0m0.25[90m)[0m[90m ┤[39m[32m▇[39m[0m 17265                                 [90m [39m 
   [0m[90m[[0m0.25[90m, [0m0.3 [90m)[0m[90m ┤[39m[32m▇[39m[0m 14750                                 [90m [39m 
   [0m[90m[[0m0.3 [90m, [0m0.35[90m)[0m[90m ┤[39m[32m▇[39m[0m 12579                                 [90m [39m 
   [0m[90m[[0m0.35[90m, [0m0.4 [90m)[0m[90m ┤[

**Conclusion:** As predicted, all $r^2$ values are 1 (up to floating-point rounding). Also, most SNPs have an alternate allele frequency below $0.05$. 