# HRC chrom 20 result

In [1]:
using Revise
using VCFTools
using MendelImpute
using GeneticVariation
using Random
using StatsBase
using CodecZlib
using ProgressMeter
using JLD2, FileIO, JLSO
using BenchmarkTools
using GroupSlices
using TimerOutputs
using LinearAlgebra

BLAS.set_num_threads(1)

┌ Info: Precompiling MendelImpute [e47305d1-6a61-5370-bc5d-77554d143183]
└ @ Base loading.jl:1278


# MendelImpute error rate

In [2]:
Threads.nthreads()

8

In [3]:
# 8 threads
Random.seed!(2020)
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.maxd1000.overlap0.0.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
@time ph, haploscore = phase(tgtfile, reffile, outfile = outfile);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
n, p = size(X_mendel)
println("error_rate = ", sum(X_mendel .!= X_complete) / n / p)

Number of threads = 8
Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:27[39m


max_width = 698, max_d = 45349


LoadError: UndefVarError: fdsa not defined

In [3]:
# 8 threads
Random.seed!(2020)
d       = 1000
tgtfile = "target.chr20.typedOnly.maf0.01.masked.vcf.gz"
reffile = "ref.chr20.maxd$d.maf0.01.excludeTarget.jlso"
outfile = "mendel.chr20.imputed.target.vcf.gz"
@time ph, haploscore = phase(tgtfile, reffile, outfile = outfile);

# import imputed result and compare with true
X_mendel = convert_gt(Float32, outfile)
X_complete = convert_gt(Float32, "target.chr20.full.vcf.gz")
n, p = size(X_mendel)
println("error_rate = ", sum(X_mendel .!= X_complete) / n / p)

Importing reference haplotype data...


[32mImporting genotype file...100%|█████████████████████████| Time: 0:00:29[39m
[32mComputing optimal haplotypes...100%|████████████████████| Time: 0:01:02[39m
[32mPhasing...100%|█████████████████████████████████████████| Time: 0:00:40[39m
[32mWriting to file...100%|█████████████████████████████████| Time: 0:00:09[39m


Total windows = 3252, averaging ~ 510 unique haplotypes per window.

Timings: 
    Data import                     = 65.5912 seconds
        import target data             = 31.8892 seconds
        import compressed haplotypes   = 33.7021 seconds
    Computing haplotype pair        = 62.7794 seconds
        BLAS3 mul! to get M and N      = 0.955784 seconds per thread
        haplopair search               = 53.0187 seconds per thread
        initializing missing           = 1.28654 seconds per thread
        allocating and viewing         = 0.0907578 seconds per thread
        index conversion               = 0.0775373 seconds per thread
    Phasing by win-win intersection = 40.094 seconds
        Window-by-window intersection  = 34.3224 seconds per thread
        Breakpoint search              = 0.684406 seconds per thread
        Recording result               = 2.18599 seconds per thread
    Imputation                     = 12.6762 seconds
        Imputing missing               = 2.

# See if window edges have higher error rate

In [18]:
compressed_Hunique = JLSO.load(reffile)[:compressed_Hunique]
windows = nwindows(compressed_Hunique)
window_start = compressed_Hunique.Hstart
window_width = diff(window_start)
snps = length(compressed_Hunique.pos);

In [19]:
@show minimum(window_width)
@show maximum(window_width)
@show mean(window_width)
@show median(window_width);

minimum(window_width) = 17
maximum(window_width) = 1612
mean(window_width) = 271.42602276222703
median(window_width) = 219.0


In [46]:
# take 10 SNPs from each side of window edges
snp_near_edges = Vector{UnitRange}(undef, windows + 1)
snp_near_edges[1] = 1:10
snp_near_edges[end] = (snps - 9):snps
for i in 2:windows
    snp_near_edges[i] = (window_start[i] - 9):(window_start[i] + 10)
end

# compute average error rate of SNPs near edges
error_near_edges = 0
for r in snp_near_edges
    error_near_edges += sum(view(X_complete, :, r) .!= view(X_mendel, :, r))
end

# total error rate 
total_error = sum(X_mendel .!= X_complete);

In [47]:
# around 7% of snps are near window edges
sum(length.(snp_near_edges)) / snps

0.07367951224706652

In [48]:
# error near edges is 8%, not significantly high than snps at the middle of windows
error_near_edges / total_error

0.08640454711643429

Conclusion: no, snps near window edges do not have much higher error rate

# See if smaller windows have higher error rate

First plot window width distribution

In [53]:
using UnicodePlots
X_winrange = compressed_Hunique.X_window_range
X_winwidth = length.(X_winrange)
histogram(X_winwidth, bins=20)

[90m                  ┌                                        ┐[39m 
   [0m[90m[[0m  0.0[90m, [0m 20.0[90m)[0m[90m ┤[39m[32m▇▇[39m[0m 74                                   [90m [39m 
   [0m[90m[[0m 20.0[90m, [0m 40.0[90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 755                  [90m [39m 
   [0m[90m[[0m 40.0[90m, [0m 60.0[90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 1412 [90m [39m 
   [0m[90m[[0m 60.0[90m, [0m 80.0[90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m 80.0[90m, [0m100.0[90m)[0m[90m ┤[39m[32m▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇[39m[0m 880               [90m [39m 
   [0m[90m[[0m100.0[90m, [0m120.0[90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m120.0[90m, [0m140.0[90m)[0m[90m ┤[39m[0m 0                                      [90m [39m 
   [0m[90m[[0m140.0[90m, [0m160.0[90m)[0m[90m ┤[39m[0m 0   

Seems like most windows have less than 100 typed SNPs. 

In [66]:
# compute error rate for windows with various widths
error_rate_20, error_count_20 = 0, 0
error_rate_50, error_count_50 = 0, 0
error_rate_100, error_count_100 = 0, 0
error_rate_200, error_count_200 = 0, 0
for (i, Xrange) in enumerate(X_winwidth)
    Hrange = window_start[i]:(i == windows ? snps : window_start[i + 1])
    err = sum(view(X_complete, :, Hrange) .!= view(X_mendel, :, Hrange))
    if Xrange ≤ 20
        error_rate_20 += err
        error_count_20 += length(Hrange)
    elseif Xrange ≤ 50
        error_rate_50 += err
        error_count_50 += length(Hrange)
    elseif Xrange ≤ 100
        error_rate_100 += err
        error_count_100 += length(Hrange)
    elseif Xrange ≤ 200
        error_rate_200 += err
        error_count_200 += length(Hrange)
    end
end

In [68]:
@show error_rate_20 / error_count_20 / n
@show error_rate_50 / error_count_50 / n
@show error_rate_100 / error_count_100 / n
@show error_rate_200 / error_count_200 / n;

(error_rate_20 / error_count_20) / n = 0.00237559088955737
(error_rate_50 / error_count_50) / n = 0.0015021973412171273
(error_rate_100 / error_count_100) / n = 0.0011659154187974184
(error_rate_200 / error_count_200) / n = 0.0010674090026751137


Conclusion: Windows with 20 or less SNPs have >2x error rate than windows with 100 SNPs 