In [None]:
##luigi-vars
# NOTE(review): the '##luigi-vars' tag above presumably marks this cell for
# parameter injection by the surrounding pipeline — keep it as the first line.
# Worker count handed to every dask `.compute(num_workers=NCPU)` call below.
NCPU = 8
# HDF5 callset that is opened read-only with h5py in the loading cell.
RAW_HD5 = '/nbi/group-data/ifs/JIC/Research-Groups/Diane-Saunders/FP_pipeline/data/0.1/Callset/callsets/2014/2014_raw.hd5'

In [None]:
import vcfnp
import numpy as np
import h5py
import matplotlib.pyplot as plt
import matplotlib as mpl
import allel
import seaborn as sns
import pandas as pd
import dask.array as da
from dask.diagnostics import ProgressBar,  Profiler, ResourceProfiler, CacheProfiler, visualize
from bokeh.io import output_notebook

import qgrid
qgrid.nbinstall(overwrite=True)  # copies javascript dependencies to your /nbextensions folder

# Send Bokeh output to the notebook.
output_notebook()
# Draw matplotlib figures inline in the notebook.
%matplotlib inline
# Default figure size and seaborn style shared by every plot below.
plt.rcParams['figure.figsize'] = (12,8)
sns.set_style('white')

In [None]:
# Open the raw callset read-only. The handle is intentionally left open:
# the wrappers created below keep referring to its datasets.
callset = h5py.File(RAW_HD5, mode='r')
calldata = callset['calldata']

# Sample names, genotype calls and the per-variant table.
samples = list(callset['samples'])
variants = allel.VariantChunkedTable(callset['variants'])
genotypes = allel.GenotypeChunkedArray(calldata['GT'])

# Boolean mask over variants, materialised once for reuse in later cells.
is_snp = variants.is_snp[:]

# Summary

In [None]:
# Per-sample tallies (summing over axis 0; the results are later indexed by
# `samples` when the summary table is built).
called = genotypes.is_called().sum(axis=0)
het = genotypes.is_het().sum(axis=0)
hom_alt = genotypes.is_hom_alt().sum(axis=0)
ref = genotypes.is_hom_ref().sum(axis=0)
# Fraction of called sites that are not homozygous-reference.
# NOTE(review): a sample with zero called sites would divide by zero here.
var_frac = 1 - (ref/called)[:]

In [None]:
# One row per sample, one column per statistic; the variant fraction is
# scaled by 1000 to read as variants per kilobase of called sequence.
summary = pd.DataFrame(
    np.column_stack((called, het, hom_alt, ref, 1000*var_frac)),
    index=samples,
    columns=['Called', 'Het', 'Hom Alt', 'Hom Ref', 'Variants/kbp'],
)


In [None]:
# Display the per-sample summary table through qgrid.
qgrid.show_grid(summary)

# Heterozygosity

In [None]:
# Per-sample heterozygous and homozygous-alternate call counts (y) plotted
# against the number of called sites for that sample (x).
plt.scatter(called, het, label="Het sites")
plt.scatter(called, hom_alt, c='red', label="Hom Alt site")
# The called-site count is the x variable, so it labels the x axis; the
# y axis carries the het / hom-alt counts named in the legend.
plt.xlabel("Called Site")
plt.ylabel("Het / Hom Alt sites")
plt.legend()

# Covariation of site-level statistics

In [None]:
def hexbin(x, y, color, **kwargs):
    '''Draw a 40x40 hexagonal-bin density plot of y against x.

    Uses a white-to-dark cubehelix colour map in which out-of-range-low
    and masked cells are painted white. `color` is accepted so the
    function fits the (x, y, color, **kwargs) plotting signature but is
    not used; remaining keyword arguments go straight to `plt.hexbin`.
    '''
    palette = sns.cubehelix_palette(n_colors=24, as_cmap=True, dark=0, light=1)
    # Empty / invalid cells should blend into the white background.
    for paint in (palette.set_under, palette.set_bad):
        paint('white')
    plt.hexbin(x, y, gridsize=40, cmap=palette, **kwargs)

def hist(x, **kwargs):
    '''Plot a plain count histogram of `x` (no KDE overlay).

    Keyword arguments are accepted but ignored; nothing is forwarded
    to `sns.distplot`.
    '''
    sns.distplot(x, kde=False)
    
# Per-variant statistics, transformed/clipped before plotting:
#   logDP               log(1 + DP)
#   SOR_clip            SOR limited to [0, 6]
#   ReadPosRankSum_clip ReadPosRankSum limited to [-5, 5]
#   MAF                 largest non-NaN AF entry of each variant
#   QD                  the QD field, loaded in full
logDP = np.log1p(variants['DP'])
SOR_clip = np.clip(variants['SOR'], a_min=0, a_max=6)
ReadPosRankSum_clip = np.clip(variants['ReadPosRankSum'], a_min=-5, a_max=5)
MAF = np.nanmax(variants['AF'], axis=1)
QD = variants['QD'][:]

df_variants = pd.DataFrame({
    'logDP': logDP,
    'SOR_clip': SOR_clip,
    'ReadPosRankSum_clip': ReadPosRankSum_clip,
    'MAF': MAF,
    'QD': QD,
})

# Pairwise grid over SNP rows only; missing values are plotted as 0.
g = sns.PairGrid(
    data=df_variants[variants.is_snp[:]].fillna(0),
    x_vars=['QD', 'logDP', 'MAF', 'SOR_clip', 'ReadPosRankSum_clip'],
    y_vars=['QD', 'logDP', 'MAF', 'SOR_clip', 'ReadPosRankSum_clip'],
    size=4,
    diag_sharey=False,
)
g.map_diag(hist)       # histograms on the diagonal
g.map_lower(hexbin)    # hexbin densities below the diagonal

# QUAL by Depth

In [None]:
def collapse_fancy_index(idx):
    '''Collapse runs of consecutive indices into half-open (start, end) blocks.

    Takes a 1-D sequence of integer indices, e.g. from np.where, argsort
    or lexsort, and returns a list of (start, end) tuples of Python ints
    such that each run i, i+1, ..., j in `idx` becomes (i, j + 1). Any
    step other than +1 (including repeats and decreases) starts a new
    block. An empty `idx` yields an empty list.

    >>> collapse_fancy_index([1,2,3,4,5,10,11,12,13])
    [(1, 6), (10, 14)]
    >>> collapse_fancy_index([])
    []
    '''
    idx = np.asarray(idx)
    if idx.size == 0:
        # No indices selected, so there are no blocks.
        return []
    # Positions in `idx` where a new run begins (the step from the
    # previous element is not exactly +1).
    breaks = np.flatnonzero(np.diff(idx) != 1) + 1
    run_starts = np.concatenate(([0], breaks))
    run_stops = np.concatenate((breaks, [idx.size]))
    return [(int(idx[first]), int(idx[last - 1]) + 1)
            for first, last in zip(run_starts, run_stops)]

def take_collpased_index(blocks, X):
    '''Similar to np.take. Slices X with each (start, end) tuple in
       `blocks` and concatenates the pieces along the first axis.

       X must be a Numpy or a Dask array; the result is of the same
       kind. If `blocks` is empty, an empty slice of X is returned.

       Raises TypeError for any other type of X.'''
    if isinstance(X, np.ndarray):
        concatenate = np.concatenate
    elif isinstance(X, da.Array):
        concatenate = da.concatenate
    else:
        raise TypeError("X must be either a Dask or Numpy array")

    blocks = list(blocks)
    if not blocks:
        # concatenate() rejects an empty list; a zero-length slice has the
        # matching type, dtype and trailing shape.
        return X[:0]
    return concatenate([X[s:e] for s, e in blocks])

# Lazy element-wise QUAL / DP over all variants.
# Note: this rebinds `QD`, which an earlier cell set to the 'QD' variants field.
QD = da.from_array(variants['QUAL'], chunks=10000)/da.from_array(variants['DP'], chunks=10000)

# Split into non-SNP (rQD) and SNP (vQD) rows, slicing contiguous runs
# of indices instead of fancy-indexing the dask array.
rQD = take_collpased_index(collapse_fancy_index(np.where(~is_snp)[0]), QD)
vQD = take_collpased_index(collapse_fancy_index(np.where(is_snp)[0]), QD)

# Materialise both selections, then replace NaN with 0.
# NOTE(review): nan_to_num also turns +/-inf (e.g. DP == 0) into very large
# finite numbers — confirm such sites cannot occur or filter them first.
with ProgressBar():
    rQD = np.nan_to_num(rQD.compute(num_workers=NCPU))
    vQD = np.nan_to_num(vQD.compute(num_workers=NCPU))

In [None]:
# Two panels side by side: SNP sites on the left, all other sites on the
# right. Only strictly positive QUAL/DP values are drawn.
fig, ax = plt.subplots(nrows=1, ncols=2)

sns.distplot(vQD[vQD > 0], ax=ax[0], kde=False)
ax[0].set_title('Variants')
ax[0].set_xlabel("QUAL/DP")
ax[0].set_ylabel("Sites")
ax[0].set_yscale('log')

sns.distplot(rQD[rQD > 0], ax=ax[1], kde=False)
ax[1].set_title('Non-variants')
ax[1].set_xlabel("QUAL/DP")
ax[1].set_yscale('log')

plt.tight_layout()

# Depth Distributions

In [None]:
# Per-call depth, chunked so that each dask block holds a single column.
DP = da.from_array(callset['calldata/DP'], chunks=(100000, 1))

# Global maximum depth, needed so every per-sample histogram has the same length.
with ProgressBar():
    DPmax = DP.max().compute(num_workers=NCPU)

# One lazy bincount per sample column, then evaluate them one after another.
counted_ = [da.bincount(DP[:, column], minlength=DPmax+1)
            for column in range(len(samples))]
counted = [lazy_counts.compute(num_workers=NCPU) for lazy_counts in counted_]

In [None]:
def log_bin(X, a, start=1):
    '''Histogram the values in X using geometrically widening bins.

    Bin edges begin at `start`; the j-th bin (j = 0, 1, ...) has width
    a**j. At least ceil(log(max(X)) / log(a)) bins are created, and
    further bins are appended until the last edge lies strictly above
    max(X), so that no value falls off the right-hand side. Values
    below `start` are dropped from the counts (but still contribute to
    the normalisation by len(X)).

    Parameters
    ----------
    X : 1-D array-like of numbers.
    a : growth factor of the bin widths; must be > 1.
    start : left edge of the first bin.

    Returns
    -------
    (centres, density) : two arrays with one entry per bin. `centres`
    is sqrt(ceil(left_edge) * floor(right_edge)); `density` is the bin
    count divided by (floor(right_edge) - ceil(left_edge) + 1) * len(X).

    Raises
    ------
    ValueError if X is empty or a <= 1.
    '''
    X = np.asarray(X)
    if X.size == 0:
        raise ValueError("X must contain at least one value")
    if a <= 1:
        raise ValueError("a must be greater than 1")
    x_max = np.max(X)

    # Create the bins
    j_max = 0 if x_max <= 1 else int(np.ceil(np.log(x_max)/np.log(a)))
    widths = [a**j for j in range(j_max)]
    bins = np.cumsum([start] + widths)
    # The estimate above falls short when a > 2 or max(X) <= start; keep
    # adding bins until the maximum is strictly inside the last one.
    while bins[-1] <= x_max:
        widths.append(a**len(widths))
        bins = np.cumsum([start] + widths)

    # Integerisation
    # NOTE(review): when a right edge is itself an integer it is counted in
    # this bin's width although np.digitize assigns it to the next bin.
    lefts, rights = np.ceil(bins[:-1]), np.floor(bins[1:])
    int_width = rights - lefts + 1
    centres = np.sqrt(lefts*rights)

    # Distribute
    indices = np.digitize(X, bins)
    # Slot 0 holds everything left of `start` and is discarded.
    counts = np.bincount(indices, minlength=len(bins))[1:len(bins)]
    return centres, counts/(int_width*len(X))

In [None]:
# Overlay one faint log-binned curve per sample on log-log axes.
# NOTE(review): each `c` is a bincount histogram (sites per depth value), so
# log_bin is binning those counts rather than depth values themselves —
# confirm this matches the "Read Depth" / "Probability" axis labels.
plt.figure(figsize=(12,8))
for c in counted:
    plt.plot(*log_bin(c, 1.75), '-', alpha=0.1,color='blue')
plt.loglog()
plt.xlabel("Read Depth")
plt.ylabel("Probability")

# INDEL Lengths

In [None]:
# Histogram (no KDE) of the largest 'svlen' entry per variant, with a
# logarithmic count axis set on the axes that distplot drew into.
sns.distplot(
    np.max(variants['svlen'], axis=1),
    kde=False,
).set_yscale('log')