In [None]:
##luigi-vars
# Path of the HDF5 callset that is opened with h5py further down.
# 'default' is a placeholder (presumably replaced by the luigi pipeline
# through the ##luigi-vars tag above; confirm against the pipeline code).
SNPS_HD5 = 'default'

In [None]:
import vcfnp
import numpy as np
import h5py
import matplotlib.pyplot as plt
import matplotlib as mpl
import allel
import seaborn as sns
import pandas as pd
from sklearn.mixture import GaussianMixture
from matplotlib.backends.backend_pdf import PdfPages

plt.rcParams['figure.figsize'] = (12,6)
%matplotlib inline

In [None]:
# Open the callset read-only. The handle is left open on purpose: `DP` below
# is a lazy h5py dataset that later cells still read from.
callset = h5py.File(SNPS_HD5, mode='r')
calldata = callset['calldata']
genotypes = allel.GenotypeChunkedArray(calldata['GT'])
samples = list(callset['samples'])
variants = allel.VariantChunkedTable(callset['variants'])[:]

# Evaluate the per-call boolean masks once and reuse them; each call to
# is_called()/is_het() is a full pass over the chunked genotype array.
called_mask = genotypes.is_called()[:]
het_mask = genotypes.is_het()[:]

# Per-call plot marker: '.' for a called genotype, '+' for a missing one.
called = np.where(called_mask, '.', '+')
# Per-call plot colour: green = called het, red = called non-het, black = missing.
het = np.where(called_mask, np.where(het_mask, 'green', 'red'), 'black')

DP = calldata['DP']
# Calls kept for the allele-frequency analysis: non-zero depth, called, and het.
cov = (DP[:] != 0) & called_mask & het_mask



In [None]:
# Order each call's allele depths from largest to smallest along the last axis.
sort = np.sort(calldata['AD'], axis=-1)[..., ::-1]
# Pool the depth of every allele ranked third or lower into one value per call.
third = sort[..., 2:].sum(axis=-1)
# Last axis of AD is now (rank-1 depth, rank-2 depth, pooled remainder).
AD = np.concatenate((sort[..., :2], third[..., None]), axis=2)

In [None]:
def BIC_ratio(AF, plot=False, ax=None):
    '''Fit a 1 and 3 component GMM to the AF and compare the BIC values.

    Parameters
    ----------
    AF : numpy.ndarray
        Allele-frequency values. The array is flattened into a single
        column before fitting, so any shape is accepted.
    plot : bool, optional
        When true, draw both fitted mixture densities on top of a
        density-normalised histogram of ``AF``.
    ax : matplotlib axes, optional
        Axes to draw on when ``plot`` is true. If omitted, a new figure is
        created and its axes are used.

    Returns
    -------
    float
        ``BIC(1 component) / BIC(3 components)``.
    '''
    AF = AF.reshape((-1,1))
    gmm = [GaussianMixture(i).fit(AF) for i in [1,3]]
    bic = [model.bic(AF) for model in gmm]
    if plot:
        if ax is None:
            plt.figure()
            ax = plt.gca()
        # Evaluate each fitted density on [0, 1]; score_samples returns
        # log-densities, hence the exp.
        grid = np.linspace(0, 1, 100)
        for model in gmm:
            ax.plot(grid, np.exp(model.score_samples(grid.reshape(-1,1))))
        # `normed` was removed from hist() in matplotlib 3.1; `density` is
        # the replacement.
        ax.hist(AF, bins=100, density=True);
        # Label the axes that were drawn on. plt.xlabel would label
        # whichever axes pyplot considers current, which is not `ax` when
        # the caller passes a subplot from a grid.
        ax.set_xlabel("Site Allele Frequency")
    return bic[0]/bic[1]

In [None]:
!mkdir QC

In [None]:
# Plots all of the allele frequencies and the BIC ratios:
# one PDF page per sample, and br[i] holds that sample's BIC ratio.
br = np.zeros(len(samples), dtype=float)  # np.float was removed in NumPy 1.24
depth_all = DP[:]  # read the depth array once instead of once per sample
# The context manager closes the PDF even if a sample fails part-way through.
with PdfPages("QC/allele_freq.pdf") as pp:
    for i, s in enumerate(samples):
        keep = cov[:,i]
        # Frequency of the two deepest alleles at this sample's selected sites.
        X = AD[keep,i,:2]/depth_all[keep,i,None]
        br[i] = BIC_ratio(X, plot=True)
        plt.title("{0}\nBR = {1:.3}".format(s,br[i]))
        pp.savefig()
        # Close each figure so they do not pile up in memory.
        plt.close()

In [None]:
# Grid of per-sample allele-frequency histograms, four samples per row.
# Ceiling division: the old int(len/4)+1 added an empty row whenever the
# sample count was a multiple of 4. Keep at least one row for subplots().
nrows = max(1, (len(samples) + 3) // 4)
br = np.zeros(len(samples), dtype=float)  # np.float was removed in NumPy 1.24
fig, ax_list = plt.subplots(ncols=4, nrows=nrows, sharex=True, sharey=True)
fig.set_size_inches((12, nrows*1.5))
depth_all = DP[:]  # read the depth array once instead of once per sample
for i, s in enumerate(samples):
    keep = cov[:,i]
    # Frequency of the two deepest alleles at this sample's selected sites.
    X = AD[keep,i,:2]/depth_all[keep,i,None]
    br[i] = BIC_ratio(X, plot=True, ax=ax_list.flat[i])
    ax_list.flat[i].set_title("{0}\nBR = {1:.3}".format(s,br[i]))

In [None]:
# One bar per sample showing the BIC ratio stored in `br`.
sample_labels = list(samples)
sns.barplot(x=sample_labels, y=br)
plt.xticks(rotation=90);
plt.title("BIC(n=1)/BIC(n=3)")

In [None]:
# Grid of per-sample scatter plots: rank-1 vs rank-2 allele depth on log axes.
# This cell no longer touches `br`: it never used it, and re-zeroing it here
# wiped the BIC ratios computed earlier (and crashed on np.float).
# Ceiling division avoids an empty extra row when the sample count is a
# multiple of 4; keep at least one row for subplots().
nrows = max(1, (len(samples) + 3) // 4)
fig, ax_list = plt.subplots(ncols=4, nrows=nrows, sharex=True, sharey=True)
fig.set_size_inches((12, nrows*3))

for i, s in enumerate(samples):
    ax = ax_list.flat[i]
    keep = cov[:,i]
    ax.set_title(s)
    ax.scatter(AD[keep,i,0], AD[keep,i,1], alpha=0.1 )
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim((1e0,1e4))
    ax.set_ylim((1e0,1e4))

# Label the outer edge of the grid. plt.xlabel/plt.ylabel only label the
# current axes, i.e. a single subplot. atleast_2d handles the one-row case,
# where subplots() returns a 1-D array.
axes_grid = np.atleast_2d(ax_list)
for edge_ax in axes_grid[:,0]:
    edge_ax.set_ylabel('Rank 2 AD')
for edge_ax in axes_grid[-1,:]:
    edge_ax.set_xlabel("Rank 1 AD")
