In [None]:
##luigi-vars
# NOTE(review): the empty-string defaults look like placeholders that are filled
# in from outside the notebook (see the marker above) — confirm.
# File name handed to h5py.File(..., mode='r') when the call set is loaded.
SNP_HD5 = ''
# Passed into the R cell and read there with read.tree().
TREE_NWK = ''
# A site is kept only when its fraction of missing calls is below 1 - MIN_COV.
MIN_COV = 0.8
# First number of clusters (K) tried by the K-means sweep.
CLUSTER_MIN = 2
# K of the fitted K-means model (km_list[N_CLUST - CLUSTER_MIN]) whose labels train the LDA.
N_CLUST = 2
# End of the sweep: it feeds range(CLUSTER_MIN, CLUSTER_MAX), so this value itself is not tried.
CLUSTER_MAX = 25
# Threshold passed to allel.locate_unlinked() when pruning linked sites.
MAX_LINKAGE=0.95

# DAPC Clustering

In [None]:
import vcfnp
import numpy as np
import h5py
import matplotlib as mpl
import matplotlib.pyplot as plt

import allel
import seaborn as sns
import pandas as pd

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score,calinski_harabaz_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

%matplotlib inline
sns.set_style('whitegrid')
mpl.rcParams['figure.figsize'] = (10, 6)

In [None]:
def compute_bic_adegenet(km,X):
    """Score a fitted K-means model with a BIC-style criterion.

    The value is ``N * log(WSS / N) + k * log(N)``, where ``WSS`` is the summed
    squared distance of every row of ``X`` to the centre of the cluster it is
    assigned to, ``N`` is the number of rows and ``k`` the number of clusters.

    Parameters
    ----------
    km : object exposing ``n_clusters``, ``cluster_centers_`` and ``predict``
        (for example a fitted ``sklearn.cluster.KMeans``).
    X : 2-D numpy array with one observation per row.

    Returns
    -------
    float
        The criterion value for ``km`` on ``X``.
    """
    # number of clusters
    k = km.n_clusters
    # size of data set (unpacking also rejects anything that is not 2-D)
    N, _ = X.shape

    # Assign every row once up front rather than once per cluster.
    labels = km.predict(X)

    # Within-cluster sum of squares, accumulated cluster by cluster.
    WSS = sum(np.sum((km.cluster_centers_[i] - X[labels == i, :]) ** 2)
              for i in range(k))

    return N * np.log(WSS / N) + k * np.log(N)

def plot_ld(gn, title):
    """Draw a pairwise LD plot for ``gn`` and return the plotted values.

    ``gn`` is handed to ``allel.stats.rogers_huff_r``; the result is squared,
    drawn with ``allel.plot.pairwise_ld`` under the heading ``title`` and
    returned to the caller.
    """
    r = allel.stats.rogers_huff_r(gn)
    r_squared = r ** 2
    allel.plot.pairwise_ld(r_squared).set_title(title)
    return r_squared

# Loading Biallelic SNPs

In [None]:
# Open the HDF5 call set read-only and wrap its datasets.
# NOTE(review): the file handle is never closed — presumably because the
# chunked wrappers keep reading from it; confirm.
callset = h5py.File(SNP_HD5, mode='r')
gt_dataset = callset['calldata']['GT']
genotypes = allel.GenotypeChunkedArray(gt_dataset)
variants = allel.VariantChunkedTable(callset['variants'])
# Sample identifiers, materialised as a plain Python list.
samples = list(callset['samples'][:])

In [None]:
# Display the loaded genotype array as this cell's output.
genotypes

# Filter Sites and Individuals


In [None]:
# Missing calls per row (site), as a fraction of the size of axis 1.
missing_fraction = genotypes.count_missing(axis=1)[:] / genotypes.shape[1]
# Keep the sites whose missing fraction is below the allowed maximum, 1 - MIN_COV.
well_covered = missing_fraction < (1 - MIN_COV)
filtered = genotypes[well_covered, :]
print(filtered.shape)
filtered

In [None]:
# Alternate-allele counts per call; -1 is the fill value requested from to_n_alt().
alt_counts = allel.AlleleCountsChunkedArray(filtered.to_n_alt(fill=-1))
# Turn every -1 into NaN so it can be excluded from the row means below.
alt_counts_nan = np.where(alt_counts == -1, np.array([np.nan]), alt_counts)
# Mean of the non-NaN values of each row, as a column vector for broadcasting.
row_means = np.nanmean(alt_counts_nan, axis=1).reshape(-1, 1)
# Replace each NaN with the mean of its own row.
n_alt = np.where(np.isnan(alt_counts_nan), row_means, alt_counts_nan)

# Linkage Disequilibrium

In [None]:
# Boolean selection returned by locate_unlinked, using one window as long as the
# whole data set, a step of 1 and MAX_LINKAGE as the threshold.
unlinked_mask = allel.locate_unlinked(
    n_alt,
    size=n_alt.shape[0],
    step=1,
    threshold=MAX_LINKAGE,
)
unlinked = n_alt[unlinked_mask, :]

n_sites_before = filtered.shape[0]
n_sites_after = unlinked.shape[0]
print("Unlinking reduced the number of sites from {0} to {1}".format(n_sites_before, n_sites_after))

# DAPC 

In [None]:
cluster_range = range(CLUSTER_MIN, CLUSTER_MAX)

# Centre every column of the transposed matrix on its mean; variances are left as they are.
X = scale(unlinked.T, with_std=False, with_mean=True)

# PCA cannot return more components than min(rows, columns) of X, so cap the
# request there instead of assuming at least as many sites as samples remain.
n_pca1 = min(X.shape)

# Perform PCA retaining all components
pca = PCA(n_pca1)
Y = pca.fit_transform(X)
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.bar(np.arange(n_pca1), cum_var)

# Zero-based position of the first component at which the cumulative explained
# variance exceeds 0.95. Indexing with [0, 0] yields a scalar, so int() is not
# applied to a 1-element array.
# NOTE(review): this position is later used as a component count (PCA(n_pca2)),
# i.e. one component fewer than is needed to pass 0.95 — confirm this is intended.
n_pca2 = int(np.argwhere(cum_var > 0.95)[0, 0])

plt.vlines(n_pca2, 0, 1)
plt.ylim((0,1))
plt.xlabel("PCA Components")
plt.ylabel("Explained variance");

In [None]:
# K-means clustering of the PCs: one fitted model per candidate number of clusters.
# max_iter is given as an integer (1e5 is a float), and a fixed random_state
# makes the sweep repeatable after a kernel restart.
KMEANS_SEED = 0
km_list = [
    KMeans(n_clusters=k, tol=1e-7, max_iter=100000, n_init=100, random_state=KMEANS_SEED).fit(Y)
    for k in cluster_range
]

# K selection metrics

In [None]:
# Candidate K values and the cluster assignment of Y under each fitted model.
# The assignments are computed once and shared by both label-based scores.
k_values = list(cluster_range)
labels_per_k = [km.predict(Y) for km in km_list]

# Calinski Harabaz score
ch = [calinski_harabaz_score(Y, labels) for labels in labels_per_k]

# Silhouette Score
sil = [silhouette_score(Y, labels) for labels in labels_per_k]

# BIC
bic = [compute_bic_adegenet(km,Y) for km in km_list]

# One panel per criterion; each axes object is created once and reused.
plt.figure(figsize=(24, 8))

ax_bic = plt.subplot(131)
ax_bic.plot(k_values, bic, '.-')
ax_bic.set_title("BIC")
ax_bic.set_xlabel("Number of clusters (K)")

# x and y are passed by keyword: recent seaborn releases reject positional data vectors.
ax_ch = plt.subplot(132)
sns.barplot(x=k_values, y=ch, ax=ax_ch)
ax_ch.set_title("Calinski Harabaz score")
ax_ch.set_xlabel("Number of clusters (K)")

ax_sil = plt.subplot(133)
sns.barplot(x=k_values, y=sil, ax=ax_sil)
ax_sil.set_title("Silhouette score")
ax_sil.set_xlabel("Number of clusters (K)")

plt.tight_layout()

In [None]:
lda = LinearDiscriminantAnalysis(solver='eigen', tol=1e-6,  shrinkage='auto')

# Re-project the centred data onto n_pca2 principal components and train the LDA
# on them, labelled by the K-means model that was fitted with N_CLUST clusters.
Z = PCA(n_pca2).fit_transform(X)
lda.fit(Z, km_list[N_CLUST-CLUSTER_MIN].predict(Y))

# x and y are passed by keyword: recent seaborn releases reject positional data vectors.
# NOTE(review): the bar heights are the *squared* explained-variance ratios — confirm intended.
ld_index = list(range(len(lda.explained_variance_ratio_)))
sns.barplot(x=ld_index, y=lda.explained_variance_ratio_**2);

In [None]:
plt.figure(figsize=(12,12))

# Project once and reuse the coordinates. When LDA yields a single discriminant
# axis (as it can with only two groups), the second coordinate falls back to zero
# instead of raising IndexError.
ld_coords = lda.transform(Z)
ld1 = ld_coords[:, 0]
ld2 = ld_coords[:, 1] if ld_coords.shape[1] > 1 else [0.0] * len(ld1)

# One colour per predicted group. The palette is built once and holds at least
# ten colours (so existing colours are unchanged) but grows with the largest
# group label, which avoids an IndexError when more than ten groups are used.
assigned = [int(i) for i in lda.predict(Z)]
palette = sns.color_palette(n_colors=max(10, max(assigned) + 1))

plt.scatter(x=ld1, y=ld2, c=[palette[i] for i in assigned], s=50,  marker='o')
plt.xlabel("LD1")
plt.ylabel("LD2");

In [None]:
# Group-membership probabilities predicted by the fitted LDA for the rows of Z.
df = pd.DataFrame(lda.predict_proba(Z))
# Size the figure through pandas: DataFrame.plot opens its own figure, so
# resizing plt.gcf() beforehand only produced an extra, empty figure.
df.plot(kind='bar',stacked=True, figsize=(12, 4));

In [None]:
# Column index of the largest predicted probability for each row of Z.
best_group = lda.predict_proba(Z).argmax(axis=1)
# Stack names and indices into one array (so both columns share a dtype), then
# transpose to get one (sample, group) pair per row.
sample_group_pairs = np.stack([list(samples), best_group]).T
groups = pd.DataFrame(sample_group_pairs, columns=['samples', 'group'])

In [None]:
# Load the rpy2 IPython extension, which provides the %%R cell magic used below.
%load_ext rpy2.ipython

In [None]:
%%R -w1000 -h1000 -o svg -i groups -i TREE_NWK
# Draws the tree read from TREE_NWK with its tips coloured by the `group` column of `groups`.
# Both `groups` and `TREE_NWK` are pushed in from Python via the -i flags above.
# NOTE(review): `-o svg` asks for an R object named `svg` to be exported back to
# Python, but this cell never assigns one — presumably an image-format option
# was intended; confirm.

library(ggtree)
library(ggplot2)
library(dplyr)
library(ape)

# Read the tree; di2multi() is called with a tolerance of 1e-5.
tree <- di2multi(read.tree(TREE_NWK), 1e-5)
p <- ggtree(tree, layout="rectangular", ladderize=TRUE)
# NOTE(review): `d` is not used again in this cell.
d <- p$data

# Append external data
p <- p %<+% groups

# Draw a point on each tip (subset=isTip), coloured by `group`
p <-p + geom_point2(aes(color=group, subset=isTip ), size=1)
# Put the legend on the left
p <- p + theme_tree2(legend.position='left')
# Evaluating `p` as the last expression renders the plot
p