In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are filled by this same function.

    Because the default factory is `recursively_default_dict` itself, any
    depth of keys can be assigned without creating the intermediate levels first.
    """
    factory = recursively_default_dict
    return collections.defaultdict(factory)

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth

from scipy.stats import invgamma 
from scipy.stats import beta

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

from IPython.display import clear_output

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
# Initialise plotly's notebook mode before any `iplot` call in the cells below.
# NOTE(review): `connected=True` presumably loads plotly.js from the CDN instead of embedding it — confirm.
init_notebook_mode(connected=True)


### AMOVA output

This notebook provides a practical example of using among-group variation (AMOVA) estimated across genomic regions. 

The script `AMOVA_PCA.py` was run twice across all 12 chromosomes of rice, *Oryza sativa*, using the 3K RG data set (Mansueto et al. 2014). For the control run, the argument `--random` was passed, and `--Nrand` was set to 1000. For the test run, the list of all miRNA positions across this genome was provided using the argument `--aims`, and local margins of 1 kb were set using the argument `--mrg`.

In the first section we study the population structure produced by variation in mean shift cluster membership. MS cluster profiles are automatically produced by the script.



In [2]:
# --- Configuration: which files to load --------------------------------------
Home= ''                       # path prefix prepended to every input file name
ID= 'AMOVA_mRNA_supervised'    # file-name prefix shared by the test-run tables
suf_file= '_KDE_pca'           # suffix of the per-accession table (read into `df`)
amova_suf= '_AMOVA'            # suffix of the AMOVA table (read into `amova_f`)

# --- Load the tab-separated tables -------------------------------------------
# Per-accession table of the test run; later cells read its `id` column and
# index its remaining columns by position.
df = pd.read_csv(Home + ID + suf_file + '.txt',sep= '\t')

# Accession metadata; later cells use its ID, NAME, COUNTRY, Initial_subpop
# and sNMF_K3 columns.
orderCore= pd.read_csv(Home + 'Order_core.txt',sep= '\t')

# AMOVA statistics of the test run.
amova_f = pd.read_csv(Home + ID + amova_suf + '.txt',sep= '\t')

# Only the final expression of a cell is rendered, so the former mid-cell
# `amova_f.head()` never produced output and was removed; the AMOVA table is
# previewed in its own cell further down.
orderCore.head()

Unnamed: 0.1,Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex,code,label
0,0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296,4,aro
1,1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301,4,aro
2,2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303,4,aro
3,3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338,4,aro
4,4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372,4,aro


In [3]:
### plot loadings:

def plot_accessions(pc1,pc2):
    """Scatter-plot two columns of `df`, one trace per `sNMF_K3` group.

    Relies on the notebook globals `df`, `orderCore` and `ID`.

    Parameters
    ----------
    pc1, pc2 : int
        Positional column indices into `df` (passed to `df.iloc`) used for the
        x and y axes. They are also printed in the axis titles as 'PC<n>'.

    Returns
    -------
    None
        The figure is rendered inline through `iplot`.

    Raises
    ------
    ValueError
        If an entry of `df.id` is not present in `orderCore.ID`.
    """
    layout= go.Layout(
        title= 'Analysis: {}, MS structure'.format(ID),
        xaxis= dict(title= 'PC{}'.format(pc1)),
        yaxis= dict(title= 'PC{}'.format(pc2)),
        showlegend= True
    )

    # Row of `orderCore` matching each accession of `df`, in `df` order.
    # The ID list is built once rather than once per accession.
    core_ids = list(orderCore.ID)
    names_index = [core_ids.index(str(acc)) for acc in df.id]
    opac= .8
    soiz= 8

    # Group code of every accession of `df`.
    scheme = [int(orderCore.sNMF_K3[x]) for x in names_index]

    # Positions (rows of `df`) belonging to each group code.
    # The previous version additionally required `x in names_index`, i.e. it
    # compared a position in `df` with row numbers of `orderCore`; that test is
    # only always true when both tables share the same order, and otherwise
    # silently dropped points. Every accession is now kept.
    coords = {}
    for pos, group in enumerate(scheme):
        coords.setdefault(group, []).append(pos)

    # Group codes index directly into these two lists.
    pop_refs= ["Indica","cAus","Japonica","GAP","cBasmati","Admix"]
    color_here= ["red","yellow","blue","silver","green","purple"]

    traces = []
    # sorted() gives a deterministic legend order.
    for group in sorted(coords):
        rows = coords[group]
        meta = orderCore.iloc[[names_index[r] for r in rows]]
        # Hover text built from labelled columns (no positional Series indexing).
        hover = [
            "<b>{}</b><br>Name: {}<br>Country: {}<br>{}".format(*rec)
            for rec in zip(meta["ID"], meta["NAME"], meta["COUNTRY"], meta["Initial_subpop"])
        ]
        traces.append(go.Scatter(
            x = df.iloc[rows,pc1],
            y = df.iloc[rows,pc2],
            mode= "markers",
            text= hover,
            marker= {
                'color': color_here[group],
                'line': {'width': 0},
                'size': soiz,
                'symbol': 'circle',
                "opacity": opac
            },
            name= pop_refs[group]
        ))

    fig = go.Figure(data=traces,layout= layout)
    iplot(fig)

# Column positions offered by the two dropdowns. `plot_accessions` passes them
# to `df.iloc`, so the largest valid value is df.shape[1] - 1; the previous
# option lists ended at df.shape[1], which raised IndexError when selected.
interact(plot_accessions,pc1= list(range(1,df.shape[1])),pc2= list(range(2,df.shape[1])))

<function __main__.plot_accessions>

### AMOVA across miRNA regions

The `AMOVA` file produced by the script `AMOVA_PCA.py` is composed of 6 columns (table below). These columns provide information on the location where each statistic was extracted (**CHR**, **start** and **end**), an identifier for each locus (**id**, which defaults to *rand* if `--random` is set), the AMOVA statistic extracted (**AMOVA**) and the number of labels used (**n_clusters**). 



In [6]:
# Preview the first rows of the AMOVA table (`amova_f`) described above.
amova_f.head()

Unnamed: 0,CHR,start,end,id,AMOVA,n_clusters
0,1,26191362,26193385,osa-miR1862a,0.251906,3
1,1,4765355,4767375,osa-miR1431,0.505323,3
2,1,5989125,5991276,osa-MIR5817,0.287011,3
3,1,11893404,11895424,osa-miR2924,0.294245,3
4,1,22523147,22525246,osa-MIR156a,0.155465,3


Let us begin by looking at the distribution of AMOVA across the miRNA loci we surveyed.

In [5]:
### Density of the AMOVA statistic across the loci of the test run (`amova_f`)
# `KernelDensity`, `np`, `go` and `iplot` come from the notebook's first cell;
# the redundant mid-notebook re-import of KernelDensity was removed.

# Evaluation grid: 1000 evenly spaced points on [0, 1].
X_plot = np.linspace(0, 1, 1000)

# Gaussian kernel density estimate of the observed AMOVA values.
kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(np.array(amova_f.AMOVA).reshape(-1,1))

# Log-densities on the grid; exponentiated below for plotting.
log_dens = kde.score_samples(X_plot.reshape(-1,1))

# The trace is labelled with the analysis ID (it previously carried the
# unrelated, copy-pasted label 'Biased senarios').
fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= ID,
                            line=dict(color='blue', width=2))]

layout= go.Layout(
    title= '{} distribution ID: {}'.format(amova_suf,ID),
    yaxis= dict(title= 'density'),
    xaxis= dict(title= 'AMOVA')
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)

### Comparison to neutral distribution.

Because this is a working example, we will make it a bit more complex than just reading the results from a single analysis. As described in the introduction, in parallel to the analysis run on all miRNA loci, we estimated AMOVA between populations at 1000 random sites along the same data set. We intend to use this run as a null distribution, so let's first look at it.

We begin by reading this data.

In [7]:

# File-name prefix of the tables written by the control run.
control_ID= 'AMOVA_rand_supervised'

# Path layout: <Home><control_ID><amova_suf>.txt, tab-separated.
amova_control = pd.read_csv('{}{}{}.txt'.format(Home, control_ID, amova_suf), sep='\t')

amova_control.head()

Unnamed: 0,CHR,start,end,id,AMOVA,n_clusters
0,1,9871110.0,9873940.0,rand,0.71798,3
1,1,1373959.0,1379262.0,rand,0.556608,3
2,1,15466766.0,15474921.0,rand,0.712415,3
3,1,1500943.0,1505728.0,rand,0.247253,3
4,1,8697362.0,8702156.0,rand,0.819248,3


Now we look at the distribution of AMOVA values extracted at random sites.

In [8]:
### Density of the AMOVA statistic at the random sites of the control run (`amova_control`)
# `KernelDensity`, `np`, `go` and `iplot` come from the notebook's first cell;
# the redundant mid-notebook re-import of KernelDensity was removed.

# Evaluation grid: 1000 evenly spaced points on [0, 1].
X_plot = np.linspace(0, 1, 1000)

# Gaussian kernel density estimate of the control AMOVA values.
# `kde` fitted here is the null-distribution model that the p-value cell
# below scores observations against, so the name must stay `kde`.
kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(np.array(amova_control.AMOVA).reshape(-1,1))

# Log-densities on the grid; exponentiated below for plotting.
log_dens = kde.score_samples(X_plot.reshape(-1,1))

# Trace and title now report the control run: they previously used the stale
# label 'Biased senarios' and the test-run `ID`, mislabelling this figure.
fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= control_ID,
                            line=dict(color='blue', width=2))]

layout= go.Layout(
    title= '{} distribution ID: {}'.format(amova_suf,control_ID),
    xaxis= dict(title= 'AMOVA'),
    yaxis= dict(title= 'density')
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)

Already we can see a clear difference between the results of this analysis and those of the analysis targeting miRNA loci. Now, in a population genetics approach, using the above distribution as null, our question regarding AMOVA values at miRNA loci is which of them deviate significantly from our expectation.

### Extracting *p*-values from the control distribution

Our strategy here will be to use the density of AMOVA values across the spectrum to derive *p*-values for new observations.

To do this we will first extract the log-likelihood, under our null distribution, of the AMOVA values (within the range [0, 1]) observed at random sites. We will assume that these log-likelihoods are normally distributed and use the cdf of that normal distribution. 

In [9]:
# NOTE: `kde` must be the estimator fitted on `amova_control.AMOVA` (the
# control-density cell above). The final density cell of this notebook rebinds
# `kde`, so re-run the control-density cell before re-running this one.

# Log-density of every control observation under the null KDE.
ref_pdist= kde.score_samples(np.array(amova_control.AMOVA).reshape(-1,1))

# Normal model of those log-densities. It is built once and reused below
# (it was previously constructed twice with identical parameters).
null_model= scipy.stats.norm(np.mean(ref_pdist),np.std(ref_pdist))

# 1000 draws from the null KDE, scored and pushed through the normal cdf.
# The draw is seeded so the notebook reproduces the same values on every run.
null_draws= kde.sample(1000, random_state= 0)
null_draws_logdens= kde.score_samples(null_draws.reshape(-1,1))
kde_pval= null_model.cdf(null_draws_logdens)

# Same transformation applied to the AMOVA values of the test run.
mRNA_kde= kde.score_samples(np.array(amova_f.AMOVA).reshape(-1,1))
Dist = null_model.cdf(mRNA_kde)


Let's look at the *p*-value obtained for each value of AMOVA under our null distribution:

In [10]:
# Figure title and axis titles for the AMOVA -> p-value mapping.
layout= go.Layout(
    title= 'AMOVA to p-value',
    xaxis= {'title': 'AMOVA'},
    yaxis= {'title': 'p-value'}
)

# One marker per row of `amova_f`: AMOVA value (x) against its entry in `Dist` (y).
fig = go.Figure(
    data=[go.Scatter(x= amova_f.AMOVA, y= Dist, mode= 'markers')],
    layout= layout
)
iplot(fig)


Finally we look at the distribution of the *p*-values of the AMOVA statistics calculated at miRNA loci, alongside that of the *p*-values of values sampled from the null distribution.

In [12]:


# Densities of the p-values of the test-run loci (`Dist`) and of the draws
# from the null distribution (`kde_pval`), evaluated on one common grid.
# NOTE: this cell rebinds `kde`, which the p-value cell above expects to be
# the estimator fitted on the control AMOVA values; re-run the control-density
# cell before re-running the p-value cell.

# Evaluation grid: 1000 evenly spaced points on [0, 1].
# It is defined once; the second, identical assignment was removed.
X_plot = np.linspace(0, 1, 1000)

# Density of the p-values obtained for the test run.
kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(np.array(Dist).reshape(-1,1))

log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'mRNA pVal',
                            line=dict(color='blue', width=2))]

# Density of the p-values obtained for the null draws.
kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(np.array(kde_pval).reshape(-1,1))

log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens.append(go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'random pVal',
                            line=dict(color='red', width=2)))

# The x axis shows p-values (it was previously mislabelled 'AMOVA').
layout= go.Layout(
    title= '{} p-value distribution ID: {}'.format(amova_suf,ID),
    yaxis= dict(title= 'density'),
    xaxis= dict(title= 'p-value')
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)