In [1]:
import scipy
import numpy as np
import pandas as pd
import itertools as it

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are filled with further
    defaultdicts of the same kind, so nested keys can be assigned without
    creating the intermediate levels first."""
    nested = collections.defaultdict(recursively_default_dict)
    return nested

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth

from scipy.stats import invgamma 
from scipy.stats import beta

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

from IPython.display import clear_output

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


In [3]:
# Location and file-name parts of the analysis being inspected.
Home= ''
ID= 'AMOVA_rand_supervised'
suf_file= '_KDE_pca'

# Both inputs are tab-separated text files living under Home.
pca_path = ''.join([Home, ID, suf_file, '.txt'])
order_core_path = ''.join([Home, 'Order_core.txt'])

df = pd.read_csv(pca_path, sep= '\t')
orderCore= pd.read_csv(order_core_path, sep= '\t')

# Preview of the Order_core table.
orderCore.head()

Unnamed: 0.1,Unnamed: 0,ID,NAME,COUNTRY,REGION,sNMF_K3,Jap_K4,K9_cluster,Initial_subpop,genoIndex,code,label
0,0,CX59,"MILAGROSA,_ZAWA_BANDAY",Philippines,As5,4,1,cB_(Bas),aro,296,4,aro
1,1,CX65,DOMSIAH,Iran,As1,4,1,cB_(Bas),aro,301,4,aro
2,2,CX67,BINAM,Iran,As1,4,1,cB_(Bas),aro,303,4,aro
3,3,CX104,SADRI_RICE_1,Iran,As1,4,1,cB_(Bas),aro,338,4,aro
4,4,CX143,KHASAR,Iran,As1,4,1,cB_(Bas),aro,372,4,aro


In [4]:
### plot loadings:

def plot_accessions(pc1,pc2):
    """Scatter two columns of ``df`` against each other, one trace per sNMF_K3 group.

    Relies on notebook globals: ``df`` (must have an ``id`` column),
    ``orderCore`` (must have ``ID``, ``NAME``, ``COUNTRY``, ``Initial_subpop``
    and ``sNMF_K3`` columns) and ``ID`` (used in the figure title).

    Parameters
    ----------
    pc1, pc2 : int
        Positional column indices handed to ``df.iloc`` for the x and y
        values; also shown in the axis titles as ``PC<n>``.

    Raises
    ------
    ValueError
        If a value of ``df.id`` (as a string) does not occur in ``orderCore.ID``.
    """
    layout= go.Layout(
        title= 'Analysis: {}, MS structure'.format(ID),
        xaxis= dict(title= 'PC{}'.format(pc1)),
        yaxis= dict(title= 'PC{}'.format(pc2)),
        showlegend= True
    )

    # Position of the first occurrence of every ID in orderCore, built once
    # instead of re-scanning the whole column for each row of df.
    id_position = {}
    for position, accession in enumerate(orderCore.ID):
        id_position.setdefault(accession, position)

    # For each row of df: the positional row of the same accession in orderCore.
    names_index = []
    for raw_id in df.id:
        key = str(raw_id)
        if key not in id_position:
            raise ValueError('id {!r} from df is not in orderCore.ID'.format(key))
        names_index.append(id_position[key])

    opac= .8
    soiz= 8

    # names_index holds positions, so look the group code up positionally
    # (consistent with the orderCore.iloc lookup used for the hover text).
    scheme = [int(orderCore.sNMF_K3.iloc[position]) for position in names_index]

    # Group code -> row positions in df. Every row is kept: filtering the df
    # row number against names_index (which holds orderCore positions) would
    # drop points whenever df is not a full permutation of orderCore.
    coords = {}
    for row, group in enumerate(scheme):
        coords.setdefault(group, []).append(row)

    pop_refs= ["Indica","cAus","Japonica","GAP","cBasmati","Admix"]
    color_here= ["red","yellow","blue","silver","green","purple"]

    hover_columns = ["ID","NAME","COUNTRY","Initial_subpop"]

    def hover_label(accession):
        # Access by column name; integer lookups on a string-labelled row are
        # deprecated in pandas.
        return "<b>{}</b><br>Name: {}<br>Country: {}<br>{}".format(
            accession["ID"], accession["NAME"],
            accession["COUNTRY"], accession["Initial_subpop"])

    traces = []
    for group in sorted(coords):
        rows = coords[group]
        accessions = orderCore.iloc[[names_index[row] for row in rows],:][hover_columns]

        marker = {
            'line': {'width': 0},
            'size': soiz,
            'symbol': 'circle',
            "opacity": opac
        }
        try:
            label = pop_refs[group]
            marker['color'] = color_here[group]
        except IndexError:
            # Group code without a hard-coded label/colour: still draw it,
            # leaving the colour to plotly's defaults.
            label = 'Group {}'.format(group)
            marker.pop('color', None)

        traces.append(go.Scatter(
            x = df.iloc[rows,pc1],
            y = df.iloc[rows,pc2],
            mode= "markers",
            text= accessions.apply(hover_label, axis= 1),
            marker= marker,
            name= label
        ))

    fig = go.Figure(data=traces,layout= layout)
    iplot(fig)

# plot_accessions hands pc1/pc2 straight to df.iloc, which is 0-based, so the
# largest valid column position is df.shape[1] - 1. Offering df.shape[1] itself
# made the last dropdown entry raise IndexError.
interact(plot_accessions,pc1= list(range(1, df.shape[1])),pc2= list(range(2, df.shape[1])))

<function __main__.plot_accessions>

In [6]:

amova_suf= '_AMOVA'

# Tab-separated AMOVA table for the same analysis ID, stored next to the other inputs.
amova_path = ''.join([Home, ID, amova_suf, '.txt'])
amova_f = pd.read_csv(amova_path, sep= '\t')

amova_f.head()


Unnamed: 0,CHR,start,end,id,AMOVA,n_clusters
0,1,9871110.0,9873940.0,rand,0.71798,3
1,1,1373959.0,1379262.0,rand,0.556608,3
2,1,15466766.0,15474921.0,rand,0.712415,3
3,1,1500943.0,1505728.0,rand,0.247253,3
4,1,8697362.0,8702156.0,rand,0.819248,3


In [7]:
### Kernel density estimate of the AMOVA column of amova_f, drawn as a filled curve
# KernelDensity is also imported in the first cell; the import is kept here so
# the cell does not lose a name it previously provided.
from sklearn.neighbors import KernelDensity


# Grid on [0, 1] at which the fitted density is evaluated.
X_plot = np.linspace(0, 1, 1000)

# scikit-learn estimators reject NaN input, so rows without an AMOVA value are
# left out of the fit instead of aborting the cell.
amova_values = amova_f.AMOVA.dropna().values.reshape(-1,1)

kde = KernelDensity(kernel='gaussian', bandwidth=0.02).fit(amova_values)

# score_samples returns log-densities; they are exponentiated for plotting.
log_dens = kde.score_samples(X_plot.reshape(-1,1))

fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'AMOVA density',
                            line=dict(color='blue', width=2))]

layout= go.Layout(
    title= '{} distribution ID: {}'.format(amova_suf,ID),
    xaxis= dict(title= 'AMOVA'),
    yaxis= dict(title= 'density')
)

fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)