In [1]:

import dash
from dash.dependencies import Input, Output
import dash_html_components as html
import dash_core_components as dcc
import plotly.graph_objs as go

import plotly.figure_factory as ff

import numpy as np
import pandas as pd
import itertools as it

from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn import preprocessing

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

import scipy

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Build an auto-vivifying dictionary.

    Every missing key is filled with another dictionary of the same kind,
    so nested paths such as ``d[a][b][c] = value`` can be assigned without
    creating the intermediate levels first.
    """
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)


In [26]:

### read data sets

# Sizes of chromosomes 1-12, keyed by chromosome number
# (units are not stated here; presumably base pairs -- confirm).
chromosome_sizes = {
    1: 43241123,
    2: 35925311,
    3: 36399800,
    4: 35498544,
    5: 29906830,
    6: 31207206,
    7: 29661716,
    8: 28418058,
    9: 22933762,
    10: 23189644,
    11: 28998832,
    12: 27523550
}

# Colour names handed to the plotting helpers further down.
color_ref= ['red','yellow','blue','gap','green','green','purple','green','deepskyblue2','red3','darkolivegreen1','navy','chartreuse','darkorchid3','goldenrod2']


# Pieces of the input file names assembled below.
ID= "JapanKorea_tropical_Rdist"

Where= "12"

ref= "1"

pop_refs= ['unlabelled', '1', '3', '4']

###

Ordercore_file= 'Order_core_csv.txt'

try:
    import cPickle as pickle
except ImportError:  # python 3.x
    import pickle

# NOTE: unpickling can execute arbitrary code; only load files produced locally.
# The context manager closes the handle as soon as the object is loaded.
with open('Layer_analysis.p', 'rb') as fp:
    data = pickle.load(fp)

# Read through the variable declared above instead of repeating the path.
orderCore= pd.read_csv(Ordercore_file)

df = pd.read_csv('DIM_private_'+ ref +'_request_CHR'+ Where +'.'+ID+'.txt',sep= '\t',header= None)


Slice = pd.read_csv("Profile_coordinates_" + ref + "_CHR" + Where + "."+ ID + ".txt", sep = '\t')

# Distance tables and reference statistics come from the pickled analysis;
# the commented lines are the former text-file equivalents.
#Distances= pd.read_csv('Distances.' + ID + '.txt',sep= '\t')
Distances= data['Distances']

Centre_distances= data['centre_dists']

Ref_stats= data['Ref_stats']
#Centre_distances= pd.read_csv('Centre_dist.' + ID + '.txt', sep= '\t')

# `Dr` selects which reduction's entry of `data` the later cells read (data[Dr]).
Reductions= ['MeanShift','Kmeans','Ward','DBscan']
Dr= 'Kmeans'

ceff_lib_filename= 'coeff_library.p'


# Coefficient library, indexed later as coeff_lib[ploidy][Nsnps].
with open(ceff_lib_filename, 'rb') as fp:
    coeff_lib = pickle.load(fp)

#Distances= preprocessing.scale(Distances,axis= 1)
#Centre_distances= preprocessing.scale(Centre_distances,axis= 1)

In [3]:
# Show the top-level entries of the analysis object loaded from 'Layer_analysis.p'.
data.keys()

dict_keys(['Ref_stats', 'centre_dists', 'Kmeans', 'Distances', 'MeanShift', 'DBscan', 'Ward'])

In [4]:
### plot clusters:
# Features and first-level labels stored under the reduction selected by `Dr`.

from targetedDistance_plot_tools import plot_clusters

clusters= data[Dr]['features']
clusters_labels= data[Dr]['labels_l1']

# Interactive alternative:
#interact(plot_clusters,selected= [x for x in range(len(clusters.label.unique())+1)])

plot_clusters(
    ID=ID,
    clusters=clusters,
    clusters_labels=clusters_labels,
    selected=0,
)

In [5]:
# Preview the first rows of the table read from the 'DIM_private_...' file.
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,IRIS_313-8436,102.448728,11.13233,-3.003371,31.071679,23.177418,
1,IRIS_313-9949,110.566017,12.62593,6.192479,20.551386,17.068588,
2,IRIS_313-10740,95.4908,33.990767,7.000005,45.600488,39.088043,
3,IRIS_313-10771,-45.292773,60.335178,40.342032,9.814943,23.390124,
4,IRIS_313-10785,71.430253,82.718345,21.512972,50.980094,109.415999,


In [6]:
# Values accepted by the geography filter passed to the plotting helper.
allow_geo= [
    'Japan', 'SouthKorea', 'NorthKorea', 'Myanmar',
    'Thailand', 'Laos', 'China', 'India',
    'Malaysia', 'Philippines', 'Bangladesh', 'Cambodia',
    'Indonesia', 'Vietnam', 'Bhutan', 'Nepal',
]
## to disable the selection, uncomment the next line instead:
#list(set(orderCore.loc[x,'COUNTRY']))

# Values accepted by the sub-population filter.
allow_sbgp= [
    'trop',
    'temp',
    'subtrop',
    'japx',
]
## to disable the selection, uncomment the next line instead:
#orderCore.loc[x,'Initial_subpop']

In [7]:
from targetedDistance_plot_tools import plot_accessions
### plot loadings:
vectors= data[Dr]['KDE']


#interact(plot_accessions,selected_column= [x for x in range(-2,len(clusters.label.unique())+1)])

# `fixed` is an ipywidgets marker that only has meaning inside interact();
# in a direct call it would hand the function a wrapper object instead of the
# stored value, so `vectors` is passed as-is, like every other argument.
plot_accessions(selected_column= -1,
                    df= df,
                    vectors= vectors,
                    orderCore= orderCore,
                    allow_geo= allow_geo,
                    allow_sbgp= allow_sbgp,
                    color_ref= color_ref,
                    opac= .8,
                    cols_text= ["ID","NAME","COUNTRY","Initial_subpop"])

In [8]:
from targetedDistance_plot_tools import return_densities

# Interactive alternative:
#interact(return_densities,label= range(1,11))

# Static call for label 5 under the reduction selected by `Dr`.
return_densities(
    label=5,
    Dr=Dr,
    data=data,
    Distances=Distances,
    Centre_distances=Centre_distances,
)


### Convert PCA Euclidean distances to Fst

In [27]:
# Ploidy level used as the first key into the coefficient library.
ploidy= 2

# Distinct Slice.Nsnps values that have no entry in coeff_lib[ploidy] yet.
absent= list({nsnps for nsnps in Slice.Nsnps if nsnps not in coeff_lib[ploidy]})
len(absent)

0

In [28]:
from Generate_freq_vectors import generate_Branches_Beta
from Euc_to_fst import Euc_to_fst
import os

# For every SNP count missing from the library, generate vectors, fit the
# Euclidean-to-Fst relation on them and store the fitted pair, then persist
# the enlarged library back to disk.
if absent:

    # Settings shared by every missing length (hoisted out of the loop).
    # They are now actually forwarded to the generator; previously the call
    # repeated the literals 4 and 50, so editing these names had no effect.
    # NOTE(review): the positional mapping (Nbranches, density) is inferred
    # from the matching values -- confirm against generate_Branches_Beta.
    Nbranches= 4
    density= 50
    n= 100
    rangeA= [1,2.5]
    rangeB = [.1,.6]
    steps= 20

    for length in absent:
        L= length
        n_comp = length

        features, vector_lib= generate_Branches_Beta(Nbranches,density,L,n,rangeA,rangeB,steps,n_comp)
        print(features.shape)
        print(vector_lib.shape)

        # Only the first two returned values are kept in the library.
        m_coeff, b, biased_pairwise, fst_x, y_true= Euc_to_fst(vector_lib)

        coeff_lib[ploidy][length]= {
            'coeff': m_coeff,
            'b': b
        }

    # Written once, after all missing lengths have been added.
    with open(ceff_lib_filename, 'wb') as fp:
        pickle.dump(coeff_lib, fp, protocol=pickle.HIGHEST_PROTOCOL)


# One coefficient / constant per row of Slice, looked up by its Nsnps value.
coeff_list= [coeff_lib[ploidy][x]['coeff'] for x in Slice.Nsnps]
const_list= [coeff_lib[ploidy][x]['b'] for x in Slice.Nsnps]


In [29]:
from targetedDistance_plot_tools import return_cluster_refs

# Interactive alternative:
#interact(return_refs,label= range(1,11),Z= [0,1,2])

# Static call; every input is passed by keyword.
return_cluster_refs(
    label=5,
    Z=2,
    ID=ID,
    Dr=Dr,
    data=data,
    df=df,
    Slice=Slice,
    orderCore=orderCore,
    Ref_stats=Ref_stats,
    Distances=Distances,
    Centre_distances=Centre_distances,
    coeff_list=coeff_list,
    const_list=const_list,
    color_ref=color_ref,
)


dict_keys([0, 1, 2])



divide by zero encountered in log



In [31]:
from targetedDistance_plot_tools import return_refs

# Interactive alternative:
#interact(return_refs,t1= range(-2,20),t2= range(2,20),Z= ['raw','scaled','Fst'],registered=['inlier','requested'])

# Static call; the two leading positional values line up with t1 and t2 in
# the interact line above.
return_refs(
    -2, 2,
    Z='Fst',
    registered='inlier',
    Wsnps=50,
    ID=ID,
    df=df,
    Slice=Slice,
    orderCore=orderCore,
    Ref_stats=Ref_stats,
    Distances=Distances,
    Centre_distances=Centre_distances,
    coeff_list=coeff_list,
    const_list=const_list,
    color_ref=color_ref,
)


dict_keys([0, 1, 2, 5])
792



divide by zero encountered in log



In [32]:
### Distribution of feature space distances between control populations for even and biased scenarios
from targetedDistance_plot_tools import dist_Centre_Fst

dist_Centre_Fst(
    ID=ID,
    Wsnps=20,
    Slice=Slice,
    Ref_stats=Ref_stats,
    coeff_list=coeff_list,
    const_list=const_list,
)

### Studying individual clusters

In [33]:

# First-level clusters to inspect; stored labels are shifted by one before
# being compared with these ids.
select_l1= [1]

# Row positions whose shifted first-level label is selected.
selected1= [
    row
    for row in range(Distances.shape[0])
    if (data[Dr]['labels_l1'][row] + 1) in select_l1
]

# Reference statistics of those rows, keeping only rows whose column 1 is positive.
meansVars= Ref_stats[selected1,:]
trim= [pos for pos, stats_row in enumerate(meansVars) if stats_row[1] > 0]
meansVars= meansVars[trim,:]

# Map the kept positions back to row positions in Distances.
select_trim= [selected1[pos] for pos in trim]
sel_d= Distances[select_trim,:]


In [34]:

# Number of rows carrying each distinct first-level label.
print([
    sum(1 for label in data[Dr]['labels_l1'] if label == y)
    for y in list(set(data[Dr]['labels_l1']))
])

# Within the selected first-level clusters: rows per second-level label 0..8.
print([
    sum(
        1
        for row in range(Distances.shape[0])
        if data[Dr]['labels_l1'][row] + 1 in select_l1
        and data[Dr]['labels_l2'][row] == y
    )
    for y in range(9)
])

[824, 1069, 169, 1339, 1283, 586, 195, 58, 310, 259]
[165, 127, 15, 43, 123, 32, 121, 198, 0]


In [35]:
####
from targetedDistance_plot_tools import plot_clust_dist_vectors

# Settings forwarded to plot_clust_dist_vectors. The call now uses these
# names; it previously repeated the literals, so editing them had no effect.
N= 400
P= 40

N_view= 15

# First-level cluster(s) to display.
select_l1= [6]


plot_clust_dist_vectors(select_l1,N_view= N_view,N= N, P= P,
                           Distances= Distances, Centre_distances= Centre_distances, 
                            coeff_list= coeff_list,
                            const_list= const_list,
                            data= data, Fst= False, Dr= Dr)

586 of clusters selected



plotly.graph_objs.ColorBar is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.marker.ColorBar
  - plotly.graph_objs.surface.ColorBar
  - etc.




### Under development

In [37]:
#### 3D representation: only possible when all 3 clusters required are present

# For each of the three plot axes: the first-level label(s) accepted for it,
# each mapped to a list of the integers 0..8.
# NOTE(review): other cells compare `labels_l1 + 1` with the selection, while
# the keys below are matched against the raw label -- confirm the offset.
Axis= {
    1: {1: list(range(9))},
    2: {3: list(range(9))},
    3: {5: list(range(9))},
}

# Classes[window start][first-level label] -> row position in Slice.
Classes= recursively_default_dict()
windows= list(Slice.start)

for w in range(len(windows)):
    Classes[windows[w]][data[Dr]['labels_l1'][w]]= w

# Keep the windows in which every axis has at least one accepted label present.
Possibilities= [
    wind
    for wind in Classes.keys()
    if all(
        any(z in Classes[wind] for z in Axis[a].keys())
        for a in Axis.keys()
    )
]

print('{} windows identified'.format(len(Possibilities)))

1 windows identified


In [38]:
# `import scipy` alone does not guarantee that the stats submodule is loaded.
import scipy.stats

# Bandwidth search for the 3-D kernel density estimate.
params = {'bandwidth': np.linspace(.4,1,30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)

threshold= .001
coordinates= recursively_default_dict()
Cdist_dens= []
P= 60
range_distances= [np.percentile(Distances,1),np.percentile(Distances,99),400]

# Regular P x P x P grid between the 1st and 99th percentile of Distances.
grid_axis= np.linspace(range_distances[0],range_distances[1],P)
i_coords, j_coords, z_coords = np.meshgrid(grid_axis,grid_axis,grid_axis,indexing= 'ij')

# Index triples in the same order as the rows of `background`.
traces= list(it.product(range(P),repeat= 3))

# One row per grid point. With 'ij' indexing a C-order ravel walks the grid in
# exactly the order of `traces`, so this replaces the per-point Python lookup.
background= np.column_stack((i_coords.ravel(),j_coords.ravel(),z_coords.ravel()))


for karl in Possibilities:

    # The per-window 1-D KDEs of Distances / Centre_distances that used to sit
    # here as a disabled string literal were removed; Cdist_dens stays empty.

    ## In case more than one axis related profile is found: average them.
    mean_axis= {
        axium: np.mean(
            [Distances[Classes[karl][profile],:] for profile in Classes[karl].keys() if profile in Axis[axium]],
            axis= 0)
        for axium in [1,2,3]
    }

    # Rows are the entries of the Distances profiles, columns are axes 1, 2, 3.
    datum= np.array([mean_axis[a] for a in (1,2,3)]).T

    grid.fit(datum)
    kde = grid.best_estimator_

    P_dist= kde.score_samples(datum)
    scores= kde.score_samples(background)

    # Express each grid score as a normal CDF value relative to the scores of
    # the fitted points themselves.
    scores= scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(scores)

    # Single mask reused for both the coordinates and their scores.
    keep= scores >= threshold
    primitive= background[keep,:]

    coordinates[karl]= {
        'data':primitive,
        'scores': list(scores[keep])
    }


Cdist_dens= np.array(Cdist_dens)


In [39]:
# List the windows for which a thresholded 3-D grid was stored.
coordinates.keys()

dict_keys([9997746])

In [40]:
# Window (a Slice.start value) whose retained grid points are drawn; it must
# be one of the keys of `coordinates`.
selected_window= 9997746

# Kept under its own name: rebinding `data` here would replace the pickled
# analysis dict that earlier cells index as data[Dr], breaking any re-run.
cloud_points= coordinates[selected_window]['data']


fig_data= [go.Scatter3d(
        x = cloud_points[:,0],
        y = cloud_points[:,1],
        z = cloud_points[:,2],
        mode= "markers",
        marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
      "opacity": .8
      },
    )]



layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=fig_data, layout=layout)

iplot(fig)

In [8]:

def jumpingJack(x):
    """Draw the retained 3-D grid points stored for one window.

    Parameters
    ----------
    x : key of ``coordinates``
        Entry whose ``'data'`` array is plotted; columns 0, 1 and 2 are used
        as the x, y and z coordinates of a marker cloud.

    The figure is displayed with ``iplot``; nothing is returned.
    """
    points= coordinates[x]['data']

    # go.Scatter3d already fixes the trace type, so no `type` argument is passed.
    fig_data= [go.Scatter3d(
            x = points[:,0],
            y = points[:,1],
            z = points[:,2],
            mode= "markers",
            marker= {
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
          "opacity": .8
          },
        )]

    # Fixed x/y limits. `domain` only accepts fractions of the plot area in
    # [0, 1], and the 2-D layout.xaxis / layout.yaxis do not govern Scatter3d
    # traces, so the limits are given as `range` on the scene axes instead.
    # NOTE(review): [-7, 7] presumably targets scaled distances -- confirm it
    # covers the values being plotted.
    axis_range= [-7,7]

    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        ),
        scene= dict(
            xaxis= dict(
                autorange=False,
                range= axis_range
            ),
            yaxis= dict(
                autorange=False,
                range= axis_range
            )
        )
    )
    fig = go.Figure(data=fig_data, layout=layout)
    iplot(fig)

# Offer every stored window as a choice for the plot.
interact(jumpingJack, x=list(coordinates.keys()))

<function __main__.jumpingJack>