In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)




## Application to rice data.

All of this is interesting for the possibilities it opens of inferring parameters based on real data. Here we will explore applying these algorithms to rice genetic data (_Oryza sativa L._). Data is stored in vcf format. We will read and subset this large-ish data set (>3000 individuals, 15k markers.) to a size that my PC can run locally. We then split the data into windows of a set size. 


We will perform analysis on one population at one of those windows. Choice of population is based on prior knowledge, but a global PCA of the data set is displayed to help the user select groups based on that. I chose a very small pop (10 haps) to make the application possible. 

In [2]:
from structure_tools.vcf_geno_tools import read_geno_nanum

Home= 'vcf/'
Chr= 8
filename= Home + 'Extract_Chr{}_15000.vcf'.format(Chr)

row_info= 6
header_info= 9
phased= False

genotype, summary, Names= read_geno_nanum(filename, row_info= row_info, header_info= header_info,phased= phased)

print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))


{'fileformat': 'VCFv4.2', 'fileDate': '20190327', 'source': 'PLINKv1.90', 'contig': '<ID8,length28422468>', 'INFO': '<IDPR,Number0,TypeFlag,Description"Provisional reference allele, may not be based on real reference genome">', 'FORMAT': '<IDGT,Number1,TypeString,Description"Genotype">'}
Number of markers: 15000
Number of individuals: 3023


In [7]:
## read passport information

Input_file= '3K_info.txt'

RG_info= pd.read_csv('3K_info.txt',sep= '\t')

RG_info.head()

Unnamed: 0,IRIS_ID,NAME,Variety_Name_verif,COUNTRY,REGION,K9_cluster,Initial_subpop
0,B001,HEIBIAO,Heibiao,China,As6,GJ-tmp,temp
1,B002,SANSUIJIN,Sansuijin,China,As6,GJ-tmp,temp
2,B003,ZAOSHENGBAI,Zaoshengbai_,China,As6,GJ-adm,japx
3,B004,QIUGUANGTENGXI_104,Qiuguangtengxi_104_,Japan,As7,GJ-tmp,temp
4,B005,WANSHI,Wanshi,Japan,As7,GJ-tmp,temp


In [8]:
## Process Names vcf names.
## Instance specific processing due to ID copy in VCF file.

for x in range(len(Names)):
    ind= Names[x]
    newid= ind.split('_')
    if len(newid) > 2:
        newid= '_'.join(newid[:2])
    else:
        newid= newid[0]
    
    Names[x]= newid



In [9]:
from structure_tools.vcf_geno_tools import geno_subset_random

Sn= 500
Sm= 11000

ID_col= 'IRIS_ID'
subset_col= 'Initial_subpop'

code= {
    'ind1A':0,
    'ind1B':0,
    'ind2':0,
    'ind3':0,
    'aus':1,
    'temp':2,
    'trop':2,
    'subtrop':2,
    'aro': 3,
    'admx': 4
}


others= 'admx'

gen_sample, subsummary, code_vec, code_lib, Nsample, Msample= geno_subset_random(genotype,summary, RG_info, ID_col,subset_col, Names,code=code, Sn= Sn, Sm= Sm)

color_groups= ['red','yellow','blue','green','purple','black','silver','silver','red3','deepskyeblue','navy','chartreuse','darkorchid3','goldenrod2']


gen_sample shape: 500, 11000


### B. Global variation

#### i. PCA

Perform PCA across data set.

Perform Mean shift clustering to attempt to extract genetically coherent groups of accessions.

These can later be used for supervised analysis.

In [10]:

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

## Perform PCA
n_comp= 5
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(gen_sample)

## perform MeanShift clustering.
bandwidth = estimate_bandwidth(feats, quantile=0.15)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=True, min_bin_freq=15)
ms.fit(feats)
labels1 = ms.labels_
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}

###
from structure_tools.Tutorial_subplots import plot_global_classes


plot_global_classes(feats,
                    code_lib,
                    label_select,
                    color_groups,
                    PCA_color_ref,
                    title_I= 'IRRI class',
                    title_II= 'Mean_shift',height= 400, width= 950)


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



### Chose references

In [11]:
references= ['Local','External']

chose_refs= 1

ref_chosen= references[chose_refs]

if ref_chosen== 'Local':
    ref_dict= label_select
    ref_vector= labels1

if ref_chosen== 'External':
    ref_dict= code_lib
    ref_vector= code_vec

### Split data into windows.

`window_size` is determined in SNPs, not physical extent. `Steps` determines the number of SNPs separating the initial SNP across consecutive windows.

In [12]:
from structure_tools.vcf_geno_tools import geno_window_split
##### 
window_size= 50
Steps= 14

Windows, Out= geno_window_split(gen_sample,
                                subsummary,
                                Steps= Steps,
                                window_size=window_size)

print('number of chromosomes: {}'.format(len(Windows)))
print('number of windows: {}'.format(sum([len(Windows[x].keys()) for x in Windows.keys()])))


number of chromosomes: 1
number of windows: 784


### Window selection.

We will work on a single window for the purpose of this notebook. In the next block the window is selected at random from those generated above. Homozygous ancestral and derived SNPs are recoded as 1, heterozygotes are set to the ancestral SNP.

In [33]:
### Chose a single window to work on :

some_windows= np.random.choice(list(Windows[8].keys()),5)
some_windows

wind_select= np.random.choice(list(Windows[8].keys()),1)[0]
#wind_select= 24870800


data_window= list(Windows[Chr][wind_select])
data_window= np.array(data_window)

#data_w= data_w[code_lib[popA]]
data_window[data_window==1]= 0
data_window[data_window==2]= 1


### Sample selection

The 3K RG is an extensive data set. Already we have subsetted this data set to a smaller, more manageable number of accessions (see above). However, for coalescence we really want to trim down those numbers.

Two options are available bellow:

- `random`: a number (`max_sample`) of accessions are selected at random.
- `ms`: sample a number `Ng` of accessions from clusters estimated using the mean shift algorithm.

It is also possible to choose the ancestor sequence to coalesce to from among the available samples by setting `Anc_pop_1` to True. This option relies on the global clustering displayed in the second section above. One random haplotype is selected from the global group `mrca_pop`. All haplotypes are recoded integer boolean vectors of marker difference. 

In [36]:
from structure_tools.Coal_index import get_config
# max number of samples from this pop
max_sample= 12

# windows. by snp.
# population. Use label in PCA plot in function of label source selected.
popA= 0

sub_sel_method= 'ms'
Ng= 2
Anc_pop_1= False
mrca_pop= 0
#####
#####

### rand sample
if sub_sel_method == 'rand':
    vec_samp= np.random.choice(list(range(data_window.shape[0])),max_sample,replace= False)

if sub_sel_method == 'ms':
    ## mean shift sample:
    ## Perform PCA
    n_comp= 4
    pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

    featsw= pca.fit_transform(data_window)

    ## perform MeanShift clustering.

    bandwidth = estimate_bandwidth(featsw, quantile=0.2)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=15)
    ms.fit(featsw)
    labels_local = ms.labels_
    label_local_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1)))}
    
    vec_samp= [list(np.random.choice(label_local_select[z],Ng)) for z in label_local_select.keys()]
    vec_samp= list(it.chain(*vec_samp))
    
    mrca_idx= np.random.choice(label_local_select[mrca_pop],1)[0]
    mrca_hap= list(data_window[mrca_idx])
    
    #data_window[mrca_idx]= np.zeros(len(mrca_hap))
        

#######

vec_samp= sorted(vec_samp)
data_w= data_window[vec_samp,:]

if Anc_pop_1:
    
    data_w= [[int(z[x] != mrca_hap[x]) for x in range(len(mrca_hap))] for z in data_w]
    if mrca_idx in vec_samp:
        data_w[vec_samp.index(mrca_idx)]= [0] * len(mrca_hap)
        
    data_w= np.array(data_w)

print(data_w.shape)

dataT= data_w

nsamp= dataT.shape[0]

config_dataw, hap_str= get_config(dataT,nsamp)


hap_sol= list(hap_str.keys())
hap_sun= np.array([np.array(list(x),dtype= int) for x in hap_sol])

hap_size= [len(hap_str[x]) for x in hap_sol]
hap_size= {z:[x for x in range(len(hap_size)) if hap_size[x] == z] for z in list(set(hap_size))}



passing= hap_size.keys()
pack= list(it.chain(*[hap_size[x] for x in passing]))
passport= list(it.chain(*[[x]*len(hap_size[x]) for x in passing]))

pack= [[pack[x],passport[x]] for x in range(len(pack))]
pack= sorted(pack)
pack= np.array(pack)

Dict_mat= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }



(18, 50)


### Coalescent 

Create the coalescent to the ancestor from the observed data. 

In [37]:
from structure_tools.Coal_index import Inf_sites

point_up= recursively_default_dict()

point_dn= recursively_default_dict()

root_lib, point_up = Inf_sites(Dict_mat,point_up,layer_range= 36,sub_sample= 0,poppit= False)

layer: 0; len: 2
layer: 1; len: 6
layer: 2; len: 17
layer: 3; len: 32
layer: 4; len: 48
layer: 5; len: 63
layer: 6; len: 74
layer: 7; len: 79
layer: 8; len: 81
layer: 9; len: 83
layer: 10; len: 85
layer: 11; len: 86
layer: 12; len: 83
layer: 13; len: 75
layer: 14; len: 63
layer: 15; len: 48
layer: 16; len: 33
layer: 17; len: 21
layer: 18; len: 13
layer: 19; len: 8
layer: 20; len: 6
layer: 21; len: 4
layer: 22; len: 3
layer: 23; len: 1
layer: 24; len: 1
time elapsed: 0.71 s


### Theta.

In [38]:
from structure_tools.Coal_probab import Ascent_return, tree_ascent
from structure_tools.Coalesce_plots import plot_rec_InfSites

func_names= ['tree_construct']
funcs= [
        Ascent_return      # runUp_balance # tree_construct
       ]

range_theta= np.linspace(0.01,10,30)

plot_rec_InfSites(point_up,root_lib,funcs,func_names,range_theta,height= 500)

This is the format of your plot grid:
[ (1,1) x1,y1 ]



### Estimating time

Here we will use the coalescent structure to determine time, in generations: of the first probable occurence of a given haplotype. We then move to varying theta in time.

#### Constant theta

In [39]:
from structure_tools.Coal_tools import tree_descent_gen
from structure_tools.Coal_probab import prob_coal, prob_mut

mut_rate= 9.5e-9
Theta= 1.38

Nt= Theta / (mut_rate * 4)

print('estimate Ne: {}'.format(int(Nt)))

sink= max(root_lib.keys())

if 0 not in root_lib[sink].keys():
    while 0 not in root_lib[sink].keys():
        sink -= 1

node_weigths, paths_reverse, node_bins, paths_vector = tree_descent_gen(root_lib,point_up,sink,Theta= Theta,mu= mut_rate)

average_gen= np.mean(paths_vector)
var_gen= np.std(paths_vector)



estimate Ne: 36315789


In [41]:
from structure_tools.Coalesce_plots import plot_InfSites_gens

Anc_poss= root_lib[sink][-2]

hap_frame, fig_gens= plot_InfSites_gens(Anc_poss,point_up,root_lib,range_theta,Theta= Theta,height= 500,width= 900)

iplot(fig_gens)

In [42]:
hap_frame

Unnamed: 0,hap_id,hap,t
0,0,0000000000000000000000001000000000000000000000...,11500.0
1,1,0000000000000000000000000000000000000000000000...,17055.556
2,2,0000000000100000000000001000000000010000000000...,17055.556
3,3,1000000000000000000000000000000000000000000000...,17055.556
4,4,0000000000000000000000001000000000011000010000...,22611.111
5,5,0000000000000000000000001000000000010000010000...,22611.111
6,6,0000000000000000000000001000000000011000000000...,22611.111
7,7,0000000000000000000000001000000000011000010000...,22611.111
8,8,0000000000000000000000001000000000010000000000...,28333.333
9,9,0000000000000000000000001000000000010000010000...,34222.222



### Theta in time

In [96]:
n = 2
r = np.linspace(0.1,max_theta,theta_gaps)
grid=np.array(np.meshgrid(*[r]*n)).T.reshape(-1,n)

for i in range(0,n-1):
    grid = np.array([g for g in grid if g[i]<g[i+1]])


Ngaps= 4
theta_gaps= 5
range_theta= np.linspace(0.1,max_theta,theta_gaps)
ranges_theta= [np.linspace(0.1,max_theta,theta_gaps) for x in range(Ngaps)]

permuts= np.array(np.meshgrid(*ranges_theta)).T.reshape(-1,Ngaps)


(625, 4)

In [97]:
from structure_tools.Coal_index import theta_time, theta_function, tree_ascent_times

import random

mut_rate= 9.5e-9
max_theta= 10
Ngaps= 6
theta_gaps= 4
range_theta= np.linspace(0.1,max_theta,theta_gaps)
ranges_theta= [np.linspace(0.1,max_theta,theta_gaps) for x in range(Ngaps)]

max_time= np.median(hap_frame.t) * max_theta / (Theta * 2)
print('maximum time in function: {}'.format(round(max_time, 3)))

permut_max= 500

###
sink= max(root_lib.keys())

if 0 not in root_lib[sink].keys():
    while 0 not in root_lib[sink].keys():
        sink -= 1

###
permuts= np.array(np.meshgrid(*ranges_theta)).T.reshape(-1,Ngaps)

print(len(permuts))
    
if len(permuts) > permut_max:
                
    chose_some= np.linspace(0.1,len(permuts)-1,permut_max)
    #chose_some= sorted(chose_some)
    chose_some= np.array(chose_some,dtype= int)
    
    permuts= [permuts[x] for x in chose_some]

print(len(permuts))
####

Theta_record= recursively_default_dict()

for combo in range(len(permuts)):
    
    combi= tuple(permuts[combo])
    
    theta_array= theta_time(list(combi),max_time,Ngaps)
    
    node_weigths, paths_backward, node_times = tree_ascent_times(root_lib,point_up,sink,
                                                                 mu= 9e-8,theta_time_array= theta_array)
    
    Theta_record[combi]= {
        'probs': node_weigths[sink][0],
        'times': node_times,
        'comb': theta_array
    }



maximum time in function: 81924.315
4096
500


In [98]:

from sklearn.neighbors import KernelDensity

probs_keys= list(Theta_record.keys())
probs_vector= [Theta_record[th]['probs'] for th in probs_keys]
probs_vector= np.array(probs_vector).reshape(-1,1)

Z= (probs_vector - np.mean(probs_vector)) / np.std(probs_vector)

bandwidth = estimate_bandwidth(Z, quantile=0.2, n_samples=500)

X_plot = np.linspace(-2, 8, 100)[:, np.newaxis]

kde_plot = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(Z)
log_dens = kde_plot.score_samples(X_plot)

fig_dens_I= [go.Scatter(x=X_plot[:, 0], y=np.exp(log_dens),
                            mode='lines', fill='tozeroy',
                            line=dict(color='#AAAAFF', width=2))]

layout= go.Layout(
    title= 'max {}, z: {}'.format(max(probs_vector)[0], max(Z)[0])
)

Figure= go.Figure(data= fig_dens_I, layout= layout)
iplot(Figure)


### PCA optimization

This approach is based on the observation that similar vectors of theta are correlated linearly. This implies that vectors close to an optimum combination of values should cluster together in PCA space. See Figure bellow. 

We estimate the kernel density estimation in feature space of vectors with high probability to sample from. Parameter vectors are recovered using the function `PCA.inverse transformation`. This should introduce some variation that might bring us closer to an optimum parameter set. The plot bellow includes inverse transformed obsevations in orange. 

The algorithm proposed performs runs of dimensionality reduction, selection of higher probability and sampling from the inferred space to create new parameter vector data sets.


In [99]:
from sklearn.preprocessing import scale
from structure_tools.Coalesce_plots import theta_PCAms_plot 

data_combs= [x for x in probs_keys]
data_combs= np.array(data_combs)

N_samp= 50
n_comp= 5


Figure, new_data, feats_combs, pca_feat, Z_chose= theta_PCAms_plot(data_combs,Z,
                                                          N_samp= N_samp,n_comp= n_comp,kernel= 'gaussian')

(15, 5)


In [100]:
iplot(Figure)

### Iterate PCA selection 

In [102]:
from structure_tools.coal_thetTime_opt import pca_optimize
from structure_tools.Coalesce_plots import PCA_sumplot

###
from sklearn.neighbors.kde import KernelDensity
from sklearn.model_selection import GridSearchCV
##

N_samp= 350
Ncomps= 6
Nlayers= 50
up_t= 15
kernel= 'gaussian'

prob_mean, prob_median, prob_sd, pca_theta, pca_record= pca_optimize(feats_combs,data_combs,Z,pca_feat,
                                                                     root_lib,point_up,sink,
                                                                     N_samps= N_samp,
                                                                     Nlayers=Nlayers,max_time= max_time, 
                                                                     Ngaps= Ngaps,Ncomps= Ncomps,
                                                                    kernel=kernel)



(20, 5)


### Summary stats

In [106]:
run_stats= [
    prob_mean, prob_median, prob_sd
]
stats_names= ['mean','median','sd']

run_stats= np.array(run_stats).T

fig_stats= [
    go.Scatter(
        x= list(range(run_stats.shape[0])),
        y= run_stats[:,i],
        mode= 'lines',
        name= stats_names[i]
    ) for i in range(run_stats.shape[1])
]

layout= go.Layout(
    title= 'PCA optimization run stats. p-values by run',
    yaxis= dict(title='mean, median, sd'),
    xaxis= dict(title= 'run')
)

Figure= go.Figure(data= fig_stats, layout= layout)

iplot(Figure)

In [104]:

def plot_thetatime(pca_record,max_time= 4e5):
    fig_best_times= []
    from structure_tools.Coal_index import theta_time, theta_function

    for combi in pca_record.keys(): 
        if len(pca_record[tuple(list(combi))]['comb']):
            x_plot= np.linspace(1,max_time, 100)

            y_plot= [theta_function(x, pca_record[tuple(list(combi))]['comb']) for x in x_plot]

            fig= go.Scatter(
                x= x_plot,
                y= y_plot,
                mode= 'lines',
                name= 'prob: {}'.format(round(pca_record[tuple(list(combi))]['probs'], 5))
            )

            fig_best_times.append(fig)

    layout= go.Layout(
        title= 'best_times',
        xaxis= dict(title= 'generations'),
        yaxis= dict(title= 'theta')
    )

    Figure= go.Figure(data= fig_best_times, layout= layout)
    iplot(Figure)



In [105]:
from structure_tools.Coalesce_plots import plot_thetatime

plot_thetatime(pca_record,max_time= max_time)

### Trees and Graphs

In [107]:
from structure_tools.tree_graph_tools import Gus_get_phylo, Gus_phylo_test

data_phyl= dataT

phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(data_phyl)

print('has tree: {}'.format(phylo_bool))

Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)

has tree: True


In [108]:
### let's visualize the tree
### plotting a network because igraph is not installed.

from structure_tools.Coalesce_plots import plot_phyl_net

node_list= list(range(-1,Mp.shape[1]))
root= True
nodes_as_seqs= True

plot_phyl_net(data_phyl,leaves,node_list,edges,
              nodes_as_seqs= nodes_as_seqs,root= root)


## Coalescent tree

It is interesting to note that if we take all the haplotypes that are generated by a coalescent model, from present data to the root, then this data set is unlikely to have a phylogenetic tree:

In [109]:
sink= max(root_lib.keys())

if 0 not in root_lib[sink].keys():
    while 0 not in root_lib[sink].keys():
        sink -= 1

data_phyl= dataT
phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(data_phyl)

print('has tree: {}'.format(phylo_bool))

has tree: True


In [110]:
Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)

node_list= list(range(-1,Mp.shape[1]))
root= True
nodes_as_seqs= True

plot_phyl_net(data_phyl,leaves,node_list,edges,
              nodes_as_seqs= nodes_as_seqs,root= root)

In [116]:
##
from structure_tools.tree_graph_tools import tree_descent_net

nodes, edges, leaves, node_code= tree_descent_net(root_lib,point_up,sink,init= [0])

from structure_tools.Coalesce_plots import get_ori_graph

present= True
nodes_as_seqs= True
root= True



get_ori_graph(root_lib,edges,node_list,leaves,present= present,
                                            nodes_as_seqs= nodes_as_seqs,
                                            root= root)


[0, 1, 2, 7, 11]


In [119]:
gp_codeName

['Indica', 'cAus', 'Japonica', 'cBasmati', 'Admix']

In [120]:
#### multiple populations
### local labels


from structure_tools.Coalesce_plots_II import node_to_pca_plot

node_list= sorted(list(set(it.chain(*edges))))
n_comp= 5

gp_codeName= ['Indica','cAus','Japonica','cBasmati','Admix']

fig_net_pc= node_to_pca_plot(data_window,root_lib, leaves, mrca_hap, node_list, ref_dict,
                             color_groups=color_groups,present= present, root= root,
                             gp_codeName= gp_codeName,n_comp= n_comp)



iplot(fig_net_pc)


[0, 1, 2, 7, 11]


### Finite sites map. 
###### i.e. cheating because this could be admixture. 


The steps are repeated from above without the commentary in between. 

re-iterating, the steps are: 

 - Select window. Chose ancestor from global groups. or not, pay attention to reference genome. Sample haplotypes randomly or using mean shift. 
    
 - Construct coalescent index of unique haplotypes. Do not add to coalesced haplotype if already existing. Allow for the mutation of non-singleton alleles selecting for minimum frequency.
    
 - draw resulting graph. 
    
    

In [121]:
from structure_tools.vcf_geno_tools import get_window

wind_select= np.random.choice(list(Windows[Chr].keys()),1)[0]
#wind_select= 24870800
popA= 0

sub_sel_method= 'ms'
max_sample= 50
Ng= 3
Anc_pop_1= False
mrca_pop= 1
unique_haps= True
#####
#####

pack, hap_sun, dataT, data_window= get_window(wind_select, Windows, Chr= Chr, sub_sel_method= 'ms',
                                   max_sample= max_sample,
                                    Ng= Ng,
                                    Anc_pop_1= Anc_pop_1,
                                    mrca_pop= mrca_pop,
                                  unique_haps= unique_haps)


(24, 50)
(8, 50)


In [122]:
from structure_tools.Finite_sites_coalescence import Inf_sites_graph


Dict_mat_Fin= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }

point_up= recursively_default_dict()


if dataT.shape[0] > 1:
    
    root_lib, point_up = Inf_sites_graph(Dict_mat_Fin,point_up,layer_range= 36)

    sink= max(root_lib.keys())

    if 0 not in root_lib[sink].keys():
        while 0 not in root_lib[sink].keys():
            sink -= 1
    
    #data_phyl= dataT

    phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(dataT)
    node_list= list(range(-1,Mp.shape[1]))
    
    print('has tree: {}'.format(phylo_bool))

    #Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)
    
    noddy, edges, leaves, node_code= tree_descent_net(root_lib,point_up,sink,init= [0])
    
    node_list= list(it.chain(*[list(x) for x in edges]))
    
    #from structure_tools.Coalesce_plots import plot_phyl_net

    present= True
    nodes_as_seqs= True
    root= True
    if -1 not in node_list:
        root= False
        print('no root?')
    
    get_ori_graph(root_lib,edges,node_list,leaves,present= present,
                                                nodes_as_seqs= nodes_as_seqs,
                                                root= root)
    
    
    


layer: 0; len: 2
layer: 1; len: 3
layer: 2; len: 3
layer: 3; len: 1
layer: 4; len: 6
layer: 5; len: 4
layer: 6; len: 2
layer: 7; len: 6
layer: 8; len: 4
layer: 9; len: 1
layer: 10; len: 0
time elapsed: 0.024 s
has tree: False
[1, 2, 3, 4, 5, 6, 8, 10]


In [123]:

from structure_tools.Coalesce_plots import node_to_pca_plot

node_list= sorted(list(set(it.chain(*edges))))
n_comp= 5

gp_codeName= ['Indica','cAus','Japonica','cBasmati','Admix']

fig_net_pc= node_to_pca_plot(data_window,root_lib, leaves, mrca_hap, node_list,
                             ref_dict= ref_dict, color_groups= color_groups,
                             present= present, root= root,
                             gp_codeName= gp_codeName,n_comp= n_comp)



iplot(fig_net_pc)


[1, 2, 3, 4, 5, 6, 8, 10]
