In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)



## Trees and topologies.

Notes from _Genes Ggenealogies, Variation and Evolution_, chapter II, of the same name [1]. 

For tree test, construction and comparison were taken directly from the original work by Gusfield D. (1991). Further work on gene network is personal.



- Gusfield, D. (1991). Efficient algorithms for inferring evolutionary trees. Networks, 21(1), 19-28.

- Hein, J., Schierup, M., & Wiuf, C. (2004). Gene genealogies, variation and evolution: a primer in coalescent theory. Oxford University Press, USA.

## Gusfield 1991

### Algo I: test if hap matrix has phylo tree

In [2]:

def Gus_phylo_test(hap_array):
    
    ## get cols
    Tree= np.array(hap_array,dtype= str).T
    
    ### get binary of cols
    binary_t= [''.join(x) for x in Tree]
    binary_t= [int(x,2) for x in binary_t]
    
    ### get sort of col binary
    bin_sort= np.argsort(binary_t)[::-1]
    
    ### remove duplicates
    Tree_str= [''.join(x) for x in Tree]
    
    Mp= [Tree_str[x] for x in bin_sort if binary_t[x]]
    
    dup_try= [0,*[x for x in range(1,len(Mp)) if Mp[x] != Mp[x - 1]]]
    
    Mp= [Mp[x] for x in dup_try]
    
    ## get Mprime to original array col index for phyl construction later.
    Mp_similarity= {
        z: [x for x in range(len(Tree_str)) if Tree_str[x] == Mp[z]] for z in range(len(Mp))
    }
    
    Mp= [list(x) for x in Mp]
    
    ### get haps back cols sorted, nulls removed.
    Mp= np.array(Mp,dtype= int).T

    ## get all positive cells.
    valid_c= np.where(Mp == 1)
    where_one= [tuple([valid_c[0][x],valid_c[1][x]]) for x in range(len(valid_c[0]))]
    
    ## cells by row
    where_lib= [c[0] for c in where_one]
    where_lib= {
        z:[where_one[x] for x in range(len(where_lib)) if where_lib[x] == z] for z in list(set(where_lib))
    }
    
    # cells in the same row but previous col for every cell
    where_prime= [
        [z[1] for z in where_lib[cel[0]] if z[1] < cel[1]] for cel in where_one
    ]
    
    ## respective cols L(i,j)
    where_prime= [[[-1],x][int(len(x) > 0)] for x in where_prime]
    where_prime= [max(x) for x in where_prime]

    ## cells by col
    col_lib= [c[1] for c in where_one]
    
    ## largest smaller index by col
    col_lib= {
        z:[where_prime[x] for x in range(len(col_lib)) if col_lib[x] == z] for z in list(set(col_lib))
    }
    
    ## check how many different previous to larger col indeces by cell
    col_lib= {z:list(set(g)) for z,g in col_lib.items()}
    
    discovery= np.array([len(x) for x in col_lib.values()])
    
    ## According to Gusfield:
    '''
    - Check whether L(i,J) = Lo') for every cell (ij) E 0. If so. then M has
    a phylogenetic tree: otherwise. M does not have one.
    '''
    
    ## i'm interpreting this as: all previous to last col indeces must be same by col index. 
    
    discovery= np.where(discovery > 1)[0]
    discovery= len(discovery) == 0
    return discovery, Mp, col_lib, Mp_similarity


Gus_data= [
    [1,1,0,0,0],
    [0,0,1,0,0],
    [1,1,0,0,1],
    [0,0,1,1,0],
    [0,1,0,0,0]
]
Gus_data= np.array(Gus_data)

Gus_data_F= [
    [1,1,0,0,0],
    [0,0,1,0,1],
    [1,1,0,0,1],
    [0,0,1,1,0],
    [0,1,0,0,1]
]
Gus_data_F= np.array(Gus_data_F)


windows_test= [
    Gus_data, Gus_data_F
]

Expct_title= ['has phyl tree', 'does not have phyl tree']

for wind in range(len(windows_test)):
    
    phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(windows_test[wind])
    
    phylo_bool
    
    print('T= {}; R= {}'.format(Expct_title[wind],phylo_bool))

T= has phyl tree; R= True
T= does not have phyl tree; R= False


### Algo II: Construct phylogenetic tree


In [3]:
def Gus_get_phylo(Mp,col_lib,Mp_similarity):
    node_edges= recursively_default_dict()
    nodes_all=[]
    
    node_edges[-1]= {}

    root= {}

    tree_nodes= recursively_default_dict()
    
    leaves= recursively_default_dict()

    for col,L in col_lib.items():
        
        node_edges[L[0]][col]= Mp_similarity[col]
    
    
    for ri in range(Mp.shape[0]):
        row= Mp[ri]

        ci= np.where(row == 1)[0]

        if len(ci):
            ci= max(ci)
            edges= [tuple([z,ci]) for z in node_edges.keys() if ci in node_edges[z]]

            for ed in edges:
                leaves[ed[1]][ri]= 1

        else:
            leaves[-1][ri]= 1

    leaves= {
        z: list(leaves[z].keys()) for z in leaves
    }

    ## because this format might prove more useful later
    edges= [[(x,z) for z in node_edges[x]] for x in node_edges.keys()]
    edges= list(it.chain(*edges))

    return node_edges, leaves, edges


In [4]:
### example from figure 2.10 (GGVE). 

dataT= [
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
]

dataT= np.array(dataT)
data_phyl= dataT

phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(data_phyl)

print('has tree: {}'.format(phylo_bool))

Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)

has tree: True


In [5]:
### let's visualize the tree
### plotting a network because igraph is not installed.

from structure_tools.Coalesce_plots import plot_phyl_net

node_list= list(range(-1,Mp.shape[1]))
root= True
nodes_as_seqs= True

plot_phyl_net(data_phyl,leaves,node_list,edges,
              nodes_as_seqs= nodes_as_seqs,root= root)


### Algo III. Check for tree compatibility

The third algorithm proposed by Gusfield (1991). Return combination of trees if possible, incompatible otherwise.

Final algorithm entails several co-factors.

- `get_obj_key`: generate dictionary of leaf nodes by sequence object (to get number and ID of objects on same leafs).

- `get_back_graph`: returns backwards dictionary graph of given tree.

- `get_nodeLeaves`: number of descendents for each node.

- `get_subTree`: get all elements below a given node (recursive, perhaps unsuitable for large trees).


- `tree_dat`: get and store the information produced by the above functions for two data sets.


Finally, `get_updated_tree` takes the tree_dat dictionnary of tree intel and looks for nested branches as described in Gusfield (1991).


In [6]:

def get_obj_key(leaves,data_phyl):
    
    obj_w= {
        x: [z for z in leaves.keys() if x in leaves[z]][0] for x in range(data_phyl.shape[0])
    }
    
    obj_w= {
        x: tuple([obj_w[x], len(leaves[obj_w[x]])]) for x in obj_w.keys()
    }
    
    return obj_w


In [7]:

def get_back_graph(Tree):
    
    rev_dict= recursively_default_dict()
    
    for nd in Tree.keys():
        for dt in Tree[nd]:
            rev_dict[dt][nd]= 0
    
    rev_dict= {z: list(rev_dict[z].keys()) for z in rev_dict.keys()}
    
    return rev_dict


In [8]:

def get_nodeLeaves(back_tree,leaves):

    ## get leaf nodes
    leaf_nds= [x for x in leaves.keys() if len(leaves[x])]
    
    ## root boolean
    froot= False
    
    ## edg_store
    Nstore= {
        z: list(leaves[z]) for z in leaf_nds
    }
    
    current_list= list(leaf_nds)
    current_list= [x for x in current_list if x != -1]
    
    while not froot:
            
            if len(current_list) == 0:
                froot= True
                continue
            
            new_layer= []
            
            for nd in current_list:
            
                for inc in back_tree[nd]:
                    if inc not in Nstore.keys():
                        Nstore[inc]= Nstore[nd]
                    else:
                        Nstore[inc].extend(Nstore[nd])
                    
                    if inc >= 0:
                        new_layer.append(inc)
            
            current_list= list(set(new_layer))
    
    return Nstore


In [9]:

def get_subtree(Tree,node,subT= {}):
    
    if node not in Tree.keys():
        return subT
    
    subT[node]= Tree[node]
    
    for nd in Tree[node].keys():
        get_subtree(Tree,nd,subT)
    
    return subT


In [10]:

def tree_dat(dataMlist):
    dataM= {}
    
    for ds in range(len(dataMlist)):
        
        data_m= dataMlist[ds]
        
        phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(data_m)

        print('data set {} {} tree.'.format(ds,['does not have','has'][int(phylo_bool)]))
        
        if phylo_bool:

            Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)

            total_nodes= list(set(list(it.chain(*edges))))
            total_nodes= sorted(total_nodes)

            ## get backward tree
            back_tree= get_back_graph(Tree)

            ## get node_weights
            Nweights= get_nodeLeaves(back_tree,leaves)

            ## leaf by object
            obj_to_leaf= get_obj_key(leaves,data_m)
            
            dataM[ds]= {
                'tree': Tree,
                'Rtree': back_tree,
                'edges': edges,
                'leaves': leaves,
                'obj': obj_to_leaf,
                'desc': Nweights
            }
    
    return dataM


In [11]:

def get_updated_tree(TreeH,data_sets):
    for j in range(len(TreeH)):
        tj= j
        tk= 1 - j

        obj_list= list(TreeH[tj]['obj'].keys())

        for obj in obj_list:

            objn= TreeH[tj]['obj'][obj][0]
            objn_nei= TreeH[tj]['desc'][objn]
            objw= TreeH[tj]['obj'][obj][1]

            if objw > TreeH[tk]['obj'][obj][1]:

                ndk= TreeH[tk]['obj'][obj][0]
                ldk= TreeH[tk]['desc'][ndk]

                froot= False

                while not froot:
                    
                    if ndk == -1 and ldk < objw:
                        print('root of t.{} smaller than node {} in t.{}'.format(tk,obj,tj))
                        
                    if len(ldk) >= objw:

                        compare= [x for x in objn_nei if x in ldk]

                        if len(compare) < objw:
                            print('trees {}, {} not compatible'.format(tj,tk))

                            froot= True
                            return {}

                        else:
                            new_branch= get_subtree(TreeH[tk]['tree'],ndk)
                            froot= True

                    if not froot:
                        ndk= TreeH[tk]['Rtree'][ndk][0]
                        ldk= TreeH[tk]['desc'][ndk]


                extra_back= get_back_graph(new_branch)
                
                new_nodes= list(it.chain(*[list(new_branch[z].keys()) for z in new_branch.keys()]))
                new_nodes= [*list(new_branch.keys()),*new_nodes]
                new_nodes= sorted(new_nodes)

                new_leaves= [x for x in TreeH[tk]['leaves'] if x in new_nodes]

                new_leaves= {
                    z: TreeH[tk]['leaves'][z] for z in new_leaves
                }

                new_edges= [x for x in TreeH[tk]['edges'] if x[0] in new_nodes]


                
                ### replace and update
                if objn in TreeH[tj]['tree']:
                    TreeH[tj]['tree'].pop(objn)

                if objn in TreeH[tj]['leaves']:
                    TreeH[tj]['leaves'].pop(objn)

                TreeH[tj]['tree'].update(new_branch)
                TreeH[tj]['Rtree'].update(extra_back)
                TreeH[tj]['edges'].extend(new_edges)
                TreeH[tj]['leaves'].update(new_leaves)

                ###
                Nweights= get_nodeLeaves(TreeH[tj]['Rtree'],TreeH[tj]['leaves'])

                ## leaf by object
                obj_to_leaf= get_obj_key(TreeH[tj]['leaves'],data_sets[tj])

                ###
                TreeH[tj]['obj']= obj_to_leaf
                TreeH[tj]['desc']= Nweights
        
    return TreeH[tj]



In [12]:
new_col= np.array([[0,0,0,1,0]]).T
data_phyl_II= np.hstack((data_phyl,new_col))

data_sets= [data_phyl, data_phyl_II]


TreeDict= tree_dat(data_sets)


data set 0 has tree.
data set 1 has tree.


In [13]:
### the test of compatibility is expressed in the output of the function
### get_updated_tree: if empty then trees are incompatible. otherwise go.

get_updated_tree(TreeDict,data_sets)

{'tree': defaultdict(<function __main__.recursively_default_dict()>,
             {-1: {0: [0, 1], 2: [2]},
              0: defaultdict(<function __main__.recursively_default_dict()>,
                          {1: [3]}),
              2: defaultdict(<function __main__.recursively_default_dict()>,
                          {3: [4]})}),
 'Rtree': {0: [-1], 2: [-1], 1: [0], 3: [2]},
 'edges': [(-1, 0), (-1, 2), (0, 1), (2, 3)],
 'leaves': {0: [0], 1: [1], -1: [2], 3: [3], 2: [4]},
 'obj': {0: (0, 1), 1: (1, 1), 2: (-1, 1), 3: (3, 1), 4: (2, 1)},
 'desc': {0: [0, 1], 1: [1], -1: [2, 0, 4, 3, 0, 1, 4, 3], 3: [3], 2: [4, 3]}}

## B. Coalescent tree

It is interesting to note that if we take all the haplotypes that are generated by a coalescent model, from present data to the root, then this data set is unlikely to have a phylogenetic tree:

In [14]:
from structure_tools.Coal_index import get_config

### example from figure 2.10.

dataT= [
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
]

dataT= np.array(dataT)

nsamp= dataT.shape[0]

config_dataw, hap_str= get_config(dataT,nsamp)

hap_sol= list(hap_str.keys())
hap_sun= np.array([np.array(list(x),dtype= int) for x in hap_sol])

hap_size= [len(hap_str[x]) for x in hap_sol]
hap_size= {z:[x for x in range(len(hap_size)) if hap_size[x] == z] for z in list(set(hap_size))}



passing= hap_size.keys()
pack= list(it.chain(*[hap_size[x] for x in passing]))
passport= list(it.chain(*[[x]*len(hap_size[x]) for x in passing]))

pack= [[pack[x],passport[x]] for x in range(len(pack))]
pack= sorted(pack)
pack= np.array(pack)

Dict_mat= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }

point_up= recursively_default_dict()


### Indexing layers


In [15]:
from structure_tools.Coal_index import Inf_sites

root_lib, point_up = Inf_sites(Dict_mat,point_up,layer_range= 10,sub_sample= 0,poppit= False)


layer: 0; len: 2
layer: 1; len: 2
layer: 2; len: 3
layer: 3; len: 5
layer: 4; len: 5
layer: 5; len: 5
layer: 6; len: 4
layer: 7; len: 1
layer: 8; len: 1
time elapsed: 0.02 s


In [16]:
sink= max(root_lib.keys())

if 0 not in root_lib[sink].keys():
    while 0 not in root_lib[sink].keys():
        sink -= 1

data_phyl= root_lib[sink][-2]
phylo_bool, Mp, col_lib, Mp_similarity= Gus_phylo_test(data_phyl)

print('has tree: {}'.format(phylo_bool))

has tree: False


In [17]:
Tree, leaves, edges= Gus_get_phylo(Mp,col_lib,Mp_similarity)

node_list= list(range(-1,Mp.shape[1]))
root= True
nodes_as_seqs= True

plot_phyl_net(data_phyl,leaves,node_list,edges,
              nodes_as_seqs= nodes_as_seqs,root= root)

This is because the coalescent will explore parallel paths to an ancestry, so that not all haplotypes 
across paths are compatible with an infinite sites model. While the resulting network looks the same, by browsing over the nodes you will notice that one of the nodes holds two different sequences, one of which should in fact have been a new node and edge linking to the dual node. 

But there is an interest in reproducing the graph of all possible haplotypes. Especially if we have access to all possible ancestors. For example, if we wish to link up different networks, then we might want to hold on to hypothetical ancestral haplotypes to anchor new branches. The uncertainty might also be solved if one of the ambiguous haplotypes shows up in another data set. 

The next script creates the graph from root to modern day leaves.

In [18]:
from sklearn.metrics import pairwise_distances

def tree_descent_net(root_lib,point_up,sink,init= [0]):
    
    for layer in list(range(1,sink + 1))[::-1]:
        
        where_to= point_up[layer - 1]
        
        if layer == sink:
            starters= init
        else:
            starters= list(set(where_to[:,1]))

        point_up_house= where_to[where_to[:,3] == 0]

        if layer == sink:
            
            AC= root_lib[sink][0][0]
            
            leaves= {
                -1: root_lib[sink][-2][AC[0]]
            }
            
            nodes= {
                -1: []
            }
            
            node_code= {
                AC[0]: -1
            }
            
            edges= []
        
        
        for row in range(point_up_house.shape[0]):
            line= point_up_house[row,:]
            
            who_app= [x for x in root_lib[layer - 1][line[0]][:,0] if x not in root_lib[layer][line[1]][:,0]][0]
            
            if who_app not in node_code.keys():
                node_code[who_app]= len(node_code) - 1
                
                nodes[node_code[who_app]]= []
                
                leaves[node_code[who_app]]= root_lib[layer][-2][who_app]
            
            mut= point_up_house[row,4]
            
            who_seq= list(root_lib[layer - 1][-2][who_app])
            
            who_seq[mut]= 0
            
            dists= pairwise_distances(np.array(who_seq).reshape(1,-1), root_lib[layer][-2],
                                                metric='manhattan')[0]
            
            which= np.where(dists==0)[0]
            
            for poss in which:
                
                if poss != who_app:
                    
                    if poss not in node_code.keys():

                        node_code[poss]= len(node_code) - 1

                        nodes[node_code[poss]]= []

                        leaves[node_code[poss]]= root_lib[layer][-2][poss]
                    
                    code_dn= node_code[poss]

                    new_edge= (code_dn,node_code[who_app])
                    
                    if new_edge not in edges:
                        edges.append(new_edge)

                        nodes[new_edge[0]].append(new_edge[1])

    return nodes, edges, leaves, node_code





In [19]:
##

nodes, edges, leaves, node_code= tree_descent_net(root_lib,point_up,sink,init= [0])

from structure_tools.Coalesce_plots import get_ori_graph

present= True
nodes_as_seqs= True
root= True



get_ori_graph(root_lib,edges,node_list,leaves,present= present,
                                            nodes_as_seqs= nodes_as_seqs,
                                            root= root)


[0, 3, 4, 5]


color code: 
    - blue= present day haplotypes.
    - purple= haplotypes present only as precendents.
    - red= root.

### Conclusions

- Every group of ambiguous nodes is a directed graph in which the the sink is always a known hap. 
- Every path from root to sink has the same length = the number of differences between root and sink.
- Breadth first search returns nodes grouped by dist from source/sink in layers.

The final observation leads to the following:

                    - Number of nodes per layer: l combinations in D.
                    - total number of ambiguous nodes from source to sink: sum of (i comb. in D) for i in [1 through D-1].

For `D` = number of differences between source and sink haps, and `l`= layer number (0 at sink).

The preceding might only be true of networks constructed from coalescent reconstructions of data respecting infinite sites assumption. 