In [38]:
import numpy as np
from random import choices
from scipy.stats import expon
from functools import reduce
from math import factorial
from operator import mul    # or mul=lambda x,y:x*y
from fractions import Fraction

def nCk(n,k):
  return int( reduce(mul, (Fraction(n-i, i+1) for i in range(k)), 1) )

import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


In [276]:
def get_time(k= 4,Nt=10):
    
    data= []
    
    while len(data) < Nt:
        ## return average instead of sample from dist. 
        um= 1/ nCk(k,2) #expon.rvs(size= 1,scale= 1 / nCk(k,2))[0]
        
        if um > 0:
            data.append(um)
    
    return data


def sim_coales(k= 4):
    
    Tcs= []
    
    for co in range(k-1):
        
        td= get_time(k= k - co,Nt=1)
        
        Tcs.extend(td)
        
        
    return Tcs

def get_randTree(nod_list):
    
    
    edges= []
    rev_edges= {}
    nodes= {}
    
    current_layer= list(nod_list)
    
    nnames= range(len(nod_list) - 1)
    nnames= nnames[::-1]
    nnames= ['n{}'.format(d) for d in nnames]
    
    nodes_times= sim_coales(k= len(nod_list))
    nodes_times= [sum(nodes_times[:(x+1)]) for x in range(len(nodes_times))]
    nodes_times= nodes_times[::-1]
    
    times_dict= {nnames[x]: nodes_times[x] for x in range(len(nnames))}
    
    d= 0
    for idx in range(len(nod_list) - 1):
        
        nname= nnames[idx]
        two= np.random.choice(current_layer,2,replace= False)
        #
        nodes[nname]= list(two)
        
        for nn in two:
            edges.append((nname,nn))
            rev_edges[nn]= nname
        
        current_layer.append(nname)
        current_layer= [x for x in current_layer if x not in two]
        d += 1
    
    return nodes, edges, rev_edges, times_dict



In [243]:
def leaf_to_root(leaves,rev_edges,source= 'n0'):
    #rev_edges= [x[::-1] for x in edges]
    
    leaf_path= {x:[] for x in leaves}
    
    for ed in leaves:
        curr= ed
        up= rev_edges[ed]
        
        d= 0
        while d==0:
            leaf_path[ed].append(up)
            
            curr= ed
            
            if up == source: 
                
                d += 1
            else:
                up= rev_edges[up]
    
    return leaf_path



In [244]:
def tree_dists(leaf_path):
    
    all_dists= []
    for x,g1 in leaf_path.items():
        dist_foc= []
        for y,g2 in leaf_path.items():
            comb= [x for x in g1 if x in g2][0]
            
            dists= [g1.index(comb),g2.index(comb)]
            
            if x == y:
                dist_foc.append(0)
            else:
                
                dist_foc.append(max(dists)+1)
        
        all_dists.append(dist_foc)
    
    all_dists= np.array(all_dists)
    
    return all_dists


def tree_timeS(nodes_times, leaf_path):
    
    all_dists= []
    for x,g1 in leaf_path.items():
        dist_foc= []
        for y,g2 in leaf_path.items():
            comb= [x for x in g1 if x in g2][0]
            
            dists= nodes_times[comb]
            
            if x == y:
                dist_foc.append(0)
            else:
                
                dist_foc.append(dists)
        
        all_dists.append(dist_foc)
    
    all_dists= np.array(all_dists)
    
    return all_dists


Nnodes= 5
nod_names= list(range(Nnodes))
nod_names= ["l{}".format(x) for x in nod_names]
triu= np.triu_indices(len(nod_names),k=1)


In [245]:

Nrep= 200

dist_store= []
times_store= []

for ix in range(Nrep):
    
    nodes, edges, rev_edges, nodes_times= get_randTree(nod_names)
    leaf_path= leaf_to_root(nod_names,rev_edges,source= 'n0')
    
    dists= tree_dists(leaf_path)
    times_mat= tree_timeS(nodes_times, leaf_path)
    
    tree_dist= dists[triu]
    times_mat= times_mat[triu]
    
    dist_store.append(tree_dist)
    times_store.append(times_mat)

times_store= np.array(times_store)
dist_store= np.array(dist_store)


In [246]:
###
###

In [247]:
from scipy.stats import poisson

actual_dists= [
    [0,1,0,1,2],
    [1,0,0,4,2],
    [1,1,0,2,1],
    [1,2,4,0,0],
    [0,0,0,0,0]
]

actual_dists= np.array(actual_dists)
actual_dists= actual_dists[triu]

actual_dists= actual_dists.reshape(1,-1)
actual_dists= np.repeat(actual_dists,Nrep,axis= 0)

In [248]:
Theta= 2


In [252]:
Pexp= times_store * Theta / 2

probs= poisson.pmf(actual_dists,Pexp)
probs= np.prod(probs,axis= 1)


In [253]:
from sklearn.decomposition import PCA

n_comp= 4
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')

feats= pca.fit_transform(times_store)


In [254]:

figwl= [go.Scatter(
    mode='markers',
    x=feats[:,0],
    y=feats[:,1],
    #z=background[:,2],
    marker= {
    'color':probs,
    'colorbar': go.scatter.marker.ColorBar(
        title= 'likelihood',
        yanchor="top", y=0.3,
        lenmode="pixels", len=200,
    ),
    'colorscale':'Viridis',
    'line': {'width': 0},
    'size': 10,
    'symbol': 'circle',
  "opacity": .7
  }
)]

layout= go.Layout(
    xaxis= dict(title= 'PC1'),
    yaxis= dict(title= 'PC2'),
    height= 900,
    width= 900
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [255]:
p= np.argmax(probs)
p

102

In [256]:
probs[p]

8.271988572550158e-17

In [257]:
dist_store[p]

array([2, 3, 2, 2, 4, 1, 3, 4, 2, 3])

In [258]:
times_store[p]

array([0.14888889, 0.01      , 0.14888889, 0.03777778, 0.01      ,
       1.14888889, 0.03777778, 0.01      , 0.01      , 0.03777778])

In [259]:
actual_dists[0]

array([1, 0, 1, 2, 0, 4, 2, 2, 1, 0])

In [260]:
####
#### vary theta


In [261]:
from sklearn.metrics import pairwise_distances

actual_data= [
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
]


actual_data= [
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
]

actual_data= np.array(actual_data)

actual_dists= pairwise_distances(actual_data,metric= 'manhattan')
actual_dists= actual_dists[triu]
actual_dists= actual_dists.reshape(1,-1)


In [312]:


Nrep= 1000
dists_matrix= np.repeat(actual_dists,Nrep,axis= 0)
Theta_range= np.linspace(0.1,50,200)

Nnodes= 5
nod_names= list(range(Nnodes))
nod_names= ["l{}".format(x) for x in nod_names]



In [327]:
def prob_coal(theta,nsamp):
    
    p= (nsamp - 1) / (nsamp - 1 + theta)
    
    return p

In [336]:
triu= np.triu_indices(len(nod_names),k=1)

dist_store= []
times_store= []

for ix in range(Nrep):

    nodes, edges, rev_edges, nodes_times= get_randTree(nod_names)
    leaf_path= leaf_to_root(nod_names,rev_edges,source= 'n0')

    dists= tree_dists(leaf_path)
    times_mat= tree_timeS(nodes_times, leaf_path)

    tree_dist= dists[triu]
    times_mat= times_mat[triu]

    dist_store.append(tree_dist)
    times_store.append(times_mat)

times_store= np.array(times_store)
dist_store= np.array(dist_store)
#print(dist_store.shape)
###

test_list= []



for Theta in Theta_range:
    
    ###
    #prob_coals= [prob_coal(Theta,x) for x in range(2,len(nod_names)+1)]
    #prob_coals= np.prod(prob_coals)
    ###
    Pexp= times_store * Theta / 2
    
    probs= poisson.pmf(dists_matrix,mu= Pexp)
    probs= np.prod(probs,axis= 1) 
    
    test_list.append([Theta,np.sum(probs)])

test_list= np.array(test_list)


In [337]:
[prob_coal(Theta,x) for x in range(2,len(nod_names)+1)]

[0.0196078431372549,
 0.038461538461538464,
 0.05660377358490566,
 0.07407407407407407]

In [338]:
#pexp= 3

#test_space= np.linspace(1,4,20,dtype= int)
#test= [poisson.cdf(x,mu=3) for x in test_space]

figwl= [go.Scatter(
    mode='markers',
    x=test_list[:,0],
    y=test_list[:,1],
    #z=background[:,2]
)]

layout= go.Layout(
    xaxis= dict(title= 'Theta'),
    yaxis= dict(title= 'prod L'),
    height= 900,
    width= 900
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

# Attempt II.



In [594]:


def get_TreeLikes(nod_list,hap_dict,Theta= 1):
    
    edges= []
    rev_edges= {}
    nodes= {}

    current_layer= list(nod_list)

    nnames= range(len(nod_list) - 1)
    nnames= nnames[::-1]
    nnames= ['n{}'.format(d) for d in nnames]

    times= sim_coales(k= len(nod_list))
    nod_times= [times[0]]

    for idx in range(1,len(times)):
        nod_times.append(times[idx] - nod_times[-1])

    nodes_times= [sum(times[:(x+1)]) for x in range(len(times))]
    nodes_times= nodes_times[::-1]

    times_dict= {nnames[x]: nod_times[x] for x in range(len(nnames))}
    
    d= 0

    dist_vec= []
    probs= []
    for idx in range(len(nod_list) - 1):

        nname= nnames[idx]
        two= np.random.choice(current_layer,2,replace= False)
        #
        nodes[nname]= list(two)

        ##
        diff_two= hap_dict[two[0]] - hap_dict[two[1]]
        dist_two= diff_two != 0
        dist_two= np.array(dist_two,dtype= int)
        dist_vec.append(sum(dist_two))

        ##
        new_hap= diff_two == 0
        new_hap= np.array(new_hap,dtype= int)
        hap_dict[nname]= new_hap
        
        if sum(dist_two) == 0:
            prb= prob_coal(Theta,len(nod_list)-idx)
        else:
            prb= prob_mut(Theta,len(nod_list)-idx) ** sum(dist_two)
        probs.append(prb)
        
        for nn in two:
            edges.append((nname,nn))
            rev_edges[nn]= nname

        current_layer.append(nname)
        current_layer= [x for x in current_layer if x not in two]

        d += 1

    return nodes, edges, rev_edges, times_dict, dist_vec, nod_times, probs



In [584]:


Nrep= 1000
dists_matrix= np.repeat(actual_dists,Nrep,axis= 0)
Theta_range= np.linspace(0.1,20,200)

Nnodes= 5
nod_names= list(range(Nnodes))
nod_names= ["l{}".format(x) for x in nod_names]



In [595]:
Nrep= 100
triu= np.triu_indices(len(nod_names),k=1)
hap_dict= {nod_names[x]:actual_data[x] for x in range(len(nod_names))}
nod_list= nod_names

dist_store= []
times_store= []
prob_store= []

for ix in range(Nrep):

    nodes, edges, rev_edges, times_dict, dist_vec, nod_times, probs= get_TreeLikes(nod_list,hap_dict)
    dist_store.append(dist_vec)
    times_store.append(nod_times)
    prob_store.append(probs)

prob_store= np.array(prob_store)
times_store= np.array(times_store)
dist_store= np.array(dist_store)

In [597]:

def prob_mut(theta,nsamp):
    
    p= theta / (nsamp - 1 + theta)
    
    return p

test_list= []

for Theta in Theta_range:
    
    ###
    prob_coals= [prob_coal(Theta/2,x) for x in range(2,len(nod_names)+1)]
    mut_probs= [prob_mut(Theta,x) for x in range(2,len(nod_names)+1)]
    #mut_probs= np.array(mut_probs).reshape(1,-1)
    #mut_probs= np.repeat(mut_probs,Nrep,axis= 0)
    
    #prob_coals= prob_coals[::-1]
    ##
    Pexp= times_store * Theta / 2
    Pexp= Pexp 
    ##
    
    probs= poisson.pmf(dist_store,mu= Pexp)
    
    probs= np.prod(probs,axis= 1) 
    
    test_list.append([Theta,np.max(probs)])

test_list= np.array(test_list)


In [601]:
#pexp= 3

#test_space= np.linspace(1,4,20,dtype= int)
#test= [poisson.cdf(x,mu=3) for x in test_space]

figwl= [go.Scatter(
    mode='markers',
    x=test_list[:,0],
    y=test_list[:,1],
    #z=background[:,2]
)]

layout= go.Layout(
    xaxis= dict(title= 'Theta'),
    yaxis= dict(title= 'prod L'),
    height= 900,
    width= 900
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [591]:
####
prob_store= []
for Theta in Theta_range:
    
    prob_vec= []
    
    for ix in range(Nrep):

        nodes, edges, rev_edges, times_dict, dist_vec, nod_times, probs= get_TreeLikes(nod_list,hap_dict)
        prob_vec.append(probs)
    
    probs_vec= np.array(prob_vec)
    probs_vec= np.prod(probs_vec,axis= 1)
    prob_store.append(np.sum(probs_vec))


In [593]:
probs_vec

array([4.34027778e-05, 4.16666667e-06, 3.25520833e-05, 1.30208333e-06,
       1.25000000e-03, 8.33333333e-05, 5.55555556e-06, 1.73611111e-06,
       1.54320988e-05, 2.31481481e-06, 5.55555556e-06, 2.31481481e-04,
       3.61689815e-06, 1.04166667e-06, 1.38888889e-04, 1.38888889e-06,
       6.94444444e-06, 2.31481481e-04, 1.73611111e-06, 1.66666667e-04,
       2.77777778e-05, 5.55555556e-06, 2.77777778e-05, 1.04166667e-05,
       2.77777778e-05, 6.51041667e-06, 1.66666667e-04, 3.25520833e-05,
       1.08506944e-05, 8.33333333e-03, 5.55555556e-06, 9.25925926e-04,
       6.94444444e-04, 2.08333333e-03, 1.04166667e-06, 6.94444444e-04,
       2.77777778e-05, 2.08333333e-03, 6.94444444e-04, 8.68055556e-06,
       4.34027778e-05, 6.94444444e-04, 1.66666667e-04, 6.94444444e-04,
       2.08333333e-05, 5.20833333e-04, 6.94444444e-04, 1.04166667e-06,
       1.66666667e-05, 6.94444444e-04, 2.31481481e-04, 4.16666667e-06,
       2.60416667e-04, 1.38888889e-04, 1.38888889e-06, 3.08641975e-06,
      

In [592]:
#pexp= 3

#test_space= np.linspace(1,4,20,dtype= int)
#test= [poisson.cdf(x,mu=3) for x in test_space]

figwl= [go.Scatter(
    mode='markers',
    x=Theta_range,
    y=prob_store,
    #z=background[:,2]
)]

layout= go.Layout(
    xaxis= dict(title= 'Theta'),
    yaxis= dict(title= 'prod L'),
    height= 900,
    width= 900
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)

In [301]:
pexp= 1.8

test_space= np.linspace(0.1,4,20,dtype= int)
test= [poisson.pmf(x,mu=pexp) for x in test_space]


figwl= [go.Scatter(
    mode='markers',
    x=test_space,
    y=test,
    #z=background[:,2]
)]

layout= go.Layout(
    xaxis= dict(title= 'Theta'),
    yaxis= dict(title= 'prod L'),
    height= 900,
    width= 900
)

Figure_wl= go.Figure(data= figwl, layout= layout)

iplot(Figure_wl)