In [4]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """
    Build a defaultdict whose default factory is this same function, so
    every missing key yields another nested defaultdict of the same kind.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested



## Simulations

- Hein, J., Schierup, M., & Wiuf, C. (2004). Gene genealogies, variation and evolution: a primer in coalescent theory. Oxford University Press, USA. (*)

(*) referenced as GGVE throughout.


### Discrete and continuous coalescent - functions.

i. Geometric distribution.

- GGVE pp. 36 through 38.

In [6]:
from functools import reduce
from math import factorial
from operator import mul    # or mul=lambda x,y:x*y
from fractions import Fraction

def nCk(n,k):
    """
    Binomial coefficient "n choose k".

    Multiplies the exact fractions (n - i) / (i + 1) for i = 0 .. k-1 and
    converts the product to an int. An empty product (k <= 0) gives 1.
    """
    product = Fraction(1)
    for step in range(k):
        product = product * Fraction(n - step, step + 1)
    return int(product)


def prob_noneF(k,Nsamp= 10):
    '''
    Probability that none of k genes share a parent in the previous generation.

    Evaluated as the product of (1 - i / (2 * Nsamp)) for i = 1 .. k-1.

    k: number of genes followed back in time.
    Nsamp: population size parameter; each factor uses 2 * Nsamp.

    Returns the product as a float. For k <= 1 the product is empty and the
    result is 1.0 (a single gene cannot coalesce), which agrees with
    probOne(1) == 0. Previously this case raised TypeError because reduce()
    was called on an empty sequence without an initial value.
    '''
    none_coal= 1.0

    for i in range(1, k):
        none_coal= none_coal * (1 - i / 2 / Nsamp)

    return none_coal

def probOneF(k,Nsamp= 10):
    '''
    Complement of prob_noneF(k, Nsamp): one minus the probability that no
    two of the k genes share a parent in the previous generation.
    '''
    return 1 - prob_noneF(k,Nsamp)

def CoalGeomF(j,k= 4,Nsamp= 10):
    '''
    Geometric probability that the first coalescence among k genes happens
    exactly j generations back, using the full per-generation probability
    from prob_noneF.

    Evaluates q**(j - 1) * (1 - q) with q = prob_noneF(k, Nsamp). The
    formula is applied as given; j is not checked against the j >= 1 range.
    '''
    stay_apart= prob_noneF(k,Nsamp)
    coalesce= 1 - stay_apart

    return stay_apart**(j-1) * coalesce


###
###

def probOne(k,Nsamp= 10):
    '''
    Simplified per-generation coalescence probability for k genes:
    the number of gene pairs, nCk(k, 2), divided by 2 and then by Nsamp.
    '''
    n_pairs= nCk(k,2)

    return n_pairs / 2 / Nsamp

def CoalGeom(j,k= 4,Nsamp= 10):
    '''
    Geometric probability that the first coalescence among k genes happens
    exactly j generations back, using the simplified per-generation
    probability p = probOne(k, Nsamp).

    Evaluates (1 - p)**(j - 1) * p. The formula is applied as given; j is
    not checked against the j >= 1 range.
    '''
    p_coal= probOne(k,Nsamp)

    return (1 - p_coal)**(j-1) * p_coal

def CoalGeom_mean(j,mean= 2.5):
    '''
    Geometric probability of a first success at trial j, parameterised by
    the distribution mean rather than by k and Nsamp.

    The success probability is p = 1 / mean and the value returned is
    (1 - p)**(j - 1) * p. The formula is applied as given; j is not checked
    against the j >= 1 range.
    '''
    p_success= 1 / mean
    p_failure= 1 - p_success

    return p_failure**(j-1) * p_success


ii. Exponential Distribution.

- GGVE p. 39.

In [230]:
def expTk_cdf(t,k= 4):
    '''
    Exponential cumulative distribution 1 - exp(-a * t) evaluated at t,
    with rate a = nCk(k, 2), the number of pairs among k genes.
    '''
    rate= nCk(k,2)

    return 1 - np.exp(-rate * t)


def expTk_pdf(t,k= 4):
    '''
    Exponential density a * exp(-a * t) evaluated at t, with rate
    a = nCk(k, 2), the number of pairs among k genes.
    '''
    rate= nCk(k,2)
    decay= np.exp(-rate * t)

    return rate * decay


def tkdens_mean(t,meanA= 2.5):
    '''
    Exponential density evaluated at t, parameterised by its mean.

    The rate is a = 1 / meanA and the value returned is a * exp(-a * t).
    '''
    rate= 1 / meanA
    decay= np.exp(-rate * t)

    return rate * decay


**plot**

In [394]:
# Shared parameters for the plotting cells below.
# Nsamp is forwarded as the Nsamp argument of CoalGeomF / CoalGeom.
Nsamp= 10
# k is forwarded as the k argument (number of genes) of the coalescent functions.
k= 4
# Mean used by the mean-parameterised curves (CoalGeom_mean, tkdens_mean).
example_mean= 2.5


In [395]:

# Generation grid for the discrete (geometric) curves. The geometric formulas
# (1 - p)**(j - 1) * p are only probabilities for j >= 1; at j = 0 they
# evaluate to p / (1 - p), so the grid starts at 1 rather than 0.
X_geom= list(range(1,11))
# Time grid for the continuous (exponential) curves.
X_exp= list(np.linspace(0,10,50))


geomF= [CoalGeomF(x,k= k,Nsamp=Nsamp) for x in X_geom]
geomS= [CoalGeom(x,k= k,Nsamp=Nsamp) for x in X_geom]
geomM= [CoalGeom_mean(x,mean=example_mean) for x in X_geom]

# Exp_cdf and Exp_pdf are not drawn here; the following cell plots them.
Exp_cdf= [expTk_cdf(x,k= k) for x in X_exp]
Exp_pdf= [expTk_pdf(x,k= k) for x in X_exp]
Exp_mean= [tkdens_mean(x,meanA= example_mean) for x in X_exp]

fig_c= [go.Scatter(
    x= X_geom,
    y= geomF,
    mode= 'lines',
    name= 'Geom full'
)]

fig_geom2= [go.Scatter(
    x= X_geom,
    y= geomS,
    mode= 'lines',
    name= 'Geom Simp'
)]

# Mean-parameterised geometric curve. This trace previously plotted geomS
# again (copy-paste slip), so geomM was computed but never shown.
fig_geom_mean= [go.Scatter(
    x= X_geom,
    y= geomM,
    mode= 'lines',
    name= 'Geom X= {}'.format(example_mean)
)]


fig_exp_II= [go.Scatter(
    x= X_exp,
    y= Exp_mean,
    mode= 'lines',
    name= 'Exp dens X= {}'.format(example_mean)
)]


fig_c.extend(fig_geom2)
fig_c.extend(fig_exp_II)
fig_c.extend(fig_geom_mean)



layout= go.Layout(
    title= 'Discrete and continuous coalescent. k= {}; n= {} if not mean.'.format(k,Nsamp),
    yaxis= dict(title= 'P'),
    xaxis= dict(title= 't')
)


Fig= go.Figure(data= fig_c, layout= layout)
iplot(Fig)

In [402]:
# Exponential cdf and pdf over the X_exp grid, drawn on a single figure.
fig_exp_cdf= [go.Scatter(mode= 'lines', name= '1 - exp(-at)', x= X_exp, y= Exp_cdf)]
fig_exp_pdf= [go.Scatter(mode= 'lines', name= 'a * exp(-at)', x= X_exp, y= Exp_pdf)]

# Traces in drawing order: cdf first, then pdf.
fig_exp= fig_exp_cdf + fig_exp_pdf

layout= go.Layout(
    xaxis= {'title': 't'},
    yaxis= {'title': 'P'},
    title= 'dist of Tck for k -> k-1; k= {} n= {}.'.format(k,Nsamp)
)

Fig= go.Figure(data= fig_exp, layout= layout)
iplot(Fig)

#### Algorithm I.

Simple coalescence.

- GGVE p. 39.

In [419]:
from random import choices
from scipy.stats import expon
   
def get_time(k= 4,Nt=10):
    '''
    Draw Nt exponential waiting times with scale 1 / nCk(k, 2), i.e. with
    rate equal to the number of pairs among k lineages.

    Returns whatever expon.rvs returns for size= Nt.
    '''
    n_pairs= nCk(k,2)

    return expon.rvs(size= Nt,scale= 1 / n_pairs)


def sim_coales(k= 4):
    '''
    Draw the k - 1 successive waiting times of a coalescent that starts
    with k lineages.

    One time is drawn with get_time for each lineage count k, k-1, ..., 2,
    in that order, and the draws are returned as a flat list.
    '''
    return [
        wait
        for n_lineages in range(k, 1, -1)
        for wait in get_time(k= n_lineages,Nt=1)
    ]


def SimI_net(k= 4):
    '''
    Simulate one coalescent genealogy for k sampled lineages.

    Waiting times are drawn with sim_coales(k). At every coalescent event two
    of the currently surviving lineages are picked uniformly at random and
    merged into a new parent node.

    Node ids: tips are 0 .. k-1, internal nodes are k, k+1, ... in order of
    creation, and the node created by the last event is labelled -1.

    Returns a 4-tuple:
        sim_keys:  dict, tip -> [tip] and parent -> [child_a, child_b].
        times_acc: cumulative waiting times, one per coalescent event.
        leaves:    dict node -> ['t: <time>'] text label.
        edges:     list of (parent, child) tuples.

    Fixes relative to the previous version: `leaves` is now a local dict
    (it was never defined here, so the function either failed or mutated a
    leftover global), and the shuffle goes through numpy because the `random`
    module itself is not imported by the notebook.
    '''
    times= sim_coales(k= k)
    # Time of each event measured from the present (running total).
    times_acc= list(np.cumsum(times))

    sim_keys= {z: [z] for z in range(k)}

    # Tips get a label at time 0.0 so every node in sim_keys has an entry.
    # NOTE(review): assumes plot_phyl_net looks up a label per node - confirm.
    leaves= {z: ['t: {}'.format(0.0)] for z in range(k)}

    edges= []

    # Lineages that have not yet coalesced.
    surface= list(range(k))
    np.random.shuffle(surface)

    n_events= len(times_acc)

    for cl, t_event in enumerate(times_acc):

        # The last event creates the final node, labelled -1.
        new_nd= -1 if cl == n_events - 1 else cl + k

        pair_idx= np.random.choice(len(surface),2,replace= False)
        pair= tuple(surface[x] for x in pair_idx)

        edges.extend((new_nd,child) for child in pair)

        # Replace the two merged lineages by their parent.
        surface= [nd for idx, nd in enumerate(surface) if idx not in pair_idx]
        surface.append(new_nd)

        sim_keys[new_nd]= list(pair)

        ti= round(t_event,2)
        leaves[new_nd]= ['t: {}'.format(ti)]

    return sim_keys, times_acc, leaves, edges





In [421]:
from structure_tools.Coalesce_plots import plot_phyl_net

# Simulate one genealogy for seven lineages and hand it to plot_phyl_net.
k= 7
sim_keys, times_acc, leaves, edges= SimI_net(k= k)

# Every node id produced by the simulation (tips and internal nodes).
node_list= list(sim_keys)

# Flags forwarded unchanged to plot_phyl_net.
root, nodes_as_seqs= True, False

plot_phyl_net([], leaves, node_list, edges, nodes_as_seqs= nodes_as_seqs, root= root)
