In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """
    Build an arbitrarily nested defaultdict.

    Any missing key yields another defaultdict created by this same factory,
    so chained assignments such as d['a']['b']['c'] = 1 work without setup.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested



## Chapter III - Sample convergence

#### References

- Griffiths, R. (2006). Coalescent lineage distributions. Advances in Applied Probability, 38(2), 405-429. 
doi:10.1239/aap/1151337077

- Hein, J., Schierup, M., & Wiuf, C. (2004). Gene genealogies, variation and evolution: a primer in coalescent theory. Oxford University Press, USA. (*)

(*) referenced as GGVE throughout.


### Index

A. Coalescent only.

    i. Nested subsamples;
    ii. Hanging subtrees;
    iii. Unbalanced Trees;
    iv. Disjoint subsamples.

B. Partition by Mutation.

    i. Sample MRCA-Mut expected time;
    ii. MRCA time of 2 sequences.
    iii. Probability of k ancestors at time t.
    iv. Probability of m ancestors at t1 knowing n and k for times 0, t.
    


#### i. Nested sub-samples.

(m,n) such that m is nested in n.

In [2]:
def nested_jumps(m,n):
    '''
    Jump-process transition probabilities for nested subsamples.

    State (m, n): m lineages of the nested subsample among the n lineages of
    the full sample. At the next coalescence among the n lineages:

    - (m-1, n-1): both coalescing lineages belong to the subsample,
      with probability m(m-1) / (n(n-1));
    - (m, n-1): any other pair coalesces (the complement).

    Returns a dict {next_state: probability}. Requires n >= 2.
    '''
    # The original expression referred to undefined names i and j;
    # they are the function's own arguments m and n.
    within= m * (m - 1) / n / (n - 1)

    jump_dict= {
        (m - 1, n - 1): within,
        (m, n - 1): 1 - within
    }

    return jump_dict
    

def Nested_sameMRCA(m,n):
    '''
    Probability that a subsample of size m nested in a sample of size n
    shares the MRCA of the full sample:

        (n + 1)(m - 1) / ((n - 1)(m + 1))
    '''
    numerator= (n + 1) * (m - 1)
    denominator= (n - 1) * (m + 1)

    return numerator / denominator



#### ii. Hanging subtrees

Descendent configuration knowing initial ancestor and descendent number. 

Probability of labelled/unlabelled, ordered/unordered partitions given a set of k ancestors and n samples at t0.

In [3]:
from functools import reduce
from math import factorial


### unlabelled, ordered.
def UnlOrd(k,n):
    '''
    Probability of one unlabelled, ordered configuration of subtree sizes
    for k ancestors and n samples:

        (n - k)! (k - 1)! / (n - 1)!   ==  1 / C(n - 1, k - 1)

    Requires 1 <= k <= n.
    '''
    up= factorial(n - k) * factorial(k - 1)
    up= up / factorial(n - 1)

    # Return the computed value (the original returned the function object itself).
    return up


### labelled ordered
def LabOrd(k,n,conf_vec):
    '''
    Labelled, ordered configuration weight.

    Starts from UnlOrd(k, n), multiplies by the factorial of every entry of
    conf_vec (the subtree sizes), and divides the result by n!.
    '''
    weight= UnlOrd(k,n)

    for size in conf_vec:
        weight= weight * factorial(size)

    return weight / factorial(n)


### unlabelled unordered
def UnlUn(k,n,conf_vec):
    '''
    Unlabelled, unordered configuration probability.

    Multiplies the unlabelled ordered probability UnlOrd(k, n) by the number
    of distinct orderings of the size vector, k! / prod_s (a_s!), where a_s
    is how many subtrees in conf_vec have size s.

    conf_vec: iterable of the k subtree sizes.
    '''
    # One multiplicity per DISTINCT size. The original iterated over every
    # element of conf_vec, so a size occurring a_s times contributed
    # (a_s!)**a_s instead of a_s!.
    multiplicities= collections.Counter(conf_vec)

    freq_conf= 1
    for count in multiplicities.values():
        freq_conf *= factorial(count)

    up= UnlOrd(k,n) * factorial(k)
    up= up / freq_conf

    return up


### labelled, unordered
def LabUn(k,n,conf_vec):
    '''
    Labelled, unordered configuration weight: the labelled ordered value
    LabOrd(k, n, conf_vec) scaled by k!.
    '''
    ordered= LabOrd(k,n,conf_vec)

    return ordered * factorial(k)


#### iii. Unbalanced trees.

Basal split between two samples. Tree is said to be unbalanced if one of the sides retains a single sample at t= 0.

for (i, n-i), i= 1,2,.., [n/2].

In [4]:
## probability of basal split resulting in labelled unordered partition (i, n-i):
def BasalSplitP_lab(n,i):
    '''
    Probability that the basal split produces one specific labelled,
    unordered partition with block sizes (i, n-i):

        2 * i! * (n - i)! / (n! * (n - 1))

    For i == 1 this reduces to 2 / (n (n - 1)).
    Requires n >= 2 and 1 <= i <= n - 1.
    '''
    # The factor 2 multiplies i!; the original used factorial(2 * i), which
    # only coincides for i == 1 and exceeds 1 for e.g. n=6, i=3.
    up= 2 * factorial(i) * factorial(n - i)
    up= up / factorial(n) / (n - 1)

    return up

## same for unlabelled partition
def BasalSplitP_unl(n,i):
    '''
    Probability that the basal split yields the unlabelled, unordered
    partition (i, n-i):

        2 / (n - 1)   if i != n - i
        1 / (n - 1)   if i == n - i  (the two orderings coincide)

    Requires n >= 2.
    '''
    # Equal halves are counted once, whatever the parity of i.
    up= 1 if 2 * i == n else 2

    # Parenthesised denominator: the original `up / n-1` evaluated (up / n) - 1.
    return up / (n - 1)



#### iv. Disjoint Subsamples

What is the probability that all genes in subsample A coalesce with each
other before coalescing with any of the genes in B?

- Let the number of genes in A be k such that there are n − k genes in B.
- Jump process to continue until (1,j) has been reached for some j. (0,j) denotes full merger of A into B.

In [5]:
def disjoint_jumps(m,n,ref= True):
    '''
    Jump-process transition probabilities for state (m, n).

    - (m-1, n): (m + int(ref)) / (m + n)
    - (m, n-1): (n - int(ref)) / (m + n)

    ref shifts one unit of weight from the second transition to the first.
    Returns a dict {next_state: probability}.
    '''
    shift= int(ref)
    total= m + n

    jump_dict= dict()
    jump_dict[(m - 1, n)]= (m + shift) / total
    jump_dict[(m, n - 1)]= (n - shift) / total

    return jump_dict


def loneMRCA(k,n):
    '''
    Probability that a subsample of size k (out of n) reaches its own MRCA
    before merging with the rest of the sample:

        2 (k - 1)! / ((k + 1) (n - 1)(n - 2)...(n - k + 1))
    '''
    numerator= 2 * factorial(k - 1)

    # (k + 1) times the falling product (n-1)(n-2)...(n-k+1)
    denominator= k + 1
    for step in range(1, k):
        denominator *= n - step

    return numerator / denominator


def sub_MRCA(n,k):
    '''
    Probability that at least one of the two disjoint subsamples (sizes k and
    n - k) finds its MRCA before merging with the other.

    Inclusion-exclusion: P(A) + P(B) - P(A and B).

    Note the argument order is (n, k), unlike loneMRCA(k, n).
    '''
    # Both subsamples find their own MRCA first exactly when the basal split
    # separates them, i.e. the specific labelled split (k, n-k):
    #     2 k! (n-k)! / (n! (n-1))
    # e.g. n=4, k=2: 2/9 + 2/9 - 1/9 = 1/3, which matches direct enumeration
    # of the first coalescence (only the 2 within-subsample pairs out of 6).
    both= 2 * factorial(k) * factorial(n - k) / factorial(n) / (n - 1)

    # The original referenced the non-existent names lineMRCA / BasalSplitP_Unl.
    up= loneMRCA(k,n) + loneMRCA(n-k,n) - both

    return up



def disjoint_summary(k,n):
    '''
    Expected values for a subsample of size k out of n.

    Returns a dict with keys:
    - 'MRCA': (k - 1) / n
    - 'mut':  2 k / n
    - 'ANCb': 3 (n + 1) / (k + 2) - 2

    Per the section text these are the mean time until MRCA, the mean time
    until absorption, and the expected size of the rest of the sample at
    absorption (presumably in that key order - confirm against the reference).
    '''
    Exp= {
        'MRCA': (k - 1) / n,
        'mut': 2 * k / n,
        # uses the argument k; the original referenced an undefined `K`
        'ANCb': 3 * (n + 1) / (k + 2) - 2
    }

    return Exp


### Partition with Mutation.

#### Time

In [6]:
from operator import mul    # or mul=lambda x,y:x*y
from fractions import Fraction

def nCk(n,k):
    '''
    Binomial coefficient "n choose k", computed exactly with rationals as
    prod_{i=0}^{k-1} (n - i) / (i + 1) and truncated to int.
    '''
    acc= Fraction(1)

    for i in range(k):
        acc= acc * Fraction(n - i, i + 1)

    return int(acc)


def mut_jumps(m,n,ref= True):
    '''
    Jump-process transition probabilities for the partition-by-mutation case.

    - (m-1, n): m / (m + n - 1)
    - (m, n-1): (n - 1) / (m + n - 1)

    (1, j) denotes the MRCA of A; (0, j) denotes occurrence of the mutation.
    `ref` is accepted for signature parity but is not used.
    '''
    denom= m + n - 1

    return {
        (m - 1, n): m / denom,
        (m, n - 1): (n - 1) / denom
    }


def Time_mut(k,n):
    '''
    Intended to return expected times for a mutant class of size k in a
    sample of size n, as a dict with keys 'MRCA', 'mut' and 'ANCb'
    ('ANCb' is the constant -1).

    NOTE(review): as written this function cannot run:
    - `nCr` is not defined anywhere in the visible notebook (the binomial
      helper defined in this cell is `nCk`);
    - `up` is never assigned inside this function.
    NOTE(review): `tMRCA` and `tmut` are each assigned twice in a row, so the
    first expression (2k/(n-1) - 2/C(n-1,k)) is discarded; the two first
    expressions are also identical. The intended combination of that
    expression with `sum_p` must be confirmed against the reference before
    this is repaired.
    '''
    
    sum_p= 0
    
    # sum over i = 2 .. n-k+1 of C(n-i-1, k-2) / i
    for i in range(2,n - k + 2):
        lin= nCr(n-i-1, k-2) / i
        sum_p += lin
    
    # NOTE(review): value overwritten on the next line; `up` is undefined.
    tMRCA= 2 * k / (n - 1) - 2 / nCr(n-1,k)
    tMRCA= up * sum_p
    
    ###
    sum_p= 0
    
    # same sum with C(n-i-1, k-1) in the numerator
    for i in range(2,n - k + 2):
        lin= nCr(n-i-1, k-1) / i
        sum_p += lin
    
    # NOTE(review): value overwritten on the next line; `up` is undefined.
    tmut= 2 * k / (n - 1) - 2 / nCr(n-1,k)
    tmut= up * sum_p
    
    Exp= {
        'MRCA': tMRCA,
        'mut': tmut,
        'ANCb': -1
    }
    
    return Exp

###
### 

def TimeMut_simple(k,n):
    '''
    Closed-form expressions in the frequency X = k / n.

    Returns a dict with keys:
    - 'MRCA': 2X + 2X^2 / (1 - X)^2 * (log X + 1 - X)
    - 'mut':  -2X / (1 - X) * log X
    - 'ANCb': 2 / X

    Undefined at X == 1 (division by zero) and X == 0.
    '''
    freq= k / n

    mrca_time= 2 * freq + 2 * freq**2 / (1 - freq)**2 * (np.log(freq) + 1 - freq)
    mut_time= -(2 * freq)/ (1 - freq) * np.log(freq)
    anc_b= 2 / freq

    return {
        'MRCA': mrca_time,
        'mut': mut_time,
        'ANCb': anc_b
    }
    

#### Age of MRCA of 2 sequences.

GGVE pp. 91.

In [7]:

def tMRCA2s(t,k,theta):
    '''
    Density at time t of the MRCA age of two sequences that differ at k
    sites, for scaled mutation rate theta:

        (1 + theta)^(k+1) / k! * t^k * exp(-(1 + theta) t)
    '''
    rate= 1 + theta
    norm= (rate**(k + 1)) / factorial(k)

    return norm * t**k * np.exp(-rate * t)

def t2MRCA_mean(k,theta):
    '''
    Mean of the two-sequence MRCA age given k differences: (k + 1) / (theta + 1).
    '''
    return (k + 1) / (theta + 1)



In [8]:
from math import factorial
from scipy.stats import gamma


# Parameters for the two-sequence MRCA density plot.
mut_rate= 9.5e-9
Theta= 2.04
# presumably the population size implied by Theta = 4 * N * mut_rate; not used below
Nt= Theta / (mut_rate * 4)

# number of differences between the two sequences
k= 3

mean_t= t2MRCA_mean(k,Theta)


X_plot= list(np.linspace(0,4,50))
Tdens= [tMRCA2s(x,k,Theta) for x in X_plot]
# tMRCA2s is a Gamma density with shape k + 1 and RATE (1 + Theta).
# scipy's `scale` is the inverse of the rate, so pass 1 / (Theta + 1);
# passing Theta + 1 produced a curve that does not match Tdens.
gamma_compare= [gamma.pdf(x, a = k+1, scale = 1 / (Theta + 1)) for x in X_plot]


In [9]:

# Analytic density of the two-sequence MRCA age.
fig_MRCA= [
    go.Scatter(x= X_plot, y= Tdens, mode= 'markers', name= 'GGVE')
]

# scipy Gamma density for comparison; built here but not added to the figure.
fig_gamm= [
    go.Scatter(x= X_plot, y= gamma_compare, mode= 'markers', name= 'Gamma')
]

#fig_MRCA.extend(fig_gamm)

layout= go.Layout(
    title= 'density of MRCA T2 for S2= {}'.format(k),
    yaxis= dict(title= 'P'),
    xaxis= dict(title= 't, theta= {}'.format(Theta)),
    shapes= [
        # vertical line marking the mean MRCA age
        dict(
            type= 'line',
            x0= mean_t,
            y0= 0,
            x1= mean_t,
            y1= 1,
            line= dict(color= 'rgb(30,144,255)', width= 3),
        )
    ]
)

Fig= go.Figure(data= fig_MRCA, layout= layout)
iplot(Fig)

### Probability of n to k ancestors.

GGVE pp. 92


In [10]:
from scipy.special import comb

#### Walsh 2001
def proBnk(t,n,k):
    '''
    Probability that a sample of n lineages has exactly k ancestors at time t.

    Alternating sum over i = k..n of

        exp(-C(i,2) t) (2i - 1) (-1)^(i-k) k_(i-1) n_[i] / (k! (i-k)! n_(i))

    where n_[i] is the falling product n(n-1)...(n-i+1), and n_(i), k_(i-1)
    are rising products. Expects integers with 1 <= k <= n.
    '''
    def _product(values):
        # left-to-right product of a non-empty list
        return reduce((lambda a, b: a * b), values)

    total= 0

    for i in range(k, n + 1):
        falling_n= _product([n - step for step in range(i)])
        rising_n= _product([n + step for step in range(i)])

        if i == 1:
            rising_k= k
        else:
            rising_k= _product([k + step for step in range(i - 1)])

        numerator= (2 * i - 1) * (-1)**(i - k) * rising_k * falling_n
        denominator= factorial(k) * factorial(i - k) * rising_n

        term= numerator / denominator
        term= term * np.exp(-1 * nCk(i,2) * t)

        total += term

    return total


def probKGT(t,n,j,theta):
    '''
    Probability of j ancestral lines at time t for a sample of n, with
    scaled mutation rate theta.

    Alternating sum over k = j..n of

        exp(-k (k + theta - 1) t / 2) (-1)^(k-j) (2k + theta - 1)
            * (j + theta)_(k-1) * n_[k] / (j! (k-j)! (n + theta)_(k))

    where n_[k] is a falling product and a_(m) = a (a+1) ... (a+m-1) a rising
    product with a_(0) = 1. With theta = 0 this reduces to proBnk, and at
    t = 0 the result is 1 when j == n and 0 otherwise.

    Raises ValueError if j < 1 (the sum is only valid for 1 <= j <= n).
    '''
    if j < 1:
        raise ValueError('probKGT requires j >= 1, got {}'.format(j))

    def _product(values):
        # product with the empty product equal to 1
        return reduce((lambda x, y: x * y), values, 1)

    pi_sum= 0

    for k in range(j,n + 1):

        nbi= _product(n - x for x in range(k))
        npi= _product(n + theta + x for x in range(k))
        # (j + theta)_(k-1); equals 1 for k == 1. The original special-cased
        # k == 1 to j + theta, which is only right when theta == 0.
        kis= _product(j + theta + x for x in range(k - 1))

        pkt= np.exp((-1) * k * (k + theta - 1) * t / 2)

        up= (2 * k + theta - 1) * kis * nbi
        dn= factorial(j) * factorial(k - j) * npi

        # Multiply by the coefficient up / dn (the original divided by it).
        pi_sum += pkt * (-1)**(k - j) * up / dn

    return pi_sum




In [11]:
from numpy import prod

# One curve per ancestor count k = 1..n: P(k ancestors at time t | n at time 0).
n= 8
range_k= list(range(1, n + 1))
X_plot= list(np.linspace(0, 4.5, 100))

fig_k= []

for k in range_k:

    # with mutation: Tk= [probKGT(x,n,k,Theta) for x in X_plot]
    Tk= [proBnk(x, n, k) for x in X_plot]

    trace1= go.Scatter(x= X_plot, y= Tk, mode= 'lines', name= str(k))
    fig_k.append(trace1)


layout= go.Layout(
    title= 'Prob of k anc. in time for n= {}, k= {}'.format(n, ','.join(str(x) for x in range_k)),
    yaxis= dict(title= 'P'),
    xaxis= dict(title= 'time')
)

Fig= go.Figure(data= fig_k, layout= layout)
iplot(Fig)

### Probability of m ancestors at time t1, given n ancestors at time 0 and k ancestors at time t.

In [12]:
# Conditional distribution of the number of ancestors m at an intermediate
# time x, given n lineages at time 0 and k lineages at time tk:
#     P(n -> m in x) * P(m -> k in tk - x) / P(n -> k in tk)
n= 7
k= 4
tk= 1

# normalising constant P(n -> k in tk)
Ptk= proBnk(tk, n, k)

range_m= list(range(k, n + 1))
X_plot= list(np.linspace(0, tk, 100))

fig_k= []

for m in range_m:

    Tk= [proBnk(x, n, m) * proBnk(tk - x, m, k) / Ptk for x in X_plot]

    trace1= go.Scatter(x= X_plot, y= Tk, mode= 'lines', name= str(m))
    fig_k.append(trace1)


layout= go.Layout(
    title= 'Prob of m anc. in time for n= {}, k= {} at times 0 and {}'.format(n, k, tk),
    yaxis= dict(title= 'P'),
    xaxis= dict(title= 'time')
)

Fig= go.Figure(data= fig_k, layout= layout)
iplot(Fig)