In [5]:
import numpy as np
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

from datetime import datetime

import collections
def recursively_default_dict():
    """Build an arbitrarily nested defaultdict.

    Every missing key yields another dictionary produced by this same factory,
    so chained look-ups such as ``d['a']['b']['c']`` never raise ``KeyError``.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

## The coalescent and sampling - Attempt II.

In this notebook we study the evolution of allele frequencies in a given population in time. 

**Objective**
Estimate expected number of segregating private alleles given sample size for a given population. 

**Available**
- MRCA for our populations of interest, i.e. external branch length. 
- Effective population size. 
- Mutation rate
- Number of positions considered.

**Methods**
i) Using coalescent theory we estimate the expected number of novel mutations along the branch of interest. 

ii) We use a Markov Chain to estimate the present SFS of mutations given: number of generations, Ne, initial frequency (1 / 2N).

iii) We split the total number of mutations along the branch of interest into bins; we estimate the present SFS of each time bin, and the proportion of mutations that are fixed in the meantime. 

iv) Using bin specific SFS, bin specific number of segregating alleles, we estimate the expected number of segregating private alleles given number of haplotypes sampled.

**Caveat** 
Unlinked markers. Results can be considered as having selected a number `seqL` of positions randomly along the genome.


### Requirements

    - effective population size per generation;
    - number of generations;
    - scaled mutation rate.
    
   
### Index

#### I. Model assumptions.

#### II. Total number of alleles expected.

#### III. SFS expectations.

#### IV. Fixed mutations.

#### V. Sampling haplotypes.


### I. Model assumptions.

**Demographics**

We will deal with terminal branches only. 

We're going to go with populations of species X, for which models have been proposed.
See the model dictionary below. The per-population dictionaries contain the model parameters of each population-specific branch:

- `N`: Population effective size;
- `T`: Branch length


In [242]:
# Demographic parameters of each terminal branch, keyed by population name.
#   N: effective population size
#   T: branch length
model_dict = {
    'test': dict(N=1000, T=5000),
    'schweinfurthii': dict(N=10528, T=17925),
    'troglodytes': dict(N=72001, T=17925),
    'ellioti': dict(N=6033, T=33402),
    'verus': dict(N=5710, T=33402),
}

**mutations** 

We're going to refer here to coalescent theory. Our objective will not concern statistics on observed data, but simulations. For this we will rely on the notebook [Simulations](https://nbviewer.jupyter.org/github/SantosJGND/Coalescent/blob/master/Simulations.ipynb), section **Algorithms for simulating sequence evolution**, from this [repository](https://github.com/SantosJGND/Coalescent) focused on coalescent explorations in Python.


To keep it simple we will consider in this notebook a single simulation. 

Given :
- time;
- effective population size;
- scaled mutation rate;

we draw the number of expected mutations emerging in our branch of interest until the present. 

**select branch**

In [320]:
# Key of `model_dict` naming the branch whose `N` and `T` parameters are used
# by the cells below.
pop_select= 'test'

We begin with a simple example: the number of private mutations expected in a sample of size `k` of haplotypes of size `seqL`, given an estimate of the mutation rate per generation per base pair, `muNuc`.

From this input we calculate the parameters `mu`, the expected mutation rate per generation over the `seqL` positions, and `Theta` = 4 · `Ne` · `mu`, the population-scaled mutation rate (the quantity that Watterson's estimator estimates from data).


## II. Total number of mutations expected.

Exponential of rate `Theta / 2`

In [321]:
# Mutation rate per base pair per generation, and number of positions considered.
muNuc = 1.08e-8
seqL = 1e6

# Number of samples.
k = 100

# Effective population size of the selected branch.
Ne = model_dict[pop_select]["N"]

# Mutation rate per generation summed over all `seqL` positions.
mu = muNuc * seqL

# Population-scaled mutation rate, and the derived rate `Theta / 2`.
Theta = 4 * Ne * mu
rate_Pmut = Theta / 2

print('Ne: {}'.format(Ne))
print(
    'Theta: {}\nTotal mut rate: {}'.format(
        round(Theta, 3),
        round(rate_Pmut, 3),
    )
)

Ne: 1000
Theta: 43.2
Total mut rate: 21.6


Given the length of the branch

In [322]:
# Length `T` of the selected branch, read from the model dictionary.
branch_len = model_dict[pop_select]['T']

# The same length rescaled by twice the effective population size.
MRCA = branch_len / (2 * Ne)

print("branch length: {}".format(MRCA))

branch length: 2.5


We estimate the number of mutations expected:

In [323]:
# Poisson mean for the number of new mutations along the branch: the per-lineage
# rate `MRCA * Theta / 2` scaled up by `2 * Ne`.
Pexp= 2 * Ne * MRCA * Theta / 2 ## Multiply the standard Poisson rate by 2Ne to cover the whole population.

# Draw one realisation of the mutation count. No random seed is set, so the
# printed value changes between runs.
muts= np.random.poisson(Pexp,1)[0]
print('Nmuts: {}'.format(muts))

Nmuts: 108357


## III. SFS expectations

We develop functions to extract the probability f(x) of a mutation occurring at time t in the past being found at frequency `x` today.



In [429]:
from tools.sfs_utilities import(
    single_gen_matrix, freq_progression
    )

ModuleNotFoundError: No module named 'tools.sfs_utilities'

Because I don't know how to work with continuous-time Markov models, we will work with discrete generations.

With `t < MRCA`, we will imagine that mutations fall uniformly along this branch: the branch is split into `nbins - 1` equal-width time bins, and each bin is represented by its midpoint.


In [401]:
# `nbins` edges evenly spaced over [0, MRCA], rounded to 4 decimals.
nbins = 21
bins = np.round(np.linspace(0, MRCA, nbins), 4)

# Consecutive edges paired into (start, end) intervals: nbins - 1 of them.
bins = list(zip(bins[:-1], bins[1:]))

# Midpoint of every interval.
t_list = [(start + end) / 2 for start, end in bins]


In [402]:
# Convert every bin midpoint from the rescaled `MRCA` units back to generations;
# the integer dtype truncates any fractional part.
gens = np.array(
    [midpoint * branch_len / MRCA for midpoint in t_list],
    dtype=int,
)

print('mutation bin gens :')
print(gens)


mutation bin gens :
[ 125  375  625  875 1125 1375 1625 1875 2125 2375 2625 2875 3125 3375
 3625 3875 4125 4375 4625 4875]


We extract `f(x)` for x within ]0,1[

In [403]:
# Settings handed to the `tools.sfs_utilities` helpers.
precision = 2 * Ne
sel_coeff = 0
remove_tails = True
multiplier = precision / Ne  # not used by the active code of this cell

# Starting state passed to `freq_progression` as its first argument.
freqi = 1
print(freqi)

# Single-generation matrix for the chosen population size and selection coefficient.
freq_matrix = single_gen_matrix(Ne=Ne, precision=precision, s=sel_coeff)

# Run the progression once per bin age and split each result into its two parts.
# NOTE(review): element 0 is used downstream as the frequency distribution and
# element 1 as a per-step tally -- confirm against `tools.sfs_utilities`.
freq_prog = []
fixed_tally = []
for n_generations in gens:
    progression = freq_progression(
        freqi,
        n_gens=n_generations,
        freq_matrix=freq_matrix,
        remove_tails=remove_tails,
    )
    freq_prog.append(progression[0])
    fixed_tally.append(progression[1])

print(freq_matrix.shape)

1
(2000, 2000)


This gives us an approximation of the expected SFS of still-segregating mutations that occurred at different points in the past. We can consider these together, since in real data we might not know which are which:



In [404]:
# Choice of x axis: 'freq' for the raw [0, 1] grid, 'N' for that grid scaled by Ne.
surface = 'N'
surf = np.linspace(0, 1, freq_matrix.shape[0])
surf_dict = {
    'freq': surf,
    'N': [int(grid_point * Ne) for grid_point in surf],
}

# One trace per bin age, labelled with the bin's age in generations.
fig = [
    go.Scatter(
        x=surf_dict[surface],
        y=bin_prog[0],
        name='t= ' + str(bin_gen),
    )
    for bin_gen, bin_prog in zip(gens, freq_prog)
]

layout = go.Layout()

figure = go.Figure(data=fig, layout=layout)
iplot(figure)

**SFS of segregating alleles starting at different generations in the past.**

If we want to consider segregating mutations with different ages, we extract a weighted average of the data above. Weights are calculated using the relative numbers of alleles with different ages still segregating today.

In [426]:
# Weight of each age bin = its share of the mutations still segregating today.
#
# This cell used to read `nseg_tal`, which is only created in a later cell, so a
# fresh top-to-bottom run failed with a NameError. Every bin is given the same
# number of new mutations, so the relative number of surviving alleles equals the
# relative surviving proportion. That proportion is the running product of
# (1 - tally) over the per-step tallies in `fixed_tally` (the same product
# `get_fixedtally` returns as its first value), so it is computed here directly.
# The weights are no longer affected by truncating allele counts to integers.
seg_props= np.array([
    np.prod(1 - np.asarray(tally, dtype= float), axis= 0) for tally in fixed_tally
])
stand_num= seg_props / np.sum(seg_props)

# Stack the per-bin frequency distributions (one row per bin), weight the rows,
# collapse them into a single spectrum and renormalise it to sum to 1.
seg_total_matrix= np.array([x[0] for x in freq_prog])
seg_total_matrix= stand_num.reshape(-1,1) * seg_total_matrix
seg_total_matrix= np.sum(seg_total_matrix,axis= 0)
seg_total_matrix= seg_total_matrix / np.sum(seg_total_matrix)
seg_total_matrix.shape

(2000,)

In [427]:
# Single trace: the age-weighted spectrum over an evenly spaced [0, 1] grid.
fig = [
    go.Scatter(
        x=np.linspace(0, 1, freq_matrix.shape[0]),
        y=seg_total_matrix,
    )
]

# Pad the x axis slightly beyond [0, 1] so the end points stay visible.
layout = go.Layout(xaxis={'range': [-0.05, 1.05]})

figure = go.Figure(data=fig, layout=layout)
iplot(figure)

**SFS of segregating private mutations** 


## IV. Fixed mutations.



In [409]:
def get_fixedtally(tally_array, total_prop= 1):
    '''
    Accumulate a sequence of per-step proportions against a shrinking remainder.

    At every step the current entry takes its share of what is still left:
    `entry * remaining` is added to a running total, and the remainder is
    multiplied by `(1 - entry)`.

    Parameters
    ----------
    tally_array : sequence of numbers
        Per-step proportions, processed in order.
    total_prop : number, optional
        Proportion remaining before the first step (default 1).

    Returns
    -------
    tuple
        `(remaining, removed)`: the proportion left after the last step and the
        sum of the shares taken at every step.
    '''
    removed = 0
    for step_share in tally_array:
        removed = removed + step_share * total_prop
        total_prop = total_prop * (1 - step_share)

    return total_prop, removed

# One (remaining, removed) pair per age bin.
tallies = list(map(get_fixedtally, fixed_tally))


In [410]:

# NOTE: this draws a fresh Poisson sample and replaces the `muts` value that was
# drawn and printed in the earlier cell.
muts = np.random.poisson(Pexp, 1)[0]

# Mutations are split evenly across the nbins - 1 age bins (integer part only).
mut_bin = int(muts / (nbins - 1))

# Per bin: remaining proportion (first element of each tally pair) times the
# number of mutations in the bin, truncated to an integer count.
nseg_tal = np.array(
    [tally_pair[0] * mut_bin for tally_pair in tallies],
    dtype=int,
)

for bin_gen, bin_count in zip(gens, nseg_tal):
    print('t: {}, N: {}'.format(bin_gen, bin_count))



t: 125, N: 85
t: 375, N: 29
t: 625, N: 18
t: 875, N: 13
t: 1125, N: 10
t: 1375, N: 8
t: 1625, N: 7
t: 1875, N: 6
t: 2125, N: 5
t: 2375, N: 5
t: 2625, N: 4
t: 2875, N: 4
t: 3125, N: 4
t: 3375, N: 3
t: 3625, N: 3
t: 3875, N: 3
t: 4125, N: 2
t: 4375, N: 2
t: 4625, N: 2
t: 4875, N: 2


## V. Sampling haplotypes

We now have the expected number of segregating mutations since the MRCA for this population.

We can estimate the number of segregating alleles obtained by sampling from this distribution. 

We will make the assumption that frequencies are uncorrelated.

In [411]:
# Frequency grid matching the entries of each per-bin frequency distribution.
freq_surface= np.linspace(0,1,freq_matrix.shape[0])

## sampling parameters
Nreps= 50              # replicate simulations per sample size
sample_range= [2,1000] # smallest and largest number of haplotypes sampled
Nsteps= 50             # number of sample sizes evaluated within that range
range_use= np.linspace(*sample_range,Nsteps,dtype= int)

# sample size -> list (one entry per replicate) of mutations observed at least once
ksamp_dict= {
    x: [] for x in range_use
}


for si in range_use:

    for proxy in range(Nreps):

        count_bin= 0
        for idx in range(nbins-1):
            n_seg= nseg_tal[idx]
            if n_seg == 0:
                # Nothing segregating in this bin. The previous implementation
                # raised a TypeError here, because summing an empty draw gives
                # a scalar that does not support item assignment.
                continue

            # Present-day frequency of every segregating mutation of this bin.
            probs= np.random.choice(freq_surface,n_seg, p= freq_prog[idx][0])

            # Copies of each mutation among `si` independently sampled haplotypes.
            # One binomial draw per mutation has the same distribution as the sum
            # of `si` separate 0/1 draws and avoids a Python-level call per mutation.
            copies= np.random.binomial(si, probs)

            # A mutation counts once if it shows up in at least one haplotype.
            count_bin += np.count_nonzero(copies)

        ksamp_dict[si].append(count_bin)

        

In [428]:
# Mean and standard deviation of the replicate counts for every sample size.
stats_dict = {
    samp_size: dict(mean=np.mean(rep_counts), std=np.std(rep_counts))
    for samp_size, rep_counts in ksamp_dict.items()
}

# Sample sizes in increasing order, used as the x axis.
samp_order = sorted(ksamp_dict.keys())

# Mean count per sample size, with the standard deviation as error bars.
fig = [
    go.Scatter(
        x=samp_order,
        y=[stats_dict[samp_size]['mean'] for samp_size in samp_order],
        error_y={
            'array': [stats_dict[samp_size]['std'] for samp_size in samp_order],
            'type': 'data',
            'visible': True,
        },
    )
]

layout = go.Layout(
    title=pop_select,
    xaxis={'title': 'Nsamp'},
    yaxis={'title': 'private alleles'},
    width=800,
    height=800,
)

# NOTE: the name `Figure` is also brought in by `from plotly.graph_objs import *`;
# this assignment rebinds it to the figure instance.
Figure = go.Figure(data=fig, layout=layout)
iplot(Figure)