In [1]:
import numpy as np
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime

import collections
def recursively_default_dict():
    """Return a defaultdict whose default factory is this same function.

    Looking up a missing key therefore creates another defaultdict of the
    same kind, so nested keys can be assigned without first creating the
    intermediate levels.
    """
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

## Private allele SFS.


Needed: 
    - effective population size per generation;
    - number of generations;
    - scaled mutation rate.

### Model

We're going to go with chimp populations, for which models have been proposed.
See the model dictionary below. The `pop` dictionaries contain the model parameters of the population-specific branches:

- `N`: Population effective size;
- `T`: Branch length


In [2]:
# Demographic model, one entry per population-specific branch:
#   'N' - effective population size
#   'T' - branch length
model_dict = {
    pop_name: {'N': eff_size, 'T': branch_length}
    for pop_name, eff_size, branch_length in (
        ('schweinfurthii', 10528, 17925),
        ('troglodytes', 72001, 17925),
        ('ellioti', 6033, 33402),
        ('verus', 5710, 33402),
    )
}

### Mutations 

We're going to refer here to coalescent theory. Our objective does not concern statistics on observed data, but simulations. For this we will rely on the notebook [Simulations](https://nbviewer.jupyter.org/github/SantosJGND/Coalescent/blob/master/Simulations.ipynb), section **Algorithms for simulating sequence evolution**, from this [repository](https://github.com/SantosJGND/Coalescent) focused on coalescent explorations in python.


To keep it simple we will consider in this notebook a single simulation. 

Given :
- time;
- effective population size;
- scaled mutation rate;

we draw the number of expected mutations emerging in our branch of interest until the present. 

## Attempt I. Coalescent given number of samples.

We use a procedure of coalescent simulation given the number of samples. We extract per simulation the number of mutations simulated that are more recent than the MRCA of this branch. See the notebook above for a visual inspection of the output of the simulation procedure. We will use the function SimIII from that notebook, modified here for our purposes. 

We first choose a model from the dictionary above. 


In [3]:
# Key of the `model_dict` entry (population branch) used for the single-population example.
pop_select = 'schweinfurthii'

We begin with a simple example: the number of private mutations expected in a sample of size `k` of haplotypes of length `seqL`, given an estimate of the mutation rate per generation per base pair, `muNuc`.

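From this input we calculate the parameters `mu`, the expected mutation rate per generation over the whole sequence (`muNuc * seqL`), and `Theta`, the population-scaled mutation rate $\theta = 4 N_e \mu$ (the quantity that Watterson's estimator of population genetic diversity estimates). 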

In [4]:
# --- sequence / mutation parameters ---
seqL = 1e6           # haplotype length
muNuc = 1.08e-8      # mutation rate per generation per base pair
mu = muNuc * seqL    # mutation rate per generation over the whole sequence

# --- population / sampling parameters ---
k = 100              # number of samples
Ne = model_dict[pop_select]["N"]   # effective size of the selected branch

# Population-scaled mutation rate passed to the coalescent simulation.
Theta = 4 * Ne * mu


We now perform coalescent simulations. This returns a network containing coalescent and mutation events from present haplotypes to their MRCA (function `SimIII`). Each event has an associated time stamp. We use the function `get_PA` to extract the time stamps of mutation events more recent than the branch of interest. 

In [7]:
from tools.coal_sims import SimIII, get_PA

# Time threshold handed to get_PA: the branch length of the selected
# population divided by Ne and then by 2.
# NOTE(review): presumably this converts the branch length into the time
# scale used by SimIII - confirm against tools.coal_sims.
branch_len = model_dict[pop_select]['T']
MRCA = branch_len / Ne / 2

# One coalescent simulation for k samples; only `leaves` is used below.
sim_keys, leaves, edges = SimIII(k=k, Theta=Theta)

# Private-allele events returned by get_PA for this threshold.
PA_obtain = get_PA(leaves, Ne, MRCA)
print('number of private alleles obtained: {}'.format(len(PA_obtain)))

number of private alleles obtained: 1701


#### get results across sample sizes

We now move to obtain results for a range of sample sizes. 

A number of repeats is performed per sample size, since the simulations are probabilistic.

In [12]:
# Sweep settings: Nsteps sample sizes evenly spaced over sample_range,
# each simulated Nreps times.
Nreps = 50
sample_range = [2, 500]
Nsteps = 20

range_use = np.linspace(sample_range[0], sample_range[1], num=Nsteps)

# Sample size (truncated to int) -> private-allele counts, one per repeat.
Nstudy_dict = {int(size): [] for size in range_use}

for k in (int(size) for size in range_use):
    for rep in range(Nreps):
        sim_keys, leaves, edges = SimIII(k=k, Theta=Theta)
        PA_obtain = get_PA(leaves, Ne, MRCA)
        Nstudy_dict[k].append(len(PA_obtain))
        

The number of private alleles sampled is plotted against sample size. 

In [13]:
# Sample sizes in increasing order; used as the x-axis.
samp_order = sorted(Nstudy_dict)

# Mean and standard deviation of the private-allele counts per sample size.
stats_dict = {
    size: dict(mean=np.mean(counts), std=np.std(counts))
    for size, counts in Nstudy_dict.items()
}

mean_by_size = [stats_dict[size]['mean'] for size in samp_order]
std_by_size = [stats_dict[size]['std'] for size in samp_order]

# Single trace: mean count with the standard deviation as error bars.
trace = go.Scatter(
    x=samp_order,
    y=mean_by_size,
    error_y=dict(type='data', array=std_by_size, visible=True),
)
fig = [trace]

layout = go.Layout(
    title=pop_select,
    xaxis=dict(title='Nsamp'),
    yaxis=dict(title='private alleles'),
)

Figure = go.Figure(data=fig, layout=layout)
iplot(Figure)

### Across populations:

We now scale this approach to cover all branches in `model_dict`. 

**common parameters**

In [15]:
### biological parameters
muNuc= 1.08e-8
seqL= 1e6
mu= muNuc * seqL

## sampling parameters
Nreps= 10
sample_range= [2,100]
Nsteps= 20
range_use= np.linspace(*sample_range,Nsteps)

**deployment**

In [16]:
# Population name -> {sample size -> {'mean', 'std'}} of private-allele counts.
pops_dict = {}

for pop_select, branch_params in model_dict.items():
    # Branch-specific parameters.
    Ne = branch_params["N"]
    branch_len = branch_params['T']
    Theta = 4 * Ne * mu
    MRCA = branch_len / Ne / 2

    # Sample size (truncated to int) -> private-allele counts, one per repeat.
    Nstudy_dict = {int(size): [] for size in range_use}

    for k in (int(size) for size in range_use):
        for rep in range(Nreps):
            sim_keys, leaves, edges = SimIII(k=k, Theta=Theta)
            PA_obtain = get_PA(leaves, Ne, MRCA)
            Nstudy_dict[k].append(len(PA_obtain))

    # Summarise the repeats of every sample size.
    stats_dict = {
        size: dict(mean=np.mean(counts), std=np.std(counts))
        for size, counts in Nstudy_dict.items()
    }
    samp_order = sorted(Nstudy_dict)

    pops_dict[pop_select] = stats_dict


**plot**

In [21]:
# One trace per simulated population: mean number of private alleles against
# sample size, with the standard deviation across repeats as error bars.
#
# The sample sizes are read from each population's own entry in `pops_dict`
# rather than from a `samp_order` variable left over from another cell. That
# variable is also assigned by the single-population analysis with a different
# sample range, so a stale value would raise a KeyError or mislabel the x-axis.
fig = []
for pop, pop_stats in pops_dict.items():
    sizes = sorted(pop_stats)
    fig.append(go.Scatter(
        x=sizes,
        y=[pop_stats[size]['mean'] for size in sizes],
        error_y=dict(
            array=[pop_stats[size]['std'] for size in sizes],
            type='data',
            visible=True
        ),
        name=pop
    ))

layout = go.Layout(
    title='pop comparison',
    xaxis=dict(
        title='Nsamp'
    ),
    yaxis=dict(
        title='# of private alleles'
    ),
    height=950,
    width=950
)

Figure = go.Figure(data=fig, layout=layout)
iplot(Figure)