In [1]:
import numpy as np
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

from datetime import datetime

import collections
def recursively_default_dict():
    """Build a defaultdict whose missing keys are filled with further dicts of the same kind.

    Because the default factory is this very function, nested keys can be
    assigned without creating the intermediate levels first.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

## Branching

In this notebook we expand on the use of SFS for population genetic simulations. 

From notebooks I. and II. of this repository we will import functions that rely on Markov Chain models of allele frequency evolution. So far we have seen how these work for a single population. In notebook II. we developed functions that allow the effective population size to change over time. 

In this notebook we will connect single population simulations to work on branches along a given evolutionary tree. 

### I. The model

The model is read from a demos file. This format holds the times of each split and the effective population sizes. 

In [2]:
from tools.ABC_utilities import (
    demos_to_SLiM
)

from tools.branch_utilities import (
    demo_file_branchProcess
)

# Recipe-related labels and paths (not referenced by the cells visible in this notebook).
batch = 'pm13_4a_1m_median'
recipe_dir = 'Recipes/demos_mat/'
template = 'Recipes/demos_mat/template_simple.slim'

# Demos file describing the model; parsed at the bottom of this cell.
demo_file = 'demos/test_small.txt'

# Key used later to look up the ancestral population in demo_data['N'].
anc_r = '0'

# Sampling options; med_samp and rescale_dict are handed to return_replica further down.
Nsamp = 1
sizes = 1000
burnin = 5e4  # re-assigned in the simulation cell
med_samp = True
rescale_dict = {}

directed = False
M_convert = True

# Parse the demos file into the tree and demography objects used by every later cell.
tree, demo_data, tree_summ, tree_demo = demo_file_branchProcess(demo_file)

### II. Simulations. 

In [3]:

from scipy.stats import binom

from tools.sfs_utilities import (
    single_gen_matrix_v2, freq_progr_func, get_fixedtally_v2
)

from tools.ne_util import (
    theta_constant, theta_exp
)

from tools.branch_utilities import (
    node_assign, get_edge_dict,
    branch_progress, traverse_sfs
)

from tools.ABC_utilities import (
    sample_dist_beta, return_replica
)


In [23]:

# --- simulation settings -------------------------------------------------
sample_func = sample_dist_beta

scale_sim = False
burnin = 20

seqL = 1e6
muG = 1.08e-9

s = 0  # selection coefficient.
ploidy = 2  # ploidy.
Nsamp = 1

# --- one replica per entry of tree_demo, then attach them to the tree nodes ---
replic = [
    return_replica(
        branch_demo,
        sample_func=sample_func,
        func=int,
        rescale_dict=rescale_dict,
        med_samp=med_samp,
    )
    for branch_demo in tree_demo
]

node_dict = node_assign(replic, tree_summ)

# theta model handed to traverse_sfs: the function plus its (empty) keyword arguments.
theta_dict = {'func': theta_constant, 'kargs': {}}

anc_name = '0'

# --- ancestral population size, drawn with the same sampler ---
anc_size = demo_data['N'][anc_r]
anc_sample = sample_func(
    1,
    *anc_size,
    func=int,
    func_args=[],
    med_samp=med_samp,
)
asize = int(anc_sample[0])

# --- walk the tree starting at node '0' ---
sim_sfs = traverse_sfs(
    node_dict,
    tree_summ,
    theta_dict,
    node='0',
    fr=1,
    Ne=asize,
    Ne0=asize,
    T=1,
    muG=muG,
    ploidy=ploidy,
    s=s,
    seqL=seqL,
    scale_sim=scale_sim,
    sample_func=sample_func,
    med_samp=med_samp,
)



invalid value encountered in long_scalars


invalid value encountered in true_divide



### III. Generating allele frequencies.

The above simulations provide us with the probability of segregation of an allele introduced at any generation in the simulated past, together with the SFS of mutations segregating today, by population. Generating allele frequency profiles is not as straightforward as combining these two pieces of data. We must also consider the probability that segregating alleles are shared between populations. 

This happens when an allele is introduced to the simulation in an ancestral branch. 


In [24]:
### get frequencies
from tools.branch_utilities import (
    get_probs, merge_branch_info, pop_frequencies, hap_sample
)

# Names of the leaf populations, as listed in the tree summary.
leaves = tree_summ['leaves']

# Per-node statistics and per-leaf tracks extracted from the simulated SFS.
node_stats, leaf_tracks = get_probs(sim_sfs, tree_summ)

# Merge the per-branch information into arrays shared by all leaves.
gen_track, pseg_array, pick_array = merge_branch_info(
    node_stats,
    leaf_tracks,
    tree_summ,
)

freq_array = pop_frequencies(
    gen_track,
    pseg_array,
    pick_array,
    node_stats,
    leaves,
)

In [28]:
# Inspect the array dimensions: the plot below draws one line per column (named after a leaf) against the row index.
pseg_array.shape

(801, 4)

In [33]:
# Scratch check: 1-based indices of the columns of pseg_array.
list(range(1,pseg_array.shape[1]+1))

[1, 2, 3, 4]

In [36]:

# One line per leaf: column `leaf_idx` of pseg_array is drawn against a time
# axis that counts down from the number of rows to 1.
fig_data = [
    go.Scatter(
        x=list(range(pseg_array.shape[0], 0, -1)),
        y=pseg_array[:, leaf_idx],
        mode="lines",
        name=leaves[leaf_idx],
    )
    for leaf_idx in range(pseg_array.shape[1])
]

layout = go.Layout(
    yaxis={'title': 'p'},
    xaxis={'title': 'time'},
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)


### IV. Sample and plot

In [19]:

# Per-population sample sizes; sum(Sizes) is used by the PCA plot to enumerate the rows of `data`.
Sizes = [50] * 4
N_pops = len(Sizes)

# NOTE(review): hap_sample receives the literal 50 rather than `Sizes`; keep the two in sync.
data, pop_names, labels = hap_sample(freq_array, Sizes=50)
print(data.shape)


(200, 3112)


In [10]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n_comp = 100

# The randomized SVD solver draws random numbers; pinning random_state makes
# the projection identical on every re-run of the cell for the same `data`.
pca = PCA(
    n_components=n_comp,
    whiten=False,
    svd_solver='randomized',
    random_state=0,
)
features = pca.fit_transform(data)

# Fraction of the variance explained by each component (used for the axis titles of the plot).
var_comps = pca.explained_variance_ratio_
#print("; ".join(['PC{0}: {1}'.format(x+1,round(var_comps[x],3)) for x in range(n_comp)]))
print(features.shape)

(200, 100)


In [21]:
## let's visualize the result now:
colors_pres = ['red', 'black', 'yellow', 'blue']

# One marker trace per population. The row indices of each population are
# computed once and reused for both axes (PC1 on x, PC2 on y).
fig_data = [
    go.Scatter(
        x=features[pop_rows, 0],
        y=features[pop_rows, 1],
        mode="markers",
        marker=dict(
            color=colors_pres[pop_idx],
            line=dict(width=0),
            size=8,
            symbol='circle',
            opacity=.8,
        ),
        name=pop_names[pop_idx],
    )
    for pop_idx, pop_rows in (
        (p, [row for row in range(sum(Sizes)) if labels[row] == pop_names[p]])
        for p in range(N_pops)
    )
]

# Axis titles carry the explained-variance ratio of the component they show.
layout = go.Layout(
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
    yaxis={'title': 'PC2: {}'.format(round(var_comps[1], 3))},
    xaxis={'title': 'PC1: {}'.format(round(var_comps[0], 3))},
)

fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)


## Putting it all together



In [None]:

def deploy_sim(demo_file,
                anc_r= '0',
                Nsamp= 1,
                sizes= 1000,
                med_samp= True,
                rescale_dict= {},
                directed= False,
                M_convert= True,
                sample_func= sample_dist_beta,
                scale_sim= False,
                burnin= 20,
                seqL= 1e6,
                muG= 1.08e-9,
                s= 0,
                ploidy= 2,
                Sizes= 50,
                ):
    '''
    Run the full pipeline of this notebook for one demos file and return sampled haplotypes.

    Steps (each one delegates to a helper imported from `tools`):
      1. demo_file_branchProcess(demo_file) -> tree, demo_data, tree_summ, tree_demo.
      2. return_replica is applied to every entry of tree_demo; node_assign
         attaches the results to the tree.
      3. The ancestral size is drawn with sample_func from demo_data['N'][anc_r].
      4. traverse_sfs is run from node '0' with theta_constant as theta function.
      5. get_probs, merge_branch_info and pop_frequencies build freq_array.
      6. hap_sample draws the haplotypes that are returned.

    Parameters
    ----------
    demo_file : passed to demo_file_branchProcess.
    anc_r : key of the ancestral population in demo_data['N'].
    med_samp : forwarded to return_replica, sample_func and traverse_sfs.
    rescale_dict : forwarded to return_replica.
        NOTE(review): the `{}` default is a single dict shared by every call;
        whether return_replica mutates it is not visible here.
    sample_func : sampler used by return_replica, for the ancestral size and by traverse_sfs.
    scale_sim, seqL, muG, s, ploidy : forwarded unchanged to traverse_sfs.
    Sizes : forwarded to hap_sample as its `Sizes` argument.
    Nsamp, sizes, directed, M_convert, burnin : accepted but not used in the body.

    Returns
    -------
    (data, pop_names, labels), exactly as returned by hap_sample.
    '''
    tree, demo_data, tree_summ, tree_demo = demo_file_branchProcess(demo_file)

    replic = [
        return_replica(branch_demo, sample_func=sample_func, func=int,
                       rescale_dict=rescale_dict, med_samp=med_samp)
        for branch_demo in tree_demo
    ]

    node_dict = node_assign(replic, tree_summ)

    theta_dict = {
        'func': theta_constant,
        'kargs': {}
    }

    # Ancestral population size, drawn with the same sampler as the branches.
    anc_size = demo_data['N'][anc_r]
    anc_sample = sample_func(1, *anc_size, func=int, func_args=[], med_samp=med_samp)
    asize = int(anc_sample[0])

    # T=1 matches the step-by-step simulation cell of this notebook; the
    # previous value here was the string '0'.
    sim_sfs = traverse_sfs(node_dict, tree_summ, theta_dict, node='0', fr=1, Ne=asize, Ne0=asize,
                           T=1, muG=muG, ploidy=ploidy, s=s, seqL=seqL,
                           scale_sim=scale_sim, sample_func=sample_func,
                           med_samp=med_samp)
    node_stats, leaf_tracks = get_probs(sim_sfs, tree_summ)
    gen_track, pseg_array, pick_array = merge_branch_info(node_stats, leaf_tracks, tree_summ)

    leaves = tree_summ['leaves']

    freq_array = pop_frequencies(gen_track, pseg_array, pick_array, node_stats, leaves)

    # Honour the caller's Sizes instead of a hard-coded 50. The former
    # `len(Sizes)` call was dropped: its result was unused and it raised
    # TypeError for the integer default.
    data, pop_names, labels = hap_sample(freq_array, Sizes=Sizes)

    return data, pop_names, labels




