In [57]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing keys are filled with further nested defaultdicts.

    The function passes itself as the default factory, so indexing to any
    depth creates the intermediate levels on demand.
    """
    nested = collections.defaultdict(recursively_default_dict)
    return nested



## Time estimation

One of the by-products of the coalescent is the possibility to derive estimates of expected time for events, haplotypes and parameters.

### A. Get Haplotype Time. 

Get approximated time in generations until an ancestor is encountered. 

Assuming the exponential behavior of the probabilities of mutation and coalescence, with parameters `theta / 2` and `kC2 / 2N` each, then the average time in generations for a single event to occur is `1 / p`.

Probability is locally adjusted for number of genes for coalescence. mu (probability of mutation) must be provided, the effective population size used will be inferred from Theta as `Ne= Theta / (4 * mu)`. We will infer theta from the previous section. 

In [2]:
from structure_tools.Coal_index import get_config
### Generate data from config

# Haplotype matrix; nsamp below is taken as the number of rows.
# This is the example from figure 2.10.
# (A smaller toy matrix, [[1,0,0,0], [0,0,1,0], [0,0,1,0]], used to be assigned
#  first and was immediately overwritten; it is kept here only as a reference.)
dataT= np.array([
    [1,1,0,0],
    [1,1,0,1],
    [0,0,0,0],
    [0,0,1,0],
    [0,0,1,0]
])

nsamp= dataT.shape[0]

config_dataw, hap_str= get_config(dataT,nsamp)

# Haplotype keys of hap_str, and the same haplotypes as integer row vectors.
hap_sol= list(hap_str.keys())
hap_sun= np.array([np.array(list(x),dtype= int) for x in hap_sol])

# Number of entries stored under each haplotype key, in hap_sol order.
hap_counts= [len(hap_str[x]) for x in hap_sol]

# count -> indices (into hap_sol) of the haplotypes having that count.
hap_size= {z: [i for i, n in enumerate(hap_counts) if n == z] for z in set(hap_counts)}

# One [haplotype index, count] row per haplotype, ordered by index. This is
# what the former group -> flatten -> sort sequence produced.
pack= np.array([[i, n] for i, n in enumerate(hap_counts)])

# Layer 0 of the configuration dictionary passed to Inf_sites.
Dict_mat= {0: 
           {
               -2: hap_sun,
               -1: [0] * hap_sun.shape[0],
               0: pack
              }
          }

point_up= recursively_default_dict()


### Indexing layers

In [3]:
from structure_tools.Coal_index import Inf_sites

# Index the layers starting from the observed configuration in Dict_mat.
# The call prints one "layer: i; len: n" line per layer (see output below).
root_lib, point_up = Inf_sites(
    Dict_mat,
    point_up,
    layer_range= 10,
    sub_sample= 0,
    poppit= False
)


layer: 0; len: 2
layer: 1; len: 2
layer: 2; len: 3
layer: 3; len: 5
layer: 4; len: 5
layer: 5; len: 5
layer: 6; len: 4
layer: 7; len: 1
layer: 8; len: 1
time elapsed: 0.112 s


#### Theta estimation - constant

see notebook [InfSites](https://nbviewer.jupyter.org/github/SantosJGND/Coalescent/blob/master/Model_proba.ipynb)

In [6]:
from structure_tools.Coal_probab import Ascent_return, tree_ascent
from structure_tools.Coalesce_plots import plot_rec_InfSites

# Grid of theta values over which the function(s) below are evaluated.
range_theta= np.linspace(0.01,10,50)

# Function(s) handed to the plotting routine, and the label used for each.
# Alternatives noted by the author: Descent_return, runUp_balance, tree_construct.
# NOTE(review): the label 'tree_construct' is paired with Ascent_return — confirm the label is intended.
funcs= [Ascent_return]
func_names= ['tree_construct']

plot_rec_InfSites(point_up,root_lib,funcs,func_names,range_theta,height= 500)

This is the format of your plot grid:
[ (1,1) x1,y1 ]



### Age estimation

Climb up the tree, backwards in time. Add time estimate for events encountered. Estimate average time in generations as `1 / probability of event`, assuming probability is scaled by population. 

Get time to first Ancestral Combination where targeted haplotype is registered. 

In [7]:
from structure_tools.Coal_tools import tree_descent_gen
from structure_tools.Coal_probab import prob_coal, prob_mut


# Mutation rate and the theta value used for the time estimates.
mut_rate= 9.5e-9
Theta= 2.04
# Population size implied by Ne = Theta / (4 * mu).
Nt= Theta / (4 * mut_rate)

# Start at the highest layer key and step down until a layer holding key 0 is found.
sink= max(root_lib.keys())
while 0 not in root_lib[sink].keys():
    sink -= 1

node_weigths, paths_reverse, node_bins, paths_vector = tree_descent_gen(
    root_lib, point_up, sink,
    Theta= Theta, mu= mut_rate
)

# The returned paths_vector is replaced by the entry stored at paths_reverse[0][0].
paths_vector= paths_reverse[0][0]
average_gen= np.mean(paths_vector)
var_gen= np.std(paths_vector)  # holds a standard deviation, despite the name

print('estimated time: {} generations'.format(round(average_gen,3)))

estimated time: 212126.316 generations


In [8]:
from structure_tools.Coalesce_plots import plot_InfSites_gens

# Highest layer of root_lib that contains key 0.
sink= max(root_lib.keys())
while 0 not in root_lib[sink].keys():
    sink -= 1

# Entry -2 of the sink layer is handed to the plotting routine as its first argument.
Anc_poss= root_lib[sink][-2]

hap_frame, fig_gens= plot_InfSites_gens(
    Anc_poss, point_up, root_lib, range_theta,
    Theta= Theta, mut_rate= mut_rate,
    height= 500, width= 900
)

iplot(fig_gens)


In [9]:
# Display the table returned by plot_InfSites_gens (rendered columns: hap_id, hap, t).
hap_frame

Unnamed: 0,hap_id,hap,t
0,0,1100,105863.158
1,1,10,158494.737
2,2,0,212126.316
3,3,1101,52898.246
4,4,100,158494.737
5,5,1000,158494.737


## Theta in time

Faisal Algorithm II. 

Search over combinations of theta over time. The tree ascent times algorithm uses an array, `theta_time_array`, to change the value of theta across layers.

We consider that Theta can vary across different periods of time in the past. For simplification, we consider non-overlapping blocks of time, e.g. 100k years. Say we have Nt time periods and that theta can take any value within some range at any time period. Then the number of possible combinations is equal to the length of that range raised to the power Nt. 

Another way to look at this is as a parameter optimization problem. Our parameters are the time periods, the values of those parameters are the theta value for that time period. We have a model in that we can calculate the probability of any combination of parameters by slightly modifying our `tree_ascent` algorithm, to estimate and vary theta according to the estimated time of each node (AC).

I have attempted two solutions to this problem. Both rely on an initial random search through combinations of parameter values.
This search is conducted below. The distribution of the resulting probabilities is plotted. Plot above standard threshold. 

In [10]:
from structure_tools.Coal_index import theta_time, theta_function, tree_ascent_times

import random

# Search settings.
mut_rate= 9.5e-9
max_time= 4e5
Ngaps= 7
permut_max= 2000

range_theta= np.linspace(.1,3,Ngaps)

### Highest layer of root_lib that contains key 0.
sink= max(root_lib.keys())
while 0 not in root_lib[sink].keys():
    sink -= 1

### Every ordering of the Ngaps theta values.
permuts= list(it.permutations(list(range_theta)))

print(len(permuts))

if len(permuts) > permut_max:
    # Thin the list to permut_max orderings taken at evenly spaced positions.
    chose_some= np.linspace(0,len(permuts)-1,permut_max).astype(int)
    permuts= [permuts[x] for x in chose_some]

print(len(permuts))
####

Theta_record= recursively_default_dict()

for combi in permuts:

    theta_array= theta_time(list(combi),max_time,Ngaps)

    # NOTE(review): mu is hard-coded to 9e-8 here while mut_rate above is 9.5e-9
    # and is not used in this cell — confirm which value is intended.
    node_weigths, paths_backward, node_times = tree_ascent_times(
        root_lib, point_up, sink,
        mu= 9e-8, theta_time_array= theta_array
    )

    # Keyed by the theta ordering itself (a tuple).
    Theta_record[combi]= {
        'probs': node_weigths[sink][0],
        'times': node_times,
        'comb': theta_array
    }



5040
2000


In [11]:

from sklearn.neighbors import KernelDensity

# Probability recorded for every sampled theta ordering, as a column vector.
probs_keys= list(Theta_record.keys())
probs_vector= np.array([Theta_record[key]['probs'] for key in probs_keys]).reshape(-1,1)

# Standardise the probabilities to z-scores.
Z= (probs_vector - probs_vector.mean()) / probs_vector.std()

bandwidth = estimate_bandwidth(Z, quantile=0.2, n_samples=500)

# Gaussian KDE fitted on the z-scores and evaluated on a fixed grid for plotting.
kde_plot = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
kde_plot.fit(Z)

X_plot = np.linspace(-2, 8, 100)[:, np.newaxis]
log_dens = kde_plot.score_samples(X_plot)

density_trace= go.Scatter(
    x=X_plot[:, 0],
    y=np.exp(log_dens),
    mode='lines',
    fill='tozeroy',
    line=dict(color='#AAAAFF', width=2)
)
fig_dens_I= [density_trace]

# The title reports the largest raw probability and the largest z-score.
layout= go.Layout(
    title= 'max {}, z: {}'.format(max(probs_vector)[0], max(Z)[0])
)

Figure= go.Figure(data= fig_dens_I, layout= layout)
iplot(Figure)


### PCA optimization

This approach is based on the observation that similar vectors of theta are correlated linearly. This implies that vectors close to an optimum combination of values should cluster together in PCA space. See Figure below. 

We estimate the kernel density, in feature space, of the vectors with high probability, and sample from it. Parameter vectors are recovered using the function `PCA.inverse_transform`. This should introduce some variation that might bring us closer to an optimum parameter set. The plot below includes inverse-transformed observations in orange. 

The algorithm proposed performs runs of dimensionality reduction, selection of higher probability and sampling from the inferred space to create new parameter vector data sets.

KDE sampling is not optimal and neither is the inverse transform. Sampled vectors are retained if they are higher than the previous run's average minus standard deviation. Values higher than one are also removed, as they are the result of negative theta values from the inverse transformation.


In [48]:
from sklearn.preprocessing import scale
from structure_tools.Coalesce_plots import theta_PCAms_plot 

# Theta orderings as rows of an array, in the same order as probs_keys (and so as Z).
data_combs= np.array(probs_keys)


N_samp= 50

# Pass N_samp itself rather than repeating the literal, so that changing the
# setting above actually changes the call.
Figure, new_data, feats_combs= theta_PCAms_plot(data_combs,Z,N_samp= N_samp,n_comp= 4)

iplot(Figure)

(15, 4)


In [80]:
from plotly import tools
from structure_tools.Coalesce_plots import PCA_sumplot

# Layout settings for the summary grid.
Ncols= 2
PC_select= 2
height= 600
width= 1000

# NOTE(review): `zprime` and `pca` are not assigned in any cell visible in this
# notebook; unless they are defined elsewhere this call fails on a fresh
# kernel (Restart & Run All) — confirm where they come from.
PCA_sumplot(Z,zprime,Theta_record,pca,Ncols= Ncols,PC_select= PC_select,height= height, width= width)

This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



### PCA optimization

In [27]:
from structure_tools.coal_thetTime_opt import pca_optimize
from structure_tools.Coalesce_plots import PCA_sumplot

###
# KernelDensity is imported from the public sklearn.neighbors namespace; the
# private module path sklearn.neighbors.kde was deprecated and later removed.
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
##

# Settings for the optimisation run.
N_samp= 350
Ncomps= 4
Nlayers= 50

# NOTE(review): `pca` is not assigned in any cell visible in this notebook —
# confirm where it is defined before relying on Restart & Run All.
prob_mean, prob_median, prob_sd, pca_theta, pca_record= pca_optimize(feats_combs,data_combs,Z,pca,
                                                                     root_lib,point_up,sink,
                                                                     N_samps= N_samp,
                                                                     Nlayers=Nlayers,max_time= max_time, 
                                                                     Ngaps= Ngaps,Ncomps= Ncomps)


(27, 5)


#### Run summary



In [37]:
# Per-run summary statistics from pca_optimize, arranged with one column per statistic.
stats_names= ['mean','median','sd']
run_stats= np.array([prob_mean, prob_median, prob_sd]).T

# One line per statistic: x is the run index, y the value of the statistic.
fig_stats= [
    go.Scatter(
        x= list(range(run_stats.shape[0])),
        y= run_stats[:,i],
        mode= 'lines',
        name= stats_names[i]
    ) for i in range(run_stats.shape[1])
]

# Axis titles follow the traces above (they were previously swapped).
layout= go.Layout(
    title= 'PCA optimization run stats. p-values by run',
    xaxis= dict(title= 'run'),
    yaxis= dict(title='mean, median, sd')
)

Figure= go.Figure(data= fig_stats, layout= layout)

iplot(Figure)

#### Theta in time

Plot combination output

In [81]:
from structure_tools.Coalesce_plots import plot_thetatime

# Plot the output stored in pca_record (returned by pca_optimize), using the same max_time as the search.
plot_thetatime(pca_record,max_time= max_time)

Somewhat better. Relative coherence. Higher probabilities than found randomly, and through the brute Monte-Carlo-ish update attempted with the `gem_sampler`. see below.

### Update from random chance.

Another approach: improve on the initial selection by iteratively updating each combination and retaining the update only if a higher probability is achieved. 

In [40]:
### choose combinations based on their Z score
# kde_plot was fitted on the standardised values Z, so the density is
# evaluated on Z (not on the raw probabilities).
dens_combs= np.exp(kde_plot.score_samples(Z))

# Keep the indices of the combinations whose z-score exceeds the threshold.
threshold_p = 4.5
best_combis= [idx for idx, z in enumerate(Z[:, 0]) if z > threshold_p]

print(len(best_combis))

5


In [45]:
from structure_tools.coal_thetTime_opt import gem_sampler

# Result of gem_sampler for each selected starting combination, keyed by that combination.
Btheta_dict= {}

for g in best_combis:
    new_array= list(probs_keys[g])
    theta_array= theta_time(list(new_array),max_time,Ngaps)
    
    # max_time is passed through (instead of a repeated literal) so that the
    # sampler and theta_time above always use the same time span.
    Theta_lib, comb_likes, comb_theta = gem_sampler(root_lib,point_up,range_theta,theta_array= theta_array,max_time= max_time,
                                               Ngaps= Ngaps,sink= sink,permN= 100,Ave_vec= new_array,sig= 1,step= .2)
    
    
    Btheta_dict[tuple(new_array)]= Theta_lib
    print(Theta_lib['probs'])



0.04974
0.03722
0.04987
0.04987
0.02771


In [46]:

# One theta-through-time line per entry of Btheta_dict.
fig_best_times= []

for combi, theta_lib in Btheta_dict.items():

    theta_steps= theta_lib['comb']

    # Evaluate theta on 100 points from 1 to 100 past the largest value in column 0 of 'comb'.
    x_plot= np.linspace(1, max(theta_steps[:,0]) + 100, 100)
    y_plot= [theta_function(x, theta_time_array= theta_steps) for x in x_plot]

    fig_best_times.append(
        go.Scatter(
            x= x_plot,
            y= y_plot,
            mode= 'lines',
            name= 'prob: {}'.format(round(theta_lib['probs'], 5))
        )
    )

layout= go.Layout(
    title= 'best_times, Z score: {}'.format(threshold_p),
    xaxis= dict(title= 'generations'),
    yaxis= dict(title= 'theta')
)

Figure= go.Figure(data= fig_best_times, layout= layout)
iplot(Figure)