In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess
import time

import collections
def recursively_default_dict():
    '''
    Build an auto-vivifying nested dictionary: reading a missing key at any depth
    creates (and returns) another dictionary of the same kind.
    '''
    nested= collections.defaultdict(recursively_default_dict)
    return nested


import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

In [2]:
from tools.mcounter_tools import (
    read_vcf_allel, ind_assignment_scatter_v1, MC_sample_matrix_v1,
    heatmap_v2
)

In [3]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
import re
import pandas as pd


## directories
# every path is built relative to the directory the notebook is launched from.
main_dir= os.getcwd() + '/'
count_dir= main_dir + 'mutation_counter/count/'
dir_launch= main_dir + 'mutation_counter'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'
sims_dir= main_dir + 'mutation_counter/data/sims_dem_1MB/'
diffs= False

## sampling settings forwarded to MC_sample_matrix_v1.
mutlog= 'toMut.log'
min_size= 5
sampling= [50,50,5]
stepup= 'increment'
sample_sim= 50

# data / data_freqs are the dictionaries consumed by every analysis cell below.
# presumably data follows {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}, as described
# in the docstrings of the comparison functions - confirm against tools.mcounter_tools.
data, data_freqs = MC_sample_matrix_v1(min_size= min_size, samp= sampling, stepup= stepup, count_dir= count_dir, 
                        dir_launch= dir_launch,main_dir= main_dir,sim_dir= sims_dir,
                          muted_dir= muted_dir, diffs= diffs,
                       exclude= False,sample_sim= sample_sim)


missing: 0, no vcf: 79
available 241
sample 50
crashC10.98920299
(1092, 3225)
crashC20.10510075
(1092, 3348)
equilC3.143007902
(1092, 5471)
growthC14.77723805
(1092, 9499)
growthC13.109802410
(1092, 9635)
crashC11.120388005
(1092, 3235)
growthC1.106111588
(1092, 9517)
crashC15.31666194
(1092, 3336)
crashC3.196634556
(1092, 3223)
equilC13.51270977
(1092, 5731)
equilC17.50775782
(1092, 5719)
crashC9.14610793
(1092, 3415)
growthC21.40503315
(1092, 9571)
equilC8.5053139
(1092, 5731)
crashC8.39998948
(1092, 3140)
crashC7.134896362
(1092, 3298)
crashC3.170336721
(1092, 3364)
growthC11.75384030
(1092, 9719)
growthC5.117092206
(1092, 9291)
equilC7.19793757
(1092, 5735)
equilC10.115654380
(1092, 5708)
equilC10.94642386
(1092, 5633)
crashC13.64531284
(1092, 3322)
equilC11.65733404
(1092, 5735)
crashC8.75710112
(1092, 3253)
growthC9.73868670
(1092, 9470)
growthC11.11542679
(1092, 9637)
growthC3.79105307
(1092, 9493)
growthC22.42085272
(1092, 9362)
equilC5.164817904
(1092, 5630)
growthC16.9740954


## Data analysis - Pairwise population comparison

In [7]:

def run_stats(ref_sim,ref_pair,data,data_freqs= {}):
    '''
    Compare the mutation count grids of a pair of populations.
    Helper shared by the comparison functions of this notebook: deploys heatmap_v2 on the pair
    and computes the difference between the two normalised (proportion) count grids.
    - ref_sim: simulation ID. Batch name and chromosome are parsed out of it.
    - ref_pair: list of two (simulation, population) tuples. A list and not a dictionary
      because populations / reference tags can be repeated.
    - data: {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}
    - data_freqs: optional {sim: {pop: freqs}}. When not empty the entries of the pair are returned.
    frequency_range, exclude, p_value, muted_dir and test_m are read from the notebook global scope.
    Returns a dictionary with keys 'grids', 'sigs', 'sizes', 'batch', 'diffs' (+ 'freqs').
    '''
    # text before the first 'C' is the batch name.
    batch_name= ref_sim.split('C')[0]

    pair_sizes= []
    for member in ref_pair:
        pair_sizes.append(data[member[0]]['sizes'][member[1]])

    # chromosome: what sits between the 'C' and the first '.' of the simulation ID.
    sim_prefix= ref_sim.split('.')[0]
    chrom_list= [sim_prefix.split('C')[1]]

    raw_counts= {}
    for member in ref_pair:
        raw_counts[member]= data[member[0]]['counts'][member[1]]

    variant_totals= {}
    for member in ref_pair:
        variant_totals[member]= data[member[0]]['Nvars'][member[1]]

    heat_out= heatmap_v2(chrom_list,raw_counts,variant_totals,
                         {},frequency_range, exclude, p_value, muted_dir,tag= '',
                         test= test_m,output= 'pval')
    ratio_grid, sig_cells= heat_out

    # counts to proportions, then first member minus second member.
    prop_counts= {}
    for member, grid in raw_counts.items():
        prop_counts[member]= grid / np.sum(grid)

    first, second= ref_pair[0], ref_pair[1]
    grid_diffs= prop_counts[first] - prop_counts[second]

    comb_stats= dict()
    comb_stats['grids']= ratio_grid
    comb_stats['sigs']= sig_cells
    comb_stats['sizes']= pair_sizes
    comb_stats['batch']= batch_name
    comb_stats['diffs']= grid_diffs

    if data_freqs:
        pair_freqs= {}
        for member in ref_pair:
            pair_freqs[member]= data_freqs[member[0]][member[1]]
        comb_stats['freqs']= pair_freqs

    return comb_stats



def md_reference_comp(data,p_value= 1e-5, test_m= 'fisher', individually= False, Nbins= 10,
                            exclude= False, frequency_range= [0,1], data_freqs= {}, extract= 'pval',
                            muted_dir= '', tag_ref= '_ss'):
    '''
    Parse data dictionary.
        data: {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}
    i: use sim and pop IDs to create dictionary connecting original populations to 
    subset populations created using ind_assignment_scatter_v1.
    ii: for each pair of reference populations, launch run_stats (heatmap_v2) on the full populations
    and store the result under ref_combos[sim]['stats'][pair].
    iii: bin the relative sizes (subset size / full size) of the sub-samples of each population into
    Nbins - 1 intervals of (0,1]; for every pair of sub-samples (one per population) falling in the
    same bin, launch run_stats and append the result to ref_combos[sim]['combs'][pair][bin].

    - tag_ref: tag identifying sub-sampled simulations / populations.
    - data_freqs: forwarded to run_stats.
    - p_value, test_m, individually, exclude, frequency_range, extract and muted_dir are accepted
      but not forwarded: run_stats reads its settings from the notebook global scope.

    Returns:
        pop_asso: {ref_sim: {pop: {relative_size: {sub_sim: sub_pop}}}}
        ref_combos: {ref_sim: {'combs': {pair: {bin: [stats]}}, 'sizes': {pop: size}, 'stats': {pair: stats}}}
    '''

    def _ref_pop_name(sub_pop):
        # Recover the reference population name from a sub-sample name.
        # str.strip(tag_ref) treats the tag as a set of characters and would also eat
        # trailing / leading 's' or '_' belonging to the population name; remove the exact suffix.
        # assumes the tag closes the part of the name before the first '.' - TODO confirm
        # against ind_assignment_scatter_v1.
        base= sub_pop.split('.')[0]
        if tag_ref and base.endswith(tag_ref):
            base= base[:-len(tag_ref)]
        return base

    bins= np.linspace(0,1,Nbins)
    bins= np.round(bins,4)
    bins= [(bins[x-1],bins[x]) for x in range(1,len(bins))]

    # split simulations into references (0) and tagged sub-samples (1).
    avail= list(data.keys())
    ref_idx= [int(tag_ref in avail[x]) for x in range(len(avail) )]
    categ= {
        z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
    }

    print([len(categ[x]) for x in [0,1]])

    ### possible combinations per simulation.
    ref_combos= {}

    for idx in categ[0]:
        ref= avail[idx]
        ref_combs= list(it.combinations(list(data[ref]['counts'].keys()),2))

        comb_dict= {
            x: {} for x in ref_combs
        }

        comb_stats= {}

        for pair in ref_combs:
            pop1, pop2= pair

            ref_pair= [(ref,pop1),(ref,pop2)]

            comb_stats[pair]= run_stats(ref,ref_pair,data,data_freqs= data_freqs)

        ref_combos[ref]= {
            'combs': comb_dict,
            'sizes': data[ref]['sizes'],
            'stats': comb_stats
        }

    #### population size diffs per population per simulation
    pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

    for av in categ[1]:
        dat= [x for x in data[avail[av]]['counts'].keys() if tag_ref in x]
        dat_size= [data[avail[av]]['sizes'][x] for x in dat]
        ref_sim= avail[av].split(tag_ref)[0]
        ref_pop= [_ref_pop_name(x) for x in dat]
        # relative size of each sub-sample with respect to its reference population.
        dat_size= [dat_size[x] / data[ref_sim]['sizes'][ref_pop[x]] for x in range(len(dat))]
        dat_size= [round(x,3) for x in dat_size]
        for p in range(len(dat)):
            pop_asso[ref_sim][ref_pop[p]][dat_size[p]][avail[av]]= dat[p]

    ### combine simulation combination and population size ranges.

    for ref_sim in pop_asso.keys():
        print(ref_sim)

        for combo in ref_combos[ref_sim]['combs'].keys():

            # sort available sizes for each population pair.
            # arrange them by bin.
            # deploy count comparisons for pairwise combinations of the two pops within each bin.

            pop1, pop2= combo

            available_sizes= {
                z: sorted(list(pop_asso[ref_sim][z].keys())) for z in combo
            }

            # bins are half-open: (lower, upper].
            bins_dict= {
                b: {
                    z: [x for x in available_sizes[z] if x > b[0] and x <= b[1]] for z in combo
                } for b in bins
            }

            bins_combs= {
                b: [(x,y) for x in bins_dict[b][pop1] for y in bins_dict[b][pop2]] for b in bins
            }

            for bend in bins_combs.keys():
                ref_combos[ref_sim]['combs'][combo][bend]= []

                for size_combo in bins_combs[bend]:
                    i,j= size_combo

                    for sub1 in pop_asso[ref_sim][pop1][i].keys():
                        for sub2 in pop_asso[ref_sim][pop2][j].keys():

                            ref_pair= [(sub1, pop_asso[ref_sim][pop1][i][sub1]),(sub2, pop_asso[ref_sim][pop2][j][sub2])]

                            comb_stats= run_stats(ref_sim,ref_pair,data,data_freqs= data_freqs)

                            ref_combos[ref_sim]['combs'][combo][bend].append(comb_stats)

    return pop_asso, ref_combos




In [16]:
# comparison settings. These are module-level names: run_stats reads p_value, test_m, exclude,
# frequency_range and muted_dir from the global scope, so they must be set before any comparison runs.
p_value= 1e-5
test_m= 'chi2'
individually= False
exclude= False
frequency_range= [0,1]
extract= 'pval'
# Nbins edges -> Nbins - 1 relative sample size bins.
Nbins= 100

# pairwise population comparisons, for full populations and for size-matched sub-samples.
pop_asso, ref_combos= md_reference_comp(data,p_value= p_value, test_m= test_m, individually= individually, Nbins= Nbins,
                                        exclude= exclude, frequency_range= frequency_range, extract= extract,
                                     muted_dir= muted_dir, data_freqs= data_freqs)


[10, 10000]
splitEqualC4.105437567
splitEqualC16.71863131
splitEqualC11.23389887
splitEqualC7.15825957
splitEqualC6.9061292
splitEqualC5.57749228
splitEqualC10.63582818
splitEqualC20.24427133
splitEqualC13.79379092
splitEqualC7.26189423


In [17]:

# rebuild the bins exactly as md_reference_comp does so that the tuples match its dictionary keys.
bins= np.linspace(0,1,Nbins)
bins= np.round(bins,4)
bins= [(bins[x-1],bins[x]) for x in range(1,len(bins))]

# key of the run_stats output to collect.
requested= 'diffs'

sim_stats= list(ref_combos.keys())

# {sim: {pair: {bin: [requested stat of every sub-sample comparison in that bin]}}}
combo_grids= {
    ref: {comb: {
            z: [] for z in bins
        } for comb in ref_combos[ref]['combs'].keys()
    } for ref in ref_combos.keys()
}

# {pair: [requested stat of the full population comparison, one per simulation]}
# keyed on the pairs of the first simulation: assumes every simulation holds the same population pairs.
combo_ref= {x: [] for x in ref_combos[sim_stats[0]]['combs'].keys()}

for sim in sim_stats:
    for comb in ref_combos[sim]['combs'].keys():
        combo_ref[comb].append(ref_combos[sim]['stats'][comb][requested])
        
        for bi in ref_combos[sim]['combs'][comb].keys():
            for idx in ref_combos[sim]['combs'][comb][bi]:
                combo_grids[sim][comb][bi].append(idx[requested])


In [18]:
### comparison to self comparisons along gradient.
# for each sub-sample comparison: euclidean distance (sqrt of the summed squared differences) between
# its grid and the grid of the full population comparison of the same simulation and population pair.
grad_comb= {
    ref: {
        comb: {
            bi: [np.sqrt(np.sum((x - ref_combos[ref]['stats'][comb][requested])**2)) for x in combo_grids[ref][comb][bi]] for bi in combo_grids[ref][comb].keys()
        } for comb in combo_grids[ref].keys()
    } for ref in combo_grids.keys()
}

# pool distances across simulations: {pair: {bin: [distances]}}.
# assumes every simulation holds the pairs of the first simulation.
grad_comb= {
    comb: {
        bi: list(it.chain(*[grad_comb[r][comb][bi] for r in grad_comb.keys()])) for bi in bins
    } for comb in ref_combos[sim_stats[0]]['combs'].keys()
}


In [19]:
# mean distance per bin, keyed by the bin mid-point. Empty bins give nan.
ydict= {
    comb: {
        round(sum(bi)/ 2,3): np.mean(g) for bi,g in grad_comb[comb].items()
    } for comb in grad_comb.keys()
}

# standard deviation of the distances per bin, used as error bars.
yerror= {
    comb: {
        round(sum(bi)/ 2,3): np.std(g) for bi,g in grad_comb[comb].items()
    } for comb in grad_comb.keys()
}


xdict= {
    comb: sorted(ydict[comb].keys()) for comb in ydict.keys()
}

# one trace per population pair.
fig= [
    go.Scatter(
        x= xdict[i],
        y= [ydict[i][x] for x in xdict[i]],
        error_y= dict(
            array= [yerror[i][x] for x in xdict[i]],
            type= 'data',
            #symmetric= True,
            visible=True
        ),
        name= '-'.join(i)
    ) for i in xdict.keys()
]

layout= go.Layout()

Figure= go.Figure(data=fig, layout= layout)
iplot(Figure)

In [20]:

### comparison to all other comparisons along gradient.

# for each sub-sample comparison: list of distances between its grid and the full population grid
# of every other (simulation, pair) combination - its own (ref, comb) is skipped.
# the pairs `cb` are taken from the pairs of `ref`, not of `l`.
anti_comb= {
    ref: {
        comb: {
            bi: [[np.sqrt(np.sum((x - ref_combos[l]['stats'][cb][requested])**2)) for l in combo_grids.keys() for cb in combo_grids[ref].keys() if [l,cb] != [ref,comb]] for x in combo_grids[ref][comb][bi]] for bi in combo_grids[ref][comb].keys()
        } for comb in combo_grids[ref].keys()
    } for ref in combo_grids.keys()
}

# pool across simulations: {pair: {bin: [list of distances per sub-sample comparison]}}
anti_comb= {
    comb: {
        bi: list(it.chain(*[anti_comb[r][comb][bi] for r in anti_comb.keys()])) for bi in bins
    } for comb in ref_combos[sim_stats[0]]['combs'].keys()
}

# flatten the per-comparison lists: {pair: {bin: [distances]}}
anti_comb= {
    comb: {
        bi: list(it.chain(*anti_comb[comb][bi])) for bi in bins
    } for comb in anti_comb.keys()
}

# 'ref': distances to own reference comparison, 'anti': distances to every other reference comparison.
combined= {
    'ref': grad_comb,
    'anti': anti_comb
}

In [21]:
# mean distance per bin mid-point, for the 'ref' and 'anti' sets.
ydict= {
    ty: {
        comb: {
            round(sum(bi)/ 2,3): np.mean(g) for bi,g in combined[ty][comb].items()
        } for comb in combined[ty].keys()
    } for ty in combined.keys()
}

# error bars: three standard deviations.
yerror= {
    ty: {
        comb: {
            round(sum(bi)/ 2,3): np.std(g)*3 for bi,g in combined[ty][comb].items()
        } for comb in combined[ty].keys()
    } for ty in combined.keys()
}



# x values taken from the 'anti' set; both sets are built on the same bins.
xdict= {
    comb: sorted(ydict['anti'][comb].keys()) for comb in ydict['ref'].keys()
}

# one trace per population pair and set. Traces are named after the set only.
fig= [
    go.Scatter(
        x= xdict[i],
        y= [ydict[ty][i][x] for x in xdict[i]],
        error_y= dict(
            array= [yerror[ty][i][x] for x in xdict[i]],
            type= 'data',
            #symmetric= True,
            visible=True
        ),
        name= ty
    ) for i in xdict.keys() for ty in yerror.keys()
]

layout= go.Layout(
    title= 'combination comparison',
    yaxis= dict(
        title= 'sqrt SSD'
    ),
    xaxis= dict(
        title='relative sample size'
    )
)

Figure= go.Figure(data=fig, layout= layout)
iplot(Figure)

In [22]:
### ref norms
import plotly.figure_factory as ff

# flatten every full population grid of a pair ...
comb_norms= {
    z: np.array([x.reshape(1,np.prod(x.shape)) for x in combo_ref[z]]) for z in combo_ref.keys()
}

# ... and pool them across simulations into a single (1, N) array per pair.
comb_norms= {
    z: g.reshape(1,np.prod(g.shape)) for z,g in comb_norms.items()
}


# one distribution of cell values per population pair.
hist_data= [comb_norms[x][0] for x in comb_norms.keys()]

group_labels = ['-'.join(x) for x in comb_norms.keys()] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005)


fig.update_layout(title_text='mean {} distribution for mutation count matrices'.format(requested))
fig.update_layout(xaxis_title= requested)
fig.update_layout(yaxis_title= 'density')


In [23]:
### gradient values and standardization
# comb_norms changes meaning here: from pooled reference values to their [mean, std] per pair.
# Re-running this cell without re-running the previous one computes statistics of the statistics.
comb_norms= {
    z: [np.mean(g),np.std(g)] for z,g in comb_norms.items()
}


# flatten every sub-sample comparison grid ...
grad_comb= {
    ref: {
        comb: {
            bi: np.array([x.reshape(1,np.prod(x.shape)) for x in combo_grids[ref][comb][bi]]) for bi in combo_grids[ref][comb].keys()
        } for comb in combo_grids[ref].keys()
    } for ref in combo_grids.keys()
}

# ... pool them into a single (1, N) array per simulation, pair and bin ...
grad_comb= {
    ref: {
        comb: {
            bi: g.reshape(1,np.prod(g.shape)) for bi,g in grad_comb[ref][comb].items()
        } for comb in grad_comb[ref].keys()
    } for ref in grad_comb.keys()
}


# ... and standardize using the mean and std of the pair's reference values.
grad_comb= {
    ref: {
        comb: {
                bi: (g - comb_norms[comb][0]) / comb_norms[comb][1] for bi,g in grad_comb[ref][comb].items()
           } for comb in grad_comb[ref].keys()
    } for ref in grad_comb.keys()
}

# chaining (1, N) arrays yields their single row: each bin ends up holding a list
# with one 1-D array per simulation.
grad_comb= {
    comb:{
        bi: list(it.chain(*[grad_comb[ref][comb][bi] for ref in grad_comb.keys()])) for bi in bins
    } for comb in ref_combos[sim_stats[0]]['combs'].keys()
}

In [24]:
### mean and std of the standardized grid values along the relative sample size gradient.
sig= 1.96

# each grad_comb[comb][bin] is a list holding one 1-D array per reference simulation.
# Indexing g[0] only summarised the first simulation; concatenate to pool all of them.
ydict= {
    comb: {
        round(sum(bi)/ 2,3): np.mean(np.concatenate(g)) for bi,g in grad_comb[comb].items()
    } for comb in grad_comb.keys()
}

yerror= {
    comb: {
        round(sum(bi)/ 2,3): np.std(np.concatenate(g)) for bi,g in grad_comb[comb].items()
    } for comb in grad_comb.keys()
}


xdict= {
    comb: sorted(ydict[comb].keys()) for comb in ydict.keys()
}

# one trace per population pair.
fig= [
    go.Scatter(
        x= xdict[i],
        y= [ydict[i][x] for x in xdict[i]],
        error_y= dict(
            array= [yerror[i][x] for x in xdict[i]],
            type= 'data',
            #symmetric= True,
            visible=True
        ),
        name= '-'.join(i)
    ) for i in xdict.keys()
]

layout= go.Layout()

Figure= go.Figure(data=fig, layout= layout)
iplot(Figure)

## Data analysis - within populations 

So, we calculated mutation type counts for each population and compared them across simulations. We are interested to see if sampling could have an impact on the variance of count differences. 

The function `heatmap_v2` returns a matrix of count proportions across types for each pairwise comparison. We will calculate the variance of each matrix, and plot it against the relative sampling across populations.

In [9]:


def mcounter_deploy_v2(data,p_value= 1e-5, test_m= 'fisher', individually= False,
                            exclude= False, frequency_range= [0,1], data_freqs= {}, extract= 'pval',
                            muted_dir= '', tag_ref= '_ss'):
    '''
    Parse data dictionary.
        data: {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}
    i: use sim and pop IDs to create dictionary connecting original populations to 
    subset populations created using ind_assignment_scatter_v1.
    ii: for each pair of reference/subset populations, launch run_stats (heatmap_v2) and store
    the result, together with the reference population name under key 'pop'.
    - v2: compares sub pops to ref full pops other than its own. The 'diffs' grid of each of these
      comparisons (same batch only) is appended to the 'other' list of the entry.

    - tag_ref: tag identifying sub-sampled simulations / populations.
    - data_freqs: forwarded to run_stats.
    - p_value, test_m, individually, exclude, frequency_range, extract and muted_dir are accepted
      but not forwarded: run_stats reads its settings from the notebook global scope.

    Returns:
        pop_asso: {ref_sim: {pop: {sub_sim: sub_pop}}}
        count_data: {index: run_stats output + 'pop' + 'other'}
    '''

    def _ref_pop_name(sub_pop):
        # Recover the reference population name from a sub-sample name.
        # str.strip(tag_ref) treats the tag as a set of characters and would also eat
        # trailing / leading 's' or '_' belonging to the population name; remove the exact suffix.
        # assumes the tag closes the part of the name before the first '.' - TODO confirm
        # against ind_assignment_scatter_v1.
        base= sub_pop.split('.')[0]
        if tag_ref and base.endswith(tag_ref):
            base= base[:-len(tag_ref)]
        return base

    # split simulations into references (0) and tagged sub-samples (1).
    avail= list(data.keys())
    ref_idx= [int(tag_ref in avail[x]) for x in range(len(avail) )]
    categ= {
        z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
    }

    pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

    for av in categ[1]:
        dat= [x for x in data[avail[av]]['counts'].keys() if tag_ref in x]
        ref_sim= avail[av].split(tag_ref)[0]
        for sub_pop in dat:
            pop_asso[ref_sim][_ref_pop_name(sub_pop)][avail[av]]= sub_pop

    d= 0
    count_data= recursively_default_dict()

    for ref in pop_asso.keys():
        batch= ref.split('C')[0]

        for pop in pop_asso[ref].keys():
            for sub in pop_asso[ref][pop].keys():
                sub_pop= pop_asso[ref][pop][sub]

                # sub-sample against its own full population.
                ref_pair= [(ref, pop),(sub, sub_pop)]

                count_data[d]= run_stats(ref,ref_pair,data,data_freqs= data_freqs)

                # downstream cells group entries by reference population.
                count_data[d]['pop']= pop
                count_data[d]['other']= []

                # sub-sample against every other full population of the same batch.
                for ref2 in pop_asso.keys():
                    for pop2 in pop_asso[ref2].keys():
                        if [ref,pop] == [ref2,pop2]:
                            continue
                        if ref2.split('C')[0] != batch:
                            continue

                        ref_pair= [(ref2, pop2),(sub, sub_pop)]

                        pair_stats= run_stats(ref,ref_pair,data,data_freqs= data_freqs)

                        count_data[d]['other'].append(pair_stats['diffs'])

                d += 1

    return pop_asso, count_data





In [10]:
### new - pair reference sims and subsetted populations.
### extract kmer comparisons (proportions or pvals) using heatmap_v2.
### make function. 

from tools.mcounter_tools import mcounter_deploy

# these module-level names are also the settings run_stats reads from the global scope:
# this cell switches the test to 'fisher' for every later call to run_stats.
p_value= 1e-5
test_m= 'fisher'
individually= False
exclude= False
frequency_range= [0,1]
extract= 'pval'

# sub-sample vs full population comparisons; uses the notebook's mcounter_deploy_v2,
# not the mcounter_deploy imported above.
pop_asso, count_data= mcounter_deploy_v2(data,p_value= p_value, test_m= test_m, individually= individually,
                                        exclude= exclude, frequency_range= frequency_range, extract= extract,
                                     muted_dir= muted_dir, data_freqs= data_freqs)


############## mutation grids
############## mutation grids

In [17]:

from functools import reduce  # forward compatibility for Python 3
import operator

from tools.fasta_utilities import (
    get_mutations, kmer_comp_index, kmer_mut_index
)

bases= 'ATCG'
ksize= 3

# mutation type labels, arranged on the grid layout of the count matrices.
mutations= get_mutations(bases= bases,ksize= ksize)
kmers, kmer_idx= kmer_comp_index(mutations)

mut_lib= kmer_mut_index(mutations)
labels= [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
# NOTE(review): the (24,4) shape is hard-coded; presumably tied to ksize= 3 - confirm before changing ksize.
grid_labels= np.array(labels).reshape(24,4)
list_labels= grid_labels.reshape(1,np.prod(grid_labels.shape))[0]
##############
############## process grids

# NOTE: sims_dir is re-pointed here; it no longer matches the directory used to load the data.
sims_dir= main_dir + 'mutation_counter/data/sims/'

# subsamp equals the number of comparisons, so this is a random permutation of all of them.
available= list(count_data.keys())
subsamp= len(count_data)
avail_sub= np.random.choice(available,subsamp,replace= False)

### 1. extract grids
grids= [count_data[s]['grids'] for s in avail_sub]
grid_shape= grids[0].shape
grid_total= np.prod(grid_shape)

grid_diffs= [count_data[s]['diffs'] for s in avail_sub]
## extract statistics per mutation.
# mut_grid: heatmap_v2 grid value per comparison; mut_diffs: squared proportion difference per comparison.
mut_grid= {}
mut_diffs= {}

for row in range(grid_shape[0]):
    for col in range(grid_shape[1]):
        mut= grid_labels[row,col]
        mut_grid[mut]= []
        mut_diffs[mut]= []

        for idx in range(len(avail_sub)):
            mut_grid[mut].append(grids[idx][row,col])
            mut_diffs[mut].append(grid_diffs[idx][row,col]**2)

## mask infinite values and compute std.
#grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
#grid_mean= [np.mean(x) for x in grids] 
#grid_std= [np.std(x) for x in grids]
#prop_mean= [np.mean(x) for x in props]

### 2. calculate proportions across simulations
# 'sizes' follows the order of the compared pair: [full population, sub-sample].
pop_proportions= [count_data[s]['sizes'][1] / count_data[s]['sizes'][0] for s in avail_sub]
pop_proportions= [round(x,3) for x in pop_proportions]
### 3. batch names
# {batch: [positions in avail_sub]}
batch_names= [count_data[s]['batch'] for s in avail_sub]
batch_dict= {
    z:[x for x in range(len(avail_sub)) if batch_names[x] == z] for z in list(set(batch_names))
}




invalid value encountered in true_divide


divide by zero encountered in true_divide



In [18]:


# output directory for figures, relative to the working directory; created if missing.
fig_dir= 'Figures/kmers'
os.makedirs(fig_dir, exist_ok=True)
fig_dir= fig_dir + '/'



In [25]:
##############################################################
############################################################## SFS
# distance between the site frequency spectra of each full population / sub-sample pair,
# summarised by relative sample size for every batch and reference population.

# NOTE(review): relies on every count_data entry carrying a 'pop' key (reference population name);
# run_stats itself does not set it - confirm the deploy function adds it.
pop_vector= [count_data[s]['pop'] for s in avail_sub]
pop_set= list(set(pop_vector))

# {batch: {pop: [positions in avail_sub]}}
pop_batch_dict= {
    ba: {
        pop: [x for x in batch_dict[ba] if pop_vector[x] == pop] for pop in pop_set
    } for ba in batch_dict.keys()
}


fig_dir= 'Figures/kmers/'
mu= 1e-8
N_inds= 100
xlab= 'sampling proportion'
ylab= 'sum of sqared differences.'

batch_sfs= {ba: {} for ba in pop_batch_dict.keys()}

for i in batch_dict.keys():
    for pop_i in pop_batch_dict[i].keys():
    
        sims= [avail_sub[x] for x in pop_batch_dict[i][pop_i]]

        counts= []
        props= [pop_proportions[x] for x in pop_batch_dict[i][pop_i]] 

        for sim_idx in sims:
            freqs_dict= count_data[sim_idx]['freqs']
            sfs= []
            sizes_sim= count_data[sim_idx]['sizes']
            # number of SFS bins: size of the smallest sample of the pair.
            N_inds= min(sizes_sim) #- int(min(sizes_sim) % 2 == 0)
            #N_inds= int(np.mean(sizes_sim))

            # 'sizes' is a list and 'freqs' a dictionary keyed by (sim, pop) tuples, both built by
            # run_stats in the order of the compared pair: sizes are matched by position, the
            # dictionary keys cannot index the list.
            for pop_idx, pop in enumerate(freqs_dict.keys()):

                #print(gen_time)
                freqs= freqs_dict[pop]

                # presumably freqs holds (allele count, number of sites) pairs - confirm against
                # MC_sample_matrix_v1. Each first entry is repeated by its second entry and
                # divided by the sample size.
                sizeN= [x[1] for x in freqs if x[1] > 0]
                freqs= np.repeat([x[0] for x in freqs if x[1] > 0],sizeN) / sizes_sim[pop_idx]

                #freqs= [x for x in freqs if x > 0]
                bin_count= np.histogram(freqs,bins= N_inds,range= [0,1])[0]
                bin_count= bin_count / np.sum(bin_count)
                #bin_count= np.array(bin_count)
                sfs.append(bin_count)

            # euclidean distance between the two normalised spectra, scaled by the number of bins.
            dist_vec= sfs[0] - sfs[1] 

            dist_vec= dist_vec**2
            #dist_vec= np.mean(dist_vec)
            dist_vec= np.sqrt(np.sum(dist_vec)) / N_inds
            #dist_vec= np.sqrt(sum(dist_vec)) / N_inds
            
            counts.append(dist_vec)

        # group distances by relative sample size.
        props_dict= {
            z: [counts[x] for x in range(len(props)) if props[x] == z] for z in list(set(props))
        }

        props_sorted= sorted(props_dict.keys())
        props_means= [np.mean(props_dict[x]) for x in props_sorted]
        props_std= [np.std(props_dict[x]) for x in props_sorted]

        batch_sfs[i][pop_i]= [props_sorted,props_means,props_std]



In [36]:

# one trace per batch and population: mean SFS distance (+/- 1 std) against relative sample size.
# batch_sfs entries are [sorted proportions, means, stds].
fig= [go.Scatter(
    x= batch_sfs[i][pop][0],
    y= batch_sfs[i][pop][1],
    error_y= dict(
        array= batch_sfs[i][pop][2],
        type= 'data',
        #symmetric= True,
        visible=True
    ),
    name= '-'.join([pop,str(i)])
) for i in batch_sfs.keys() for pop in batch_sfs[i].keys()]

layout= go.Layout(
    title= 'SFS distances / sample size',
    xaxis= dict(
        title= 'relative sample size',
        range= [-.01, 1.03]
    ),
    yaxis= dict(
        title= 'sum of squared diffs'
    )
)

Figure= go.Figure(data= fig,layout=layout)

iplot(Figure)

In [45]:
####################################################
#################################################### grid SSD
# euclidean norm of the proportion difference grids, binned by relative sample size:
# 'ref' = sub-sample vs its own full population, 'anti' = sub-sample vs other full populations.

# NOTE: depends on the global Nbins set in the pairwise comparison section.
bins= np.linspace(0,1,Nbins)
bins= np.round(bins,4)
bins= [(bins[x-1],bins[x]) for x in range(1,len(bins))]

# NOTE(review): relies on every count_data entry carrying a 'pop' key (reference population name);
# run_stats itself does not set it - confirm the deploy function adds it.
other_diffs= [count_data[s]['other'] for s in avail_sub]
pop_vector= [count_data[s]['pop'] for s in avail_sub]
pop_set= list(set(pop_vector))

# {batch: {pop: [positions in avail_sub]}}
pop_batch_dict= {
    ba: {
        pop: [x for x in batch_dict[ba] if pop_vector[x] == pop] for pop in pop_set
    } for ba in batch_dict.keys()
}

xlab= 'relative sampling'
ylab= 'mean matrix p-val'

view_sets= ['ref','anti']
grid_whole= {
    pop: {    
        view:{} for view in view_sets
    } for pop in pop_set
}

for i in batch_dict.keys():
    for pop in pop_batch_dict[i].keys():
    
        xprep= [pop_proportions[x] for x in pop_batch_dict[i][pop]]
        
        # {bin mid-point: [local indices of the comparisons falling in (lower, upper]]}
        xprep= {
            sum(bi) / 2: [x for x in range(len(xprep)) if xprep[x] > bi[0] and xprep[x] <= bi[1]] for bi in bins
        }
        #xprep= {
        #     z: [x for x in range(len(xprep)) if xprep[x] == z] for z in list(set(xprep))
        #}

        ### grids
        batch_grids= [grid_diffs[x] for x in pop_batch_dict[i][pop]]
        y_prep= {
            z: [batch_grids[x] for x in xprep[z]] for z in xprep.keys()
        }

        # euclidean norm of each difference grid.
        y_prep= {
            z: [np.sqrt(np.sum(x**2)) for x in y_prep[z]] for z in y_prep.keys()
        }

        # mean and 3 * std per bin. Empty bins give nan.
        surface= sorted(xprep.keys())
        y= [np.mean(y_prep[x]) for x in surface]
        error= [np.std(y_prep[x])*3 for x in surface]

        grid_whole[pop]['ref'][i]= [surface,y,error]

        ### same summary for the comparisons against other full populations.
        batch_grids= [other_diffs[x] for x in pop_batch_dict[i][pop]]
        xprep= [pop_proportions[x] for x in pop_batch_dict[i][pop]]
        # every sub-sample holds several 'other' grids: repeat its proportion once per grid.
        xprep= np.repeat(xprep,[len(x) for x in batch_grids])
        batch_grids= list(it.chain(*batch_grids))

        xprep= {
            sum(bi) / 2: [x for x in range(len(xprep)) if xprep[x] > bi[0] and xprep[x] <= bi[1]] for bi in bins
        }
        
        y_prep= {
            z: [batch_grids[x] for x in xprep[z]] for z in xprep.keys()
        }

        y_prep= {
            z: [np.sqrt(np.sum(x**2)) for x in y_prep[z]] for z in y_prep.keys()
        }

        surface= sorted(xprep.keys())
        y= [np.mean(y_prep[x]) for x in surface]
        error= [np.std(y_prep[x])*3 for x in surface]

        grid_whole[pop]['anti'][i]= [surface,y,error]

############################################################# 
############################################################# STRATA


In [46]:
# one trace per population, set ('ref' / 'anti') and batch.
# grid_whole entries are [bin mid-points, means, 3 * stds].
fig= [go.Scatter(
    x= grid_whole[pop][ep][i][0],
    y= grid_whole[pop][ep][i][1],
    error_y= dict(
        array= grid_whole[pop][ep][i][2],
        type= 'data',
        #symmetric= True,
        visible=True
    ),
    name= '-'.join([str(i),ep,pop])
) for pop in grid_whole.keys() for ep in grid_whole[pop].keys() for i in grid_whole[pop][ep].keys()
]

layout= go.Layout(
    title= 'self grid SSD sample size',
    xaxis= dict(
        title= 'relative sample size',
        range= [-.01, 1.03]
    ),
    yaxis= dict(
        title= 'sum of squared diffs'
    )
)
Figure= go.Figure(data= fig,layout=layout)

iplot(Figure)

In [52]:



def md_increment_compare(data, muted_dir= '', tag_ref= '_ss', max_size= 50):
    '''
    Parse data dictionary.
        data: {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}
    i: use sim and pop IDs to create dictionary connecting original populations to 
    subset populations created using ind_assignment_scatter_v1.
    ii. for each population for each reference, organise first by population sizes (not proportions).
    iii. calculate pairwise differences between sets of counts at contiguous sample sizes
    (set_SSD between every sub-sample of one size and every sub-sample of the previous size).

    - muted_dir: not used.
    - tag_ref: tag identifying sub-sampled simulations / populations.
    - max_size: only sample sizes strictly below this value are compared. None keeps every size.

    Returns:
        stats_dict: {ref_sim: {pop: {'means': [..], 'stds': [..], 'sizes': [..]}}}
        'means' / 'stds' summarise the distances between sizes[k] and the size preceding it.
    '''

    def _ref_pop_name(sub_pop):
        # Recover the reference population name from a sub-sample name.
        # str.strip(tag_ref) treats the tag as a set of characters and would also eat
        # trailing / leading 's' or '_' belonging to the population name; remove the exact suffix.
        # assumes the tag closes the part of the name before the first '.' - TODO confirm
        # against ind_assignment_scatter_v1.
        base= sub_pop.split('.')[0]
        if tag_ref and base.endswith(tag_ref):
            base= base[:-len(tag_ref)]
        return base

    # split simulations into references (0) and tagged sub-samples (1).
    avail= list(data.keys())
    ref_idx= [int(tag_ref in avail[x]) for x in range(len(avail) )]
    categ= {
        z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
    }

    #### sub-samples per population size per population per simulation
    pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

    for av in categ[1]:
        dat= [x for x in data[avail[av]]['counts'].keys() if tag_ref in x]
        dat_size= [round(data[avail[av]]['sizes'][x],3) for x in dat]

        ref_sim= avail[av].split(tag_ref)[0]
        ref_pop= [_ref_pop_name(x) for x in dat]

        for p in range(len(dat)):
            pop_asso[ref_sim][ref_pop[p]][dat_size[p]][avail[av]]= dat[p]

    ### compare count sets at contiguous sample sizes.
    stats_dict= recursively_default_dict()
    for ref_sim in pop_asso.keys():

        for pop in data[ref_sim]['counts'].keys():

            available_sizes= sorted(list(pop_asso[ref_sim][pop].keys()))
            if max_size is not None:
                available_sizes= [x for x in available_sizes if x < max_size]
            size_counts= {}

            for si in available_sizes:
                t= [data[sub_sim]['counts'][sub_pop] for sub_sim,sub_pop in pop_asso[ref_sim][pop][si].items()]
                # flatten to proportions; grids without any count are dropped.
                t= [x.reshape(-1) / np.sum(x) for x in t if np.sum(x)]

                size_counts[si]= np.array(t)

            stats_dict[ref_sim][pop]= {
                'means': [],
                'stds': []
            }

            for idx in range(1,len(available_sizes)):
                set1= size_counts[available_sizes[idx]]
                set2= size_counts[available_sizes[idx - 1]]

                dists= set_SSD(set1,set2)

                stats_dict[ref_sim][pop]['means'].append(np.mean(dists))
                stats_dict[ref_sim][pop]['stds'].append(np.std(dists))

            stats_dict[ref_sim][pop]['sizes']= available_sizes[1:]

    return stats_dict



def set_SSD(set1,set2):
    '''
    Sum of squared differences for every (vector of set1, vector of set2) combination.
    Returns a flat list ordered by set1 first, set2 second.
    '''
    return [
        np.sum((other - current)**2) for current in set1 for other in set2
    ]


In [53]:
# {ref_sim: {pop: {'means', 'stds', 'sizes'}}}: SSD between count sets at contiguous sample sizes.
stats_dict= md_increment_compare(data)

In [54]:
# group reference simulations by batch (text before the first 'C' of the simulation ID).
# NOTE: batch_dict is rebound here to {batch: [simulation IDs]}; earlier cells use the same name
# for {batch: [positions in avail_sub]}.
batch_keys= list(stats_dict.keys())
batch_list= [x.split('C')[0] for x in batch_keys]
batch_dict= {
    z: [batch_keys[x] for x in range(len(batch_keys)) if batch_list[x] == z] for z in list(set(batch_list))
}

## average results by pops within batches:
## assumes all simulations within the same batch have the same populations. 
## stacking into an array also assumes they share the same list of sample sizes.

batch_stats= {}

for ba in batch_dict.keys():
    pops= list(stats_dict[batch_dict[ba][0]].keys())
    
    # {pop: {'means' / 'stds' / 'sizes': array with one row per simulation}}
    pop_dict= {
        pop: {
            z: np.array([stats_dict[ref][pop][z] for ref in batch_dict[ba]]) for z in stats_dict[batch_dict[ba][0]][pops[0]].keys()
        } for pop in pops
    }
    
    # average across simulations ('sizes' included).
    pop_dict= {
        pop: {
            z: np.mean(g,axis= 0) for z,g in pop_dict[pop].items()
        } for pop in pops
    }
    
    batch_stats[ba]= pop_dict

            

In [55]:
# one trace per batch and population: mean SSD between contiguous sample sizes, error bars of 3 std.
# NOTE: x holds the sample sizes returned by md_increment_compare, not relative sizes.
fig= [go.Scatter(
    x= batch_stats[ep][pop]['sizes'],
    y= batch_stats[ep][pop]['means'],
    error_y= dict(
        array= batch_stats[ep][pop]['stds'] * 3,
        type= 'data',
        #symmetric= True,
        visible=True
    ),
    name= '-'.join([ep,pop])
) for ep in batch_stats.keys() for pop in batch_stats[ep].keys()
]

layout= go.Layout(
    title= 'self grid SSD sample size',
    xaxis= dict(
        title= 'relative sample size'
    ),
    yaxis= dict(
        title= 'sum of squared diffs'
    )
)
Figure= go.Figure(data= fig,layout=layout)

iplot(Figure)

In [75]:
from scipy import stats

# Nominal number of observations per group for the Welch test. The test is
# computed analytically from the summary statistics instead of from random
# normal draws, so the p-values are identical on every run.
n_nominal= 1000

thet= recursively_default_dict()

for ba in batch_stats.keys():
    for pop in batch_stats[ba].keys():
        sizes_avail= batch_stats[ba][pop]['sizes']
        means= batch_stats[ba][pop]['means']
        stds= batch_stats[ba][pop]['stds']

        # Compare every sample size with the one immediately before it.
        ts= []
        for idx in range(1,len(sizes_avail)):
            tstat= stats.ttest_ind_from_stats(
                mean1= means[idx], std1= stds[idx], nobs1= n_nominal,
                mean2= means[idx-1], std2= stds[idx-1], nobs2= n_nominal,
                equal_var= False
            )

            # Keep the two-sided p-value only.
            ts.append(tstat.pvalue)

        # ts[k] compares sizes_avail[k+1] with sizes_avail[k], hence the
        # sizes are reported from the second entry onward.
        thet[ba][pop]= {
            'ts': ts,
            'sizes': sizes_avail[1:]
        }



In [81]:
# Smallest positive normal float. P-values that underflow to exactly 0 are
# floored at this value so np.log stays finite (no divide-by-zero warning and
# no -inf points silently missing from the figure).
p_floor= np.finfo(float).tiny

# One marker trace per (batch, population): log p-value of the t-test
# between consecutive sample sizes.
fig= [go.Scatter(
    x= thet[ep][pop]['sizes'],
    y= np.log(np.maximum(thet[ep][pop]['ts'], p_floor)),
    name= '-'.join([ep,pop]),
    mode= 'markers'
) for ep in thet.keys() for pop in thet[ep].keys()
]

# Labels describe what is actually plotted (log p-values, not SSD).
layout= go.Layout(
    title= 'Welch t-test between consecutive sample sizes',
    xaxis= dict(
        title= 'relative sample size'
    ),
    yaxis= dict(
        title= 'log p-value'
    )
)
Figure= go.Figure(data= fig,layout=layout)

iplot(Figure)


divide by zero encountered in log



### Mutation profile 

- Extract mutation counts from the mutation counter output.
- Compare populations across mutation types using a chi-squared test.
- Return a matrix of **proportions or p-values** (*) _per_ population comparison, _per_ simulation.

The `data` dictionary below stores grids and respective significance indicators per simulation. 

In [7]:
sims_dir = main_dir + 'mutation_counter/data/sims/'

# Draw every available simulation without replacement, i.e. a random
# permutation of the keys of count_data.
available = list(count_data.keys())
subsamp = len(count_data)
avail_sub = np.random.choice(available, subsamp, replace=False)

# 1. Per-simulation grids and count proportions, in the shuffled order.
grids = [count_data[sim]['grids'] for sim in avail_sub]
props = [count_data[sim]['prop'] for sim in avail_sub]

# Summary statistics per simulation. Infinite values are not masked here.
grid_mean = list(map(np.mean, grids))
grid_std = list(map(np.std, grids))
prop_mean = list(map(np.mean, props))

# 2. Proportions across simulations: second entry of 'sizes' divided by the
# first, per simulation.
pop_proportions = [
    sizes[1] / sizes[0]
    for sizes in (count_data[sim]['sizes'] for sim in avail_sub)
]

# 3. Batch label of every simulation, and for each batch the positions
# (indices into avail_sub) of the simulations that belong to it.
batch_names = [count_data[sim]['batch'] for sim in avail_sub]
batch_dict = {
    label: [pos for pos, name in enumerate(batch_names) if name == label]
    for label in list(set(batch_names))
}


In [8]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


# One marker trace per batch: mean of each simulation's grid against its
# relative sampling. batch_dict[i] holds positions into the per-simulation
# lists pop_proportions, grid_mean and grid_std.
fig= [go.Scatter(
    x= [pop_proportions[x] for x in batch_dict[i]],
    y= [grid_mean[x] for x in batch_dict[i]],
        error_y=dict(
            type='data',
            # Select the batch's own entries so the error array lines up
            # with x and y; the full grid_std list covers every simulation.
            array=[grid_std[x] for x in batch_dict[i]],
            visible=False),
    name= str(i),
    mode= 'markers'
)  for i in list(batch_dict.keys())]

layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= dict(
        title= 'relative sampling',
        range= [0,1.05]
    ),
    yaxis= dict(
        title= 'mean  matrix p-val',
        range= [0,1.05]
    ),
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    )
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)

In [15]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


# One marker trace per batch: mean count proportion of each simulation
# against its relative sampling. batch_dict[i] holds positions into the
# per-simulation lists pop_proportions, prop_mean and props.
fig= [go.Scatter(
    x= [pop_proportions[x] for x in batch_dict[i]],
    y= [prop_mean[x] for x in batch_dict[i]],
        error_y=dict(
            type='data',
            # Spread of the plotted quantity itself (the proportions), taken
            # for this batch's simulations only so it lines up with x and y.
            array=[np.std(props[x]) for x in batch_dict[i]],
            visible=False),
    name= str(i),
    mode= 'markers'
)  for i in list(batch_dict.keys())]

layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= dict(
        title= 'relative sampling',
        range= [0,1.05]
    ),
    yaxis= dict(
        title= 'ave. count proportions',
        range= [0,1.05]
    ),
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    )
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)

In [13]:
# Batch label of every simulation and, per batch, the positions of its
# simulations in `available`.
batch_names= [count_data[s]['batch'] for s in available]
batch_dict= {
    z:[x for x, name in enumerate(batch_names) if name == z] for z in list(set(batch_names))
}

### 1. extract grids
## Keep exactly one entry per simulation, in the same order as `available`:
## batch_dict stores positions in `available`, so flattening the
## per-simulation grids into one long list would make those positions point
## at the wrong elements.
## NOTE(review): assumes each 'grids' entry converts to a regular numeric
## array - confirm against the producer of count_data.
grids= [np.asarray(count_data[s]['grids']) for s in available]

## mask infinite values, then compute mean and std per simulation,
## grouped by batch.
grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [[np.mean(grids[x]) for x in batch_dict[i]] for i in batch_dict.keys()]
grid_std=  [[np.std(grids[x]) for x in batch_dict[i]] for i in batch_dict.keys()]



In [23]:
import plotly.figure_factory as ff
import numpy as np

np.random.seed(1)

# Per-batch arrays of mean p-values with NaN entries removed.
hist_data = [
    batch_means[~np.isnan(batch_means)]
    for batch_means in (np.array(values) for values in grid_mean)
]
group_labels = list(batch_dict.keys())  # one label per batch

fig = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.005,
    show_rug=False,
)

fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices at 10Mb windows.'
)
fig['layout']['xaxis'].update(title='pval', range=[0, 1.03])
fig.update_layout(yaxis_title='density')

iplot(fig)

In [15]:

# 1. Grids of every simulation, grouped by batch (batch_dict stores positions
# in `available`).
grid_list = [
    [count_data[available[pos]]['grids'] for pos in members]
    for members in batch_dict.values()
]

# 2. For each batch, average every grid cell across the batch's simulations,
# ignoring NaN entries. Cells are visited row by row.
mut_means = []

for grids in grid_list:
    # All grids of a batch are indexed with the shape of the first one.
    shape_muts = grids[0].shape

    mut_pvals = [
        [grid[i, j] for grid in grids]
        for i in range(shape_muts[0])
        for j in range(shape_muts[1])
    ]

    mut_m = np.array([np.nanmean(cell_values) for cell_values in mut_pvals])
    mut_means.append(mut_m)



In [16]:
# Per-batch arrays of cell-wise mean p-values with NaN entries removed.
hist_data= mut_means
hist_data= [x[~np.isnan(x)] for x in hist_data]

group_labels = list(batch_dict.keys()) # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005, show_rug=False)


# The title says "per-cell" to tell this figure apart from the per-matrix
# distribution: each value here is one matrix cell averaged across simulations.
fig.update_layout(title_text='per-cell mean p-val distribution for mutation count matrices at 10Mb windows.')
fig['layout']["xaxis"].update(title= 'pval',range= [0,1.03])
fig.update_layout(yaxis_title= 'density')

# Render the figure; without this call the cell builds it but shows nothing.
iplot(fig)


In [45]:
# Sanity check: number of entries in `data`.
len(data)

31031

In [46]:
# Sanity check: number of simulation keys in `available`.
len(available)

31000

In [47]:
# Number of simulations assigned to each batch.
list(map(len, batch_dict.values()))

[304, 342, 354]