In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess

import collections
def recursively_default_dict():
    return collections.defaultdict(recursively_default_dict)

## Sim pipeline


In [2]:
from tools.SLiM_pipe_tools import (
    read_chrom_sizes, region_samplev2,
    fasta_RextractUnif, return_seqs, write_fastaEx, 
    process_recipe, SLiM_dispenserv1, 
)

>  Establish directories

### Launch Mutation counter. 


In [3]:

def mutation_counter_launch(logfile,count_dir= './count/', 
                dir_launch= '..',main_dir= './', outlog= 'muted.log'):
    '''
    launch mutation counter.
    - read mut.log to know which have not been yet processed.
    - launch process_chromosomes.py using simulation name. 
    '''
    with open(logfile,'r') as fp:
        lines= fp.readlines()
    
    
    sims= [x.strip() for x in lines]
    chroms= [x.split('.')[0].split('C')[-1].strip('chr') for x in sims]
    
    job= 'python process_chromosomes.py -c {} -r {} -s {} -v {}_ -q {} -d {}'
    
    sims= [job.format(chroms[x],*[sims[x]]*4,dir_launch) for x in range(len(sims))]
    
    os.chdir(count_dir)
    for sim in sims:
        
        os.system(sim)
    
    os.chdir(main_dir)

    with open(outlog,'a') as fp:
        fp.write('\n' + ''.join(lines))

    open(logfile,'w').close()



######################################
######################################

def dict_write(new_dict,inds,outemp= 'ind_assignments{}.txt',dir_sim= '',tag= ''):
    '''
    cofactor to ind_assignment_scatter()
    '''
    inds= np.array(inds)
    new_array= [[(inds[x,0],v) for x in new_dict[v]] for v in new_dict.keys()]
    new_array= list(it.chain(*new_array))
    new_array= ['\t'.join(x) for x in new_array]
    
    out= dir_sim + outemp.format(tag)
    
    with open(out,'w') as f:
        f.write('\n'.join(new_array))




def ind_assignment_scatter(reference,main_dir= '',pops= 'ind_assignments.txt',
                          min_size= 80, samp= [5,20,10], outemp= 'ind_assignments{}.txt'):
    '''
    read ind assignments for a given window; 
    chose one population;
    subset that pop in some way. 
    '''
    dir_sim= dir_launch + '/data/sims/' + reference + '/'
    ind_assignments= dir_sim + pops
    
    with open(ind_assignments,'r') as f:
        inds= f.readlines()
    
    inds= [x.split() for x in inds]
    pops= np.array(inds)[:,1]
    pop_dict= {
        z: [x for x in range(len(pops)) if pops[x] == z] for z in list(set(pops))
    }
    
    tag_list= []
    
    ## criterium of choice. chose only one pop.
    pop_chose= [x for x in pop_dict.keys() if len(pop_dict[x]) >= min_size]
    if len(pop_chose):
        pop_chose= pop_chose[0]
        N= len(pop_dict[pop_chose])
        pop_list= pop_dict[pop_chose]

        for each in np.linspace(samp[0],N,samp[1]):  
            each= int(each)
            for perm in range(samp[2]):
                tag= '_' + '.'.join([pop_chose,str(each),str(perm)])
                
                smaller= np.random.choice(pop_list,each,replace= False)
                smaller= [int(x in smaller) for x in pop_list]

                new_pop= {
                    tag + '.s' + str(z): [pop_list[x] for x in range(len(smaller)) if smaller[x] == z] for z in list(set(smaller))
                }
                
                new_dict= {v:g for v,g in pop_dict.items() if v != pop_chose}
                new_dict.update(new_pop)

                dict_write(new_dict,inds,outemp= outemp, dir_sim= dir_sim, tag= tag)

                tag_list.append(tag)

    return tag_list



def process_log(muted, sims_dir= ''):
    '''
    verify directories indicated in log exist in given directory.
    '''
    available= get_available_muts(muted)

    ### cleaning data set 
    ### i.e. accounting for aborted runs.
    available, miss_data= check_availability(available, dir_check=sims_dir)
    available, empty= clean_empty(available,str_format= '',dir_check= sims_dir,requested= ['.vcf.gz'])

    return available




def MC_sample_split(logfile, min_size= 80, samp= [5,20,10], pops= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log'):
    '''
    launch mutation counter pipeline on manipulated population assignments.
    '''
    
    sims= process_log(logfile,sims_dir= main_dir+sim_dir)
        
    job= 'python process_chromosomes.py -c {} -r {} -s {} -v {}_ -q {} -d {} -i {} -p {}'
    
    tags= []
    sim_extend= []
    chroms= []
    
    for sim in sims:
        
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')
        tag_list= ind_assignment_scatter(sim,main_dir= main_dir,pops= pops,
                          min_size= min_size, samp= samp, outemp= outemp)
        #print(tag_list)
        
        if len(tag_list):
            ###
            sim_extend.append(sim)
            tags.append('')
            chroms.append(chrom)
            ###
            sim_extend.extend([sim]*len(tag_list))
            tags.extend(tag_list)
            chroms.extend([chrom]*len(tag_list))
    
    sims= [job.format(chroms[x],*[sim_extend[x]]*4,dir_launch,tags[x],outemp.format(tags[x])) for x in range(len(sim_extend))]
    
    print(len(sims))
    os.chdir(count_dir)
    for sim in sims:
        
        os.system(sim)
    
    os.chdir(main_dir)
    
    return sim_extend, tags, chroms, sims


    

In [4]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
from tools.compare_utilities import (
    get_available_muts, count_compare, deploy_count, pops_from_sim, check_availability, clean_empty
)


## directories
main_dir= os.getcwd() + '/'
count_dir= main_dir + 'mutation_counter/count/'
dir_launch= main_dir + 'mutation_counter'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'
sims_dir= main_dir + 'mutation_counter/data/sims/'

mutlog= 'toMut.log'
min_size= 70
sampling= [5,20,10]

sim_extend, tags, chroms, sims= MC_sample_split(mutlog, min_size= min_size, samp= sampling, count_dir= count_dir, 
                        dir_launch= dir_launch,main_dir= main_dir,muted_dir= muted_dir)



30753


In [5]:
sim= process_log(mutlog,sims_dir= sims_dir)
len(sim)

259

In [6]:
def deploy_count_v2(sim_extend, tags, chroms, frequency_range= [0,1],muted_dir= './mutation_counter/data/mutation_count/',
                  sims_dir= './mutation_counter/data/sims/', outemp= 'ind_assignments{}.txt'):
    '''
    deploy count_compare() across simulations read
    '''
    
    #open(logfile,'w').close()
    
    #####
    ##### deploy counts, get data back
    
    data= {}
    missing= 0
    
    print(len(sim_extend))
    for idx in range(len(sim_extend)):
        sim= sim_extend[idx]
        tag= tags[idx]
        ind_file= outemp.format(tags[idx])
        
        pop_counts, num_variants, pop_sizes= count_per_pop(sim, frequency_range= frequency_range, tag= tag,
                                               muted_dir= muted_dir, sims_dir= sims_dir, ind_file= ind_file)
        
        if pop_counts:
        
            data[sim + tag] ={
                'counts':pop_counts, 
                'Nvars': num_variants,
                'sizes': pop_sizes
            }
        else:
            missing += 1
    
    print('{} missing indfiles.'.format(missing))
    return data



def count_per_pop(sim, frequency_range= [0,1], tag= '', muted_dir= './mutation_counter/data/mutation_count/',
                  sims_dir= './mutation_counter/data/sims/', exclude= False, ind_file= "ind_assignments.txt"):
    
    ''' perform pairwise population comparison of mutation counts for particular simulation'''
    pops= pops_from_sim(sim,sims_dir= sims_dir, ind_file= ind_file,pop_set= False)
    
    if len(pops)==0:
        return {}, {}, {}
    
    pop_sizes= {
        z: len([x for x in range(len(pops)) if pops[x] == z]) for z in list(set(pops))
    }
    pops= list(set(pops))
    
    ## chromosome 
    chromosomes= [sim.split('.')[0].split('C')[1]]
    chromosome_groups = [chromosomes]
    
    pop_counts, num_variants= get_counts(
            chromosomes, pops, frequency_range, exclude, 
            sim, muted_dir, tag= tag
        )
    
    return pop_counts, num_variants, pop_sizes


def pops_from_sim(sim,sims_dir= './mutation_counter/data/sims/',ind_file= "ind_assignments.txt",pop_set= True):
    '''read sim specific int to pop assignment, return pops.'''
    sim_dir= sims_dir + '{}/'.format(sim)
    avail= [name for name in os.listdir(sim_dir)]
    
    ID_file= sim_dir + ind_file
    
    pops= []
    
    if ID_file.split('/')[-1] in avail:
        with open(ID_file,'r') as sample_id_lines:
            for line in sample_id_lines:
                line= str.encode(line)
                sample_id, population = line.split()[:2]
                pops.append(population.decode())
    
    if pop_set:
        return list(set(pops))
    else:    
        return pops


def get_counts(chromosomes, pops, frequency_range, exclude, 
                short,muted_dir,tag= ''):

    outdir= muted_dir + '{}{}_finescale_mut_spectra_vcf.{}/'.format(short,tag,short)

    if exclude:
        files= read_exclude()
    else:
        files= {}

    pop_counts = {}
    num_variants = {}

    for pop in pops:
        path = (outdir + 'mut_type_v_allele_freq_' + pop + '_chr%s_nosingle.txt')
        pop_counts[pop] = frequency_breakdown(path, chromosomes,
                                              frequency_range)
        if exclude:

            for file in files:
                file_name= file.split('.')[0]
                repeats_path = (outdir + file_name + '_mut_type_v_allele_freq_' +
                                pop + '_chr%s_nosingle.txt')
                pop_counts[pop] -= frequency_breakdown(repeats_path, chromosomes,
                                                       frequency_range)
        
        num_variants[pop] = pop_counts[pop].sum()
    
    return pop_counts, num_variants




In [7]:
from tools.plot_utilities import Population, frequency_breakdown

store_now= list(sim_extend)

individually= False
exclude= False
p_value= 1e-5
frequency_range= [0,1]
extract= 'pval'

sim_extend, miss_data= check_availability(sim_extend, dir_check=sims_dir)
sim_extend, empty= clean_empty(sim_extend,str_format= '',dir_check= sims_dir,requested= ['.vcf.gz',])

sim_extend, miss_count= check_availability(sim_extend, str_format= '{}_finescale_mut_spectra_vcf.{}',
                                          dir_check=muted_dir)

sim_extend, empty_count= clean_empty(sim_extend,str_format= '{}_finescale_mut_spectra_vcf.{}',
                              dir_check= muted_dir,requested= ['mut_type_v'])

ladder= [len(sim_extend),len(empty_count),len(miss_count)]
print('available: {}; empty: {}, uncounted: {}'.format(*ladder))


data= deploy_count_v2(sim_extend, tags, chroms, frequency_range= frequency_range,
                                                muted_dir= muted_dir)



available: 29949; empty: 804, uncounted: 0
29949
8610 missing indfiles.


In [8]:


comp = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
}
ypos, ylabel = [], []

mut_index = {}
row, col = 0, 0

for b2, d in [('A', 'T'), ('A', 'C'), ('A', 'G'),
              ('C', 'T'), ('C', 'G'), ('C', 'A')]:
    for b1 in 'ACGT':
        col = 0
        ypos.append(row+0.5)
        if b1 == 'T' and b2 == 'C' and d == 'A':
            ylabel.append('5\'-'+b1)
        elif b1 == 'C':
            ylabel.append(b2+r'$\to$'+d+r'  '+b1)
        else:
            ylabel.append(b1)
        for b3 in 'ACGT':
            mut_index[(b1+b2+b3, d)] = (row, col)
            mut_index[(comp[b3]+comp[b2]+comp[b1], comp[d])] = (row, col)
            col += 1
        row += 1



def heatmap_v1(chromosomes,population_dict,frequency_range, exclude, 
                p_value, muted_dir,tag= '',output= 'pval'):


    if exclude:
        files= read_exclude()
    else:
        files= {}

    pop_counts = {}
    num_variants = {}

    for sim in population_dict.keys():
        pop= population_dict[sim]
        sim= sim.split('_')
        short= sim[0]
        if len(sim) > 1:
            tag= '_' + sim[1]
        else: tag= ''
        
        outdir= muted_dir + '{}{}_finescale_mut_spectra_vcf.{}/'.format(short,tag,short)
        path = (outdir + 'mut_type_v_allele_freq_' + pop + '_chr%s_nosingle.txt')
        pop_counts[pop] = frequency_breakdown(path, chromosomes,
                                              frequency_range)
        if exclude:

            for file in files:
                file_name= file.split('.')[0]
                repeats_path = (outdir + file_name + '_mut_type_v_allele_freq_' +
                                pop + '_chr%s_nosingle.txt')
                
                pop_counts[pop] -= frequency_breakdown(repeats_path, chromosomes,
                                                       frequency_range)
                

        num_variants[pop] = pop_counts[pop].sum()
    
    
    refpop, pop = list(pop_counts.keys())

    ratio_grid = np.zeros((row, col))
    sig_x, sig_y = [], []
    
    for i in range(row):
        for j in range(col):
            chi_array= np.array([
                    [pop_counts[pop][i][j], num_variants[pop]],
                    [pop_counts[refpop][i][j], num_variants[refpop]]
                ])

            chi_0= np.sum(chi_array,axis= 1)
            chi_1= np.sum(chi_array,axis= 0)
            
            if chi_0[0] == 0 or chi_0[1] == 0:
                ratio_grid[i][j] = np.nan
                sig_x.append(j+0.5)
                sig_y.append(i+0.5)
            
            elif chi_1[0] == 0 or chi_1[1] == 0:
                ratio_grid[i][j] = 1
            
            else:
                #print(chi_array)
                ##
                _, this_pval, _, _ = chi2_contingency(
                    chi_array
                )
                if output == 'pval':
                    ratio_grid[i][j] = this_pval
                else:
                    ratio_grid[i][j] = (pop_counts[pop][i][j] * num_variants[refpop] /
                                        (num_variants[pop] * pop_counts[refpop][i][j]))
                if this_pval < p_value:
                    sig_x.append(j+0.5)
                    sig_y.append(i+0.5)

    return ratio_grid, (sig_x, sig_y)



comp = {
    'A': 'T',
    'C': 'G',
    'G': 'C',
    'T': 'A',
}
ypos, ylabel = [], []

mut_index = {}
row, col = 0, 0

for b2, d in [('A', 'T'), ('A', 'C'), ('A', 'G'),
              ('C', 'T'), ('C', 'G'), ('C', 'A')]:
    for b1 in 'ACGT':
        col = 0
        ypos.append(row+0.5)
        if b1 == 'T' and b2 == 'C' and d == 'A':
            ylabel.append('5\'-'+b1)
        elif b1 == 'C':
            ylabel.append(b2+r'$\to$'+d+r'  '+b1)
        else:
            ylabel.append(b1)
        for b3 in 'ACGT':
            mut_index[(b1+b2+b3, d)] = (row, col)
            mut_index[(comp[b3]+comp[b2]+comp[b1], comp[d])] = (row, col)
            col += 1
        row += 1




def heatmap_v2(chromosomes,pop_counts, num_variants, population_dict,frequency_range, exclude, 
                p_value, muted_dir,tag= '',output= 'pval'):


    if exclude:
        files= read_exclude()
    else:
        files= {}
    
    refpop, pop = list(pop_counts.keys())

    ratio_grid = np.zeros((row, col))
    sig_x, sig_y = [], []
    
    for i in range(row):
        for j in range(col):
            chi_array= np.array([
                    [pop_counts[pop][i][j], num_variants[pop]],
                    [pop_counts[refpop][i][j], num_variants[refpop]]
                ])

            chi_0= np.sum(chi_array,axis= 1)
            chi_1= np.sum(chi_array,axis= 0)
            
            if chi_0[0] == 0 or chi_0[1] == 0:
                ratio_grid[i][j] = np.nan
                sig_x.append(j+0.5)
                sig_y.append(i+0.5)
            
            elif chi_1[0] == 0 or chi_1[1] == 0:
                ratio_grid[i][j] = 1
            
            else:
                #print(chi_array)
                ##
                _, this_pval, _, _ = chi2_contingency(
                    chi_array
                )
                if output == 'pval':
                    ratio_grid[i][j] = this_pval
                else:
                    ratio_grid[i][j] = (pop_counts[pop][i][j] * num_variants[refpop] /
                                        (num_variants[pop] * pop_counts[refpop][i][j]))
                if this_pval < p_value:
                    sig_x.append(j+0.5)
                    sig_y.append(i+0.5)

    return ratio_grid, (sig_x, sig_y)



In [37]:
from tools.mcounter_tools_II import mcounter_deploy

p_value= 1e-5
test_m= 'fisher'
individually= False
exclude= False
frequency_range= [0,1]
extract= 'pval'
tag_ref='_pop'

pop_asso, count_data= mcounter_deploy(data,p_value= p_value, test_m= test_m, individually= individually,
                                        exclude= exclude, frequency_range= frequency_range, extract= extract,
                                     muted_dir= muted_dir,tag_ref=tag_ref)

KeyError: '1'

In [45]:
avail= list(data.keys())
ref_idx= [int('pop' in avail[x]) for x in range(len(avail) )]
categ= {
    z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
}

pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

for av in categ[1]:
    dat= [x for x in data[avail[av]]['counts'].keys() if '_' in x]
    ref_sim= avail[av].split('_')[0]
    ref_pop= [x.split('.')[0].strip('_') for x in dat]
    for p in range(len(dat)):
        pop_asso[ref_sim][ref_pop[p]][avail[av]]= dat[p]


d= 0
count_data= recursively_default_dict()

for ref in pop_asso.keys():
    
    for pop in pop_asso[ref].keys():
        for sub in pop_asso[ref][pop].keys():
            
            pop_dict= {
                ref: pop,
                sub: pop_asso[ref][pop][sub]
            }
            
            sizes= [data[ref]['sizes'][pop], data[sub]['sizes'][pop_asso[ref][pop][sub]]]
            #print(sizes)
            
            chromosomes= [x.split('.')[0].split('C')[1] for x in pop_dict.keys()]
            
            pop_counts= {
                x: data[x]['counts'][z] for x,z in pop_dict.items() 
            }
            
            num_variants= {
                x: data[x]['Nvars'][z] for x,z in pop_dict.items() 
            }
            
            ratio_grid, sig_cells= heatmap_v2(chromosomes,pop_counts,num_variants,
                                              pop_dict,frequency_range, exclude, 
                                                p_value, muted_dir,tag= '',output= 'pval')
            
            dist_prop= pop_counts[sub] / pop_counts[ref]
            
            count_data[d]= {
                'grids': ratio_grid,
                'sigs': sig_cells,
                'sizes': sizes,
                'prop': dist_prop
            }
            
            d += 1




invalid value encountered in true_divide



## Data analysis

So, we calculated mutation type counts for each population and compared them accross simulations. We are interested to see if sampling could have an impact on the variance of count differences. 

The function `heatmap` returns a matrix of count proportions across types for each pairwise comparison. We will calculate the variance of each matrix, and plot it against the relative sampling across populations

### Mutation profile 

- Extract mutation counts from mutation counter output
- Compare populations across mutation types using Chi2
- return matrix of **proportions or p-vals** (*) _per_ population comparison _per_ simulation.

The `data` dictionary below stores grids and respective significance indicators per simulation. 

In [46]:
sims_dir= main_dir + 'mutation_counter/data/sims/'

available= list(count_data.keys())
### 1. extract grids
grids= [count_data[s]['grids'] for s in available]

## mask infinite values and compute std.
#grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [np.mean(x) for x in grids] 
grid_std= [np.std(x) for x in grids]

### 2. calculate proportions across smulations
pop_proportions= [min(count_data[s]['sizes']) / max(count_data[s]['sizes']) for s in available]


> plotting

In [47]:
sims_dir= main_dir + 'mutation_counter/data/sims/'

available= list(count_data.keys())
subsamp= len(count_data)
avail_sub= np.random.choice(available,subsamp,replace= False)

### 1. extract grids
grids= [count_data[s]['grids'] for s in avail_sub]
props= [count_data[s]['prop'] for s in avail_sub]

#grids= list(it.chain(*grids))

## mask infinite values and compute std.
#grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [np.mean(x) for x in grids] 
grid_std= [np.std(x) for x in grids]
prop_mean= [np.mean(x) for x in props]

### 2. calculate proportions across smulations
pop_proportions= [count_data[s]['sizes'][1] / count_data[s]['sizes'][0] for s in avail_sub]

### 3. batch names
batch_names= ['test'] * len(available)
batch_dict= {
    z:[x for x in range(len(avail_sub)) if batch_names[x] == z] for z in list(set(batch_names))
}


In [48]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


####
####
fig= [go.Scatter(
    x= [pop_proportions[x] for x in batch_dict[i]],
    y= [grid_mean[x] for x in batch_dict[i]],
        error_y=dict(
            type='data', 
            array=grid_std,
            visible=False),
    name= str(i),
    mode= 'markers'
)  for i in list(batch_dict.keys())]

layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= dict(
        title= 'relative sampling',
        range= [0,1.05]
    ),
    yaxis= dict(
        title= 'mean  matrix p-val',
        range= [0,1.05]
    ),
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    )
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)

In [49]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


####
####
fig= [go.Scatter(
    x= [pop_proportions[x] for x in batch_dict[i]],
    y= [prop_mean[x] for x in batch_dict[i]],
        error_y=dict(
            type='data', 
            array=grid_std,
            visible=False),
    name= str(i),
    mode= 'markers'
)  for i in list(batch_dict.keys())]

layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= dict(
        title= 'relative sampling',
        range= [0,1.05]
    ),
    yaxis= dict(
        title= 'ave. count proportions',
        range= [0,1.05]
    ),
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    )
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)

In [20]:

### 1. extract grids
grids= [count_data[s]['grids'] for s in available]
#grids= list(it.chain(*grids))

## mask infinite values and compute std.
grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [np.mean(x) for x in grids] 
grid_std= [np.std(x) for x in grids]




In [21]:
import plotly.figure_factory as ff
import numpy as np

np.random.seed(1)

hist_data= np.array(grid_mean)
hist_data= hist_data[~np.isnan(hist_data)]
hist_data= [hist_data]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005,show_rug=False)


fig.update_layout(title_text='mean p-val distribution for mutation count matrices at 10Mb windows.')
fig.update_layout(xaxis_title= 'pval')
fig.update_layout(yaxis_title= 'density')

iplot(fig)

In [67]:

### 1. extract grids
grids= [count_data[s]['grids'] for s in available]
print(list(set([s.shape for s in grids])))
#grids= list(it.chain(*grids))
shape_muts= grids[0].shape

## mask infinite values and compute std.
#grids= [np.ma.masked_where(a == np.inf, a) for a in grids]

mut_pvals= []

for i in range(shape_muts[0]):
    for j in range(shape_muts[1]):
        mut_vec= [x[i,j] for x in grids]
        mut_pvals.append(mut_vec)

mut_means= [np.nanmean(x) for x in mut_pvals]
mut_means= np.array(mut_means)


[(24, 4)]


In [69]:
hist_data= mut_means[~np.isnan(mut_means)]
hist_data= [hist_data]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005, show_rug=False)


fig.update_layout(title_text='mean p-val distribution for mutation count matrices at 10Mb windows.')
fig.update_layout(xaxis_title= 'pval')
fig.update_layout(yaxis_title= 'density')


In [50]:
len(data)

16314

In [51]:
len(grids)

16190

In [52]:
len(available)

16190