In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess

import collections
def recursively_default_dict():
    '''
    Build an arbitrarily nested dictionary.
    Missing keys are created on access as new dictionaries of the same kind,
    so `d[a][b][c]= value` works without initialising intermediate levels.
    '''
    nested= collections.defaultdict(recursively_default_dict)
    return nested

In [2]:
from tools.SLiM_pipe_tools import (
    read_chrom_sizes, region_samplev2,
    fasta_RextractUnif, return_seqs, write_fastaEx, 
    process_recipe, SLiM_dispenserv1, 
)


from tools.fasta_utilities import (
    reference_sequence, get_mutations, get_by_path, 
    set_by_path, fasta_get_freq, get_complement,
    complement_dicts,collapse_freqs, kmer_dict_init,
    vcf_kmers, geno_kmers, vcf_muts_matrix,
    geno_muts_v2, kmer_comp_index, kmer_mut_index
)



In [7]:
### New, adapt to using matrices instead of written files. 


def ind_assignment_scatter_v1(reference,dir_sim= '',indfile= 'ind_assignments.txt',
                          min_size= 80, samp= [5,20,10], outemp= 'ind_assignments{}.txt',write= False):
    '''
    read ind assignments for a given window; 
    chose one population;
    subset that pop in some way.
    - v1: instead of writting new pop_assignment files, return them. 
    '''
    
    ind_assignments= dir_sim + reference + '/' + indfile
    
    with open(ind_assignments,'r') as f:
        inds= f.readlines()
    
    inds= [x.split() for x in inds]
    pops= np.array(inds)[:,1]
    pop_dict= {
        z: [x for x in range(len(pops)) if pops[x] == z] for z in list(set(pops))
    }
    
    tag_list= []
    tag_dict= {}
    
    ## criterium of choice. chose only one pop.
    pop_chose= [x for x in pop_dict.keys() if len(pop_dict[x]) >= min_size]
    if len(pop_chose):
        pop_chose= pop_chose[0]
        N= len(pop_dict[pop_chose])
        pop_list= pop_dict[pop_chose]

        for each in np.linspace(samp[0],N,samp[1]):  
            each= int(each)
            for perm in range(samp[2]):
                tag= '_' + '.'.join([pop_chose,str(each),str(perm)])
                
                smaller= np.random.choice(pop_list,each,replace= False)
                smaller= [int(x in smaller) for x in pop_list]

                new_pop= {
                    tag + '.s' + str(z): [pop_list[x] for x in range(len(smaller)) if smaller[x] == z] for z in list(set(smaller))
                }
                
                new_dict= {v:g for v,g in pop_dict.items() if v != pop_chose}
                new_dict.update(new_pop)
                
                if write:
                    dict_write(new_dict,inds,outemp= outemp, dir_sim= dir_sim, tag= tag)
                else:
                    tag_dict[tag]= new_dict
                tag_list.append(tag)

    if write:
        return tag_list
    else: 
        return tag_list, tag_dict, pop_dict



def MC_sample_matrix(logfile, min_size= 80, samp= [5,20,10], pops= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log', row= 24,col= 4,exclude= False):
    '''
    launch mutation counter pipeline on manipulated population assignments.
    Use matrix multiplication to extract counts. 

    For every simulation under main_dir + sim_dir: read the VCF and reference
    fasta, build the SNP x mutation-type matrix, collapse genotypes to an
    individual x mutation-type matrix, then sum rows per population for the
    original assignment and for every sub-sampled assignment returned by
    ind_assignment_scatter_v1.

    Returns a dict keyed by simulation name (original) and simulation name + tag
    (sub-sampled); each value holds 'counts' (row x col array per population),
    'Nvars' (total count per population) and 'sizes' (individuals per population).

    logfile, pops, count_dir, dir_launch, muted_dir and outlog are accepted but
    not used by this function.
    '''

    ## resolved locally; the function previously read a notebook-level `sims_dir`.
    sims_dir= main_dir + sim_dir

    def _summarise(ind_mat, assignment):
        # Sum the individual x mutation-type rows of each population.
        return {
            'counts': {
                z: np.sum(ind_mat[g,:],axis= 0).reshape(row,col) for z,g in assignment.items()
            },
            'Nvars': {
                z: np.sum(ind_mat[g,:]) for z,g in assignment.items()
            },
            'sizes': {z:len(g) for z,g in assignment.items()}
        }

    sims= process_dir(sims_dir= sims_dir)
    print(len(sims))

    data= {}

    for sim in sims:

        ## chromosome
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        header_info= 9
        phased= False
        vcf_dir= sims_dir + sim + '/'
        vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        genotype, summary, Names= read_geno_nanumv3(vcf_file, header_info= header_info,phased= phased)

        ## read fasta; the sequence is taken from the second line only.
        fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

        with gzip.open(fasta_file,'r') as f:
            lines= f.readlines()
            lines= [x.decode() for x in lines]

        refseq= lines[1].strip()

        ###
        positions= [int(x) for x in summary.POS]
        wstart= int(min(positions))
        wend= int(max(positions))

        ksize= 3 # odd.
        bases = 'ACGT'
        collapsed= True

        ## same 0-based window test as vcf_muts_matrix_v1, so columns stay aligned.
        genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
        Window= genotype[:,genotype_parse]
        subset_summary= summary.loc[genotype_parse,:].reset_index()

        ##
        mut_matrix, flag_reverse= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases, collapse= collapsed)
        if flag_reverse:
            ## alt allele matches the fasta base: swap allele counts (codes 0-2).
            Window[:,flag_reverse]= 2 - Window[:,flag_reverse]

        ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)

        ## ind_assignment_scatter_v1 takes `dir_sim`; it has no `main_dir` keyword.
        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sims_dir,
                          min_size= min_size, samp= samp, outemp= outemp)

        ## counts for no tag sim:
        data[sim]= _summarise(ind_collapsed_mat, pop_dict)

        ## counts for every sub-sampled assignment.
        for tag in tag_list:
            data[sim + tag]= _summarise(ind_collapsed_mat, tag_dict[tag])

    return data




def count_popKmers(Window, mut_matrix, pop_dict, single= True, frequency_range= [0,1],row=24,col=4):
    '''
    Extract population mutation counts from a genotype window.

    For each population in pop_dict (label -> row indices of Window):
    - zero the SNP columns whose per-population value (column sum / number of
      individuals) lies outside frequency_range;
    - if single, reduce the population to one presence/absence row per SNP;
    - collapse to mutation types with geno_muts_v2 and sum over rows.

    Returns a dict with 'counts' (row x col array per population), 'Nvars'
    (total count per population) and 'sizes' (individuals per population).
    '''
    freq_min, freq_max= frequency_range[0], frequency_range[1]

    counts_by_pop= {}
    totals_by_pop= {}

    for label, members in pop_dict.items():
        ## presumably a copy (list indexing on an ndarray), so Window itself is untouched - TODO confirm type.
        sub_geno= Window[members,:]

        ## NOTE(review): divides by individuals, not alleles; with 0-2 genotype codes
        ## this value can reach 2 - confirm the intended scale of frequency_range.
        allele_freq= np.sum(sub_geno,axis= 0) / sub_geno.shape[0]
        outside= (allele_freq < freq_min) | (allele_freq > freq_max)
        sub_geno[:,outside]= 0

        if single:
            present= np.sum(sub_geno,axis= 0) > 0
            sub_geno= np.array(present,dtype= int).reshape(1,len(present))

        collapsed= geno_muts_v2(sub_geno, mut_matrix)

        counts_by_pop[label]= np.sum(collapsed,axis= 0).reshape(row,col)
        totals_by_pop[label]= np.sum(collapsed)

    sizes_by_pop= {z:len(g) for z,g in pop_dict.items()}

    return {
        'counts': counts_by_pop,
        'Nvars': totals_by_pop,
        'sizes': sizes_by_pop
    }





def MC_sample_matrix_v1(min_size= 80, samp= [5,20,10], frequency_range= [0,1],pops= 'ind_assignments.txt', outemp= 'ind_assignments{}.txt',
                    count_dir= './count/', dir_launch= '..',main_dir= './', sim_dir= 'mutation_counter/data/sims/', muted_dir= 'mutation_counter/data/mutation_count/',
                    outlog= 'indy.log', row= 24,col= 4, single= True, exclude= False):
    '''
    launch mutation counter pipeline on manipulated population assignments.
    Use matrix multiplication to extract counts. 
    - v1 relies on count_popKmers() function to count mutations per pop. allows freq. filter and single mutation count.  

    For every simulation under sim_dir: read the VCF and reference fasta, build
    the SNP x mutation-type matrix, then count mutations per population for the
    original assignment and for every sub-sampled assignment returned by
    ind_assignment_scatter_v1. Simulations with an empty genotype array, or with
    fewer genotype rows than assigned individuals, are skipped.

    Returns a dict keyed by simulation name (original) and simulation name + tag
    (sub-sampled); values are the dicts returned by count_popKmers.

    pops, count_dir, dir_launch, main_dir, muted_dir and outlog are accepted but
    not used by this function.
    '''

    sims= process_dir(sims_dir= sim_dir)
    print(len(sims))

    data= {}

    for sim in sims:

        ## chromosome
        chrom= sim.split('.')[0].split('C')[-1].strip('chr')

        if exclude:
            files= read_exclude()
        else:
            files= {}

        ### read vcf
        ## built from the `sim_dir` argument; previously read a notebook-level `sims_dir`.
        vcf_dir= sim_dir + sim + '/'
        vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'

        genotype, summary, Names= read_vcf_allel(vcf_file)

        if len(genotype) == 0:
            continue

        print(genotype.shape, sim)
        ## read fasta; the sequence is taken from the second line only.
        fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

        with gzip.open(fasta_file,'r') as f:
            lines= f.readlines()
            lines= [x.decode() for x in lines]

        refseq= lines[1].strip()

        ###
        positions= [int(x) for x in summary.POS]
        wstart= int(min(positions))
        wend= int(max(positions))

        ksize= 3 # odd.
        bases = 'ACGT'
        collapsed= True

        ## same 0-based window test as vcf_muts_matrix_v1, so columns stay aligned.
        genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
        Window= genotype[:,genotype_parse]
        subset_summary= summary.loc[genotype_parse,:].reset_index()

        ##
        mut_matrix, flag_reverse= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases, collapse= collapsed)
        if flag_reverse:
            ## alt allele matches the fasta base: swap allele counts (codes 0-2).
            Window[:,flag_reverse]= 2 - Window[:,flag_reverse]

        ## the full-window geno_muts_v2 product was computed here but never read;
        ## count_popKmers performs the collapse per population.

        tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir,
                          min_size= min_size, samp= samp, outemp= outemp)

        ## assignment file lists more individuals than the VCF holds: skip.
        total_inds= sum([len(x) for x in pop_dict.values()])
        if Window.shape[0] < total_inds:
            continue

        ## counts for no tag sim:
        data[sim]= count_popKmers(Window, mut_matrix, pop_dict, single= single, 
                                  frequency_range= frequency_range,row=row,col=col)

        ## counts for every sub-sampled assignment.
        for tag in tag_list:
            data[sim + tag]= count_popKmers(Window, mut_matrix, tag_dict[tag], single= single, 
                                  frequency_range= frequency_range,row=row,col=col)

    return data



def vcf_muts_matrix_v1(refseq,summary,start= 0,end= 0,ksize= 3,bases='ATCG', collapse= True):
    ''' 
    Return matrix of mutation contexts by SNP in genotype array
    Each mutation is mapped to list of possible mutations as a binary vector.
    - v1 determines if alternative allele = reference allele in fasta. 
        if so, allele is switched, position idx is flagged. 

    Parameters
    - refseq: reference sequence string, indexed with 0-based POS - 1.
    - summary: table with POS, REF and ALT columns, indexable by row number.
    - start, end: 0-based inclusive window; end == 0 means the largest POS.
    - ksize: context size (odd); the SNP sits at index int(ksize/2) of the k-mer.
    - bases: alphabet passed to get_mutations.
    - collapse: if True use the collapsed index from kmer_comp_index,
        otherwise the full index from kmer_mut_index.

    Returns (pos_mut, flag_reverse): pos_mut is mutation types x SNPs with a
    single 1 per column (all zeros when no valid context is available);
    flag_reverse lists the row numbers whose ALT equals the fasta base.
    '''

    mutations= get_mutations(bases= bases,ksize= ksize)
    kmers, kmer_idx= kmer_comp_index(mutations)

    mut_lib= kmer_mut_index(mutations)

    if end == 0:
        ## numeric maximum even when POS is stored as strings.
        end= max(int(p) for p in summary.POS)

    k5= int(ksize/2)
    k3= ksize - k5
    n_types= len(kmer_idx) if collapse else len(mutations)
    pos_mut= []
    flag_reverse= []

    for x in range(summary.shape[0]):
        pos= int(summary.POS[x]) - 1
        if pos < start or pos > end:
            continue

        ## a negative slice start would wrap around the sequence: treat as no context.
        kmer= refseq[pos-k5: pos + k3] if pos >= k5 else ''
        mut= kmer + summary.ALT[x]

        ## centre base is at k5 (index 1 only when ksize == 3); a truncated k-mer is
        ## never flagged, its column is all zeros either way.
        if len(kmer) == ksize and kmer[k5] == summary.ALT[x]:
            flag_reverse.append(x)
            mut= kmer+summary.REF[x]

        mut_array= np.zeros(n_types)

        ## anything but k-mer + one base (truncated context, multi-base allele): report, leave zeros.
        if len(mut) != ksize + 1:
            print(kmer)
            print(summary.REF[x],summary.ALT[x])
            print(x,pos)
            print(len(refseq),summary.shape[0])
            pos_mut.append(mut_array)
            continue

        if collapse:
            mut_index= kmers[mut]
        else:
            mut_index= get_by_path(mut_lib, list(mut))

        mut_array[mut_index]= 1
        pos_mut.append(mut_array)

    pos_mut= np.array(pos_mut).T

    return pos_mut, flag_reverse


In [None]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
import re
import pandas as pd
from tools.compare_utilities import (
    get_available_muts, count_compare, deploy_count, pops_from_sim, check_availability, clean_empty
)


## directories: everything hangs off <cwd>/mutation_counter
main_dir= os.getcwd() + '/'
dir_launch= main_dir + 'mutation_counter'
count_dir= dir_launch + '/count/'
muted_dir= dir_launch + '/data/mutation_count/'
sims_dir= dir_launch + '/data/sims_1000G_10MB/'

## sampling settings
mutlog= 'toMut.log'
min_size= 70          # smallest population eligible for sub-sampling
sampling= [5,100,5]   # [smallest sub-sample, number of sizes, replicates per size]

## count mutations for every simulation and every sub-sampled assignment.
run_settings= dict(
    min_size= min_size,
    samp= sampling,
    count_dir= count_dir,
    dir_launch= dir_launch,
    main_dir= main_dir,
    sim_dir= sims_dir,
    muted_dir= muted_dir,
    exclude= False
)

data = MC_sample_matrix_v1(**run_settings)


missing: 0, no vcf: 0
10
/mnt/d/GitHub/fine-scale-mutation-spectrum-master/sim_compare/mutation_counter/data/sims_1000G_10MB/HarrisC1.172486864/HarrisC1.172486864_chr1.vcf.gz
dict_keys(['samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM', 'variants/FILTER_PASS', 'variants/ID', 'variants/POS', 'variants/QUAL', 'variants/REF'])
mutliple ref loci: 4507
(1092, 127144) HarrisC1.172486864


## Data analysis

We calculated mutation-type counts for each population and compared them across simulations. We are interested in whether sampling affects the variance of the count differences.

The function `heatmap_v2` returns a matrix of p-values or count proportions across mutation types for each pairwise comparison. We will calculate the variance of each matrix, and plot it against the relative sampling across populations.

In [76]:
### new, 
from fisher import pvalue

def heatmap_v2(chromosomes,pop_counts, num_variants, population_dict,frequency_range, exclude, 
                p_value, muted_dir,tag= '',output= 'pval',row= 24, col= 4, test= 'fisher'):

    '''
    Cell-wise comparison of two mutation count matrices.
    - v2: count matrices are provided in pop_counts dictionary. 

    pop_counts must hold exactly two entries (reference first); num_variants
    holds the matching totals. For each cell a 2x2 table
    [[count, total], [reference count, reference total]] is tested with
    chi2_contingency (test == 'chi2') or the fisher two-tailed p-value (otherwise).

    Returns (ratio_grid, (sig_x, sig_y)):
    - ratio_grid holds the p-value (output == 'pval') or the proportion ratio;
      NaN where a table row sums to zero, 1 where a table column sums to zero.
    - sig_x / sig_y are cell centres (j+0.5, i+0.5) of cells below p_value,
      plus the cells set to NaN.

    chromosomes, population_dict, frequency_range, muted_dir and tag are not used here.
    '''
    files= read_exclude() if exclude else {}

    refpop, pop = list(pop_counts.keys())

    ratio_grid = np.zeros((row, col))
    sig_x= []
    sig_y= []

    total= num_variants[pop]
    ref_total= num_variants[refpop]

    for i in range(row):
        for j in range(col):
            observed= pop_counts[pop][i][j]
            ref_observed= pop_counts[refpop][i][j]

            table= np.array([
                    [observed, total],
                    [ref_observed, ref_total]
                ])

            by_row= table.sum(axis= 1)
            by_col= table.sum(axis= 0)

            ## a population with no counts at all: undefined, flagged.
            if by_row[0] == 0 or by_row[1] == 0:
                ratio_grid[i][j] = np.nan
                sig_x.append(j+0.5)
                sig_y.append(i+0.5)
                continue

            ## neither population shows this mutation type (or both totals are zero).
            if by_col[0] == 0 or by_col[1] == 0:
                ratio_grid[i][j] = 1
                continue

            if test == 'chi2':
                this_pval= chi2_contingency(table)[1]
            else:
                p= pvalue(observed, total, ref_observed, ref_total)
                this_pval= p.two_tail

            if output == 'pval':
                ratio_grid[i][j] = this_pval
            else:
                ratio_grid[i][j] = (observed * ref_total /
                                    (total * ref_observed))

            if this_pval < p_value:
                sig_x.append(j+0.5)
                sig_y.append(i+0.5)

    return ratio_grid, (sig_x, sig_y)



In [None]:
### new - pair reference sims and subsetted populations.
### extract kmer comparisons (proportions or pvals) using heatmap_v2.
### make function. 

individually= False
exclude= False
p_value= 1e-5
test_m= 'fisher'
frequency_range= [0,1]
extract= 'pval'   # forwarded to heatmap_v2 as `output`


## split data keys: 0 = reference sims, 1 = sub-sampled sims (key contains 'pop').
## presumably population labels contain 'pop' and simulation names do not - verify.
avail= list(data.keys())
ref_idx= [int('pop' in avail[x]) for x in range(len(avail) )]
categ= {
    z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
}

## reference sim -> population -> sub-sampled sim -> sub-population label.
pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

for av in categ[1]:
    ## sub-population labels start with the '_' of the tag.
    dat= [x for x in data[avail[av]]['counts'].keys() if '_' in x]
    ref_sim= avail[av].split('_')[0]
    ref_pop= [x.split('.')[0].strip('_') for x in dat]
    for p in range(len(dat)):
        ## labels of one sub-sampled sim share a key: the last one listed is kept.
        pop_asso[ref_sim][ref_pop[p]][avail[av]]= dat[p]


d= 0
count_data= recursively_default_dict()

for ref in pop_asso.keys():
    
    for pop in pop_asso[ref].keys():
        for sub in pop_asso[ref][pop].keys():
            
            batch= ref.split('C')[0]
            
            ## reference entry first: heatmap_v2 takes the first key as reference.
            pop_dict= {
                ref: pop,
                sub: pop_asso[ref][pop][sub]
            }
            
            sizes= [data[ref]['sizes'][pop], data[sub]['sizes'][pop_asso[ref][pop][sub]]]
            #print(sizes)
            
            chromosomes= [x.split('.')[0].split('C')[1] for x in pop_dict.keys()]
            
            pop_counts= {
                x: data[x]['counts'][z] for x,z in pop_dict.items() 
            }
            
            num_variants= {
                x: data[x]['Nvars'][z] for x,z in pop_dict.items() 
            }
            
            ## `output` now follows the `extract` setting instead of a hard-coded 'pval'.
            ratio_grid, sig_cells= heatmap_v2(chromosomes,pop_counts,num_variants,
                                              pop_dict,frequency_range, exclude, 
                                                p_value, muted_dir,tag= '',test= test_m,output= extract)
            
            ## cell-wise ratio of raw counts, sub-sample over reference.
            dist_prop= pop_counts[sub] / pop_counts[ref]
            
            count_data[d]= {
                'grids': ratio_grid,
                'sigs': sig_cells,
                'sizes': sizes,
                'batch': batch,
                'prop': dist_prop
            }
            
            d += 1


### Mutation profile 

- Extract mutation counts from mutation counter output
- Compare populations across mutation types using Chi2
- return matrix of **proportions or p-vals** (*) _per_ population comparison _per_ simulation.

The `count_data` dictionary stores grids and their respective significance indicators per comparison.

In [74]:
##
available= list(count_data.keys())
## cap the sub-sample: drawing more than exist without replacement raises ValueError.
subsamp= min(9000, len(available))
avail_sub= np.random.choice(available,subsamp,replace= False)

### 1. extract grids
grids= [count_data[s]['grids'] for s in avail_sub]
props= [count_data[s]['prop'] for s in avail_sub]

#grids= list(it.chain(*grids))

## one mean / std per comparison grid (NaN cells propagate).
#grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [np.mean(x) for x in grids] 
grid_std= [np.std(x) for x in grids]
prop_mean= [np.mean(x) for x in props]

### 2. calculate proportions across simulations: sub-sample size over reference size.
pop_proportions= [count_data[s]['sizes'][1] / count_data[s]['sizes'][0] for s in avail_sub]

### 3. batch names -> positions in avail_sub.
batch_names= [count_data[s]['batch'] for s in avail_sub]
batch_dict= {
    z:[x for x in range(len(avail_sub)) if batch_names[x] == z] for z in list(set(batch_names))
}


In [75]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)


####
#### one trace per batch: mean grid value against relative sampling.
fig= [go.Scatter(
    x= [pop_proportions[x] for x in batch_dict[i]],
    y= [grid_mean[x] for x in batch_dict[i]],
        error_y=dict(
            type='data', 
            ## subset to this batch so error values line up with the points above.
            array=[grid_std[x] for x in batch_dict[i]],
            visible=False),
    name= str(i),
    mode= 'markers'
)  for i in list(batch_dict.keys())]

layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= dict(
        title= 'relative sampling',
        range= [0,1.05]
    ),
    yaxis= dict(
        title= 'mean  matrix p-val',
        range= [0,1.05]
    ),
    font=dict(
        family="Courier New, monospace",
        size=15,
        color="#7f7f7f"
    )
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)

In [72]:
## batch name -> positions in `available`.
batch_names= [count_data[s]['batch'] for s in available]
batch_dict= {
    z:[x for x in range(len(available)) if batch_names[x] == z] for z in list(set(batch_names))
}

### 1. extract grids
## one 2-D grid per comparison. Chaining the grids together split them into
## rows, so the batch_dict positions below no longer pointed at whole grids.
grids= [count_data[s]['grids'] for s in available]

## mask infinite values and compute mean / std per grid, grouped by batch.
grids= [np.ma.masked_where(a == np.inf, a) for a in grids]
grid_mean= [[np.mean(grids[x]) for x in batch_dict[i]] for i in batch_dict.keys()]
grid_std=  [[np.std(grids[x]) for x in batch_dict[i]] for i in batch_dict.keys()]



In [73]:
import plotly.figure_factory as ff
import numpy as np

np.random.seed(1)

## one array of per-grid means per batch, NaN entries dropped.
hist_data= []
for batch_means in grid_mean:
    batch_means= np.array(batch_means)
    hist_data.append(batch_means[~np.isnan(batch_means)])
#hist_data= [hist_data]
group_labels = list(batch_dict) # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005,show_rug=False)

## titles and axes.
fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices at 10Mb windows.',
    yaxis_title= 'density'
)
#fig.update_layout(xaxis_title= 'pval')
fig['layout']["xaxis"].update(title= 'pval',range= [0,1.03])

iplot(fig)

In [21]:

### 1. extract grids, grouped by batch (positions from batch_dict index `available`).
grid_list= [count_data[s]['grids'] for s in available]
grid_list= [[grid_list[x] for x in batch_dict[i]] for i in batch_dict.keys()]

#grids= list(it.chain(*grids))
mut_means= []

for grids in grid_list:
    n_rows, n_cols= grids[0].shape

    ## mask infinite values and compute std.
    #grids= [np.ma.masked_where(a == np.inf, a) for a in grids]

    ## one NaN-ignoring mean per mutation type (grid cell), row-major order.
    mut_m= np.array([
        np.nanmean([g[i,j] for g in grids])
        for i in range(n_rows) for j in range(n_cols)
    ])
    mut_means.append(mut_m)



In [205]:
## per-batch arrays of mutation-type means, NaN entries dropped.
hist_data= [batch_means[~np.isnan(batch_means)] for batch_means in mut_means]

group_labels = list(batch_dict) # name of the dataset

fig = ff.create_distplot(hist_data, group_labels,
                        bin_size=.005, show_rug=False)

## axes first, so the cell still ends on the update_layout call.
fig['layout']["xaxis"].update(title= 'pval',range= [0,1.03])
fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices at 10Mb windows.',
    yaxis_title= 'density'
)


In [91]:
len(data)

241241

In [92]:
len(available)

241000

In [93]:
[len(x) for x in batch_dict.values()]

[81000, 80000, 80000]