In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip 
import subprocess


### Mutation counter - real data

In [7]:
def mutation_counter_launch(logfile,count_dir= './count/', 
                dir_launch= '..',main_dir= './', outlog= 'muted.log'):
    '''
    Launch the mutation counter on every simulation listed in a log file.

    - read `logfile`; the first whitespace-separated token of each non-empty
      line is taken as a simulation name.
    - run `process_chromosomes.py` once per simulation from inside `count_dir`.
    - change directory to `main_dir`, append the processed lines to `outlog`
      and empty `logfile`.

    Parameters:
    - logfile: path of the log listing simulations still to be processed.
    - count_dir: directory the jobs are launched from.
    - dir_launch: value passed to the `-d` option of the script.
    - main_dir: directory the process is left in when the function ends.
      A relative path is resolved against the working directory at call
      time, not against `count_dir`.
    - outlog: log the processed lines are appended to.

    Returns nothing.
    '''
    # Resolve before leaving the current directory: a relative `main_dir`
    # (e.g. the default './') would otherwise be interpreted relative to
    # `count_dir` and the process would never come back.
    return_dir= os.path.abspath(main_dir)

    with open(logfile,'r') as fp:
        lines= fp.readlines()

    # simulation name = first token of every non-blank line.
    sims= [x.split()[0] for x in lines if x.strip()]
    # chromosome id = what follows the last 'C' in the part before the first
    # '.', with any leading/trailing 'c', 'h', 'r' characters removed.
    chroms= [x.split('.')[0].split('C')[-1].strip('chr') for x in sims]

    job= 'python process_chromosomes.py -c {} -r {} -s {} -v {}_ -q {} -d {}'

    jobs= [job.format(chrom, sim, sim, sim, sim, dir_launch)
           for chrom, sim in zip(chroms, sims)]

    os.chdir(count_dir)
    try:
        for cmd in jobs:
            # NOTE(review): the command goes through the shell and the names
            # read from `logfile` are not escaped.
            os.system(cmd)
    finally:
        # always leave `count_dir`, even if a job is interrupted.
        os.chdir(return_dir)

    with open(outlog,'a') as fp:
        fp.write('\n' + ''.join(lines))

    # empty the input log now that its entries have been recorded in `outlog`.
    with open(logfile,'w'):
        pass




## working directories, all anchored on the directory the notebook runs from.
main_dir= '{}/'.format(os.getcwd())
dir_launch= '{}mutation_counter'.format(main_dir)
count_dir= '{}/count/'.format(dir_launch)
mutlog= 'regions.log'

print('launch mutation counter.')
mutation_counter_launch(
    mutlog,
    main_dir= main_dir,
    dir_launch= dir_launch,
    count_dir= count_dir,
)


launch mutation counter.


## Data analysis

We calculated mutation type counts for each population and compared them across simulations. We want to know whether sampling has an impact on the variance of count differences. 

The function `heatmap` returns a matrix of count proportions across types for each pairwise comparison. We will calculate the variance of each matrix, and plot it against the relative sampling across populations.

### Mutation profile 

- Extract mutation counts from mutation counter output
- Compare populations across mutation types using Chi2
- return matrix of **proportions or p-vals** (*) _per_ population comparison _per_ simulation.

The `data` dictionary below stores grids and respective significance indicators per simulation. 

In [17]:
from tools.plot_utilities import Population, frequency_breakdown, heatmap
from tools.compare_utilities import (
    get_available_muts, deploy_count, pops_from_sim, check_availability, clean_empty
)

from tools.mcounter_tools import process_dir


In [28]:
import plotly.figure_factory as ff
import numpy as np

### set up dirs.
### NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/sim_compare/'
log_dir= main_dir
sims_dir= main_dir + 'mutation_counter/data/sims/'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'

muted= log_dir + 'muted.log'

### read log of available counts,
### set parameters for comparison.
extract= 'pval'
ind_file= 'ind_assignments'
### the list of data sets is built from the contents of `sims_dir`;
### alternative kept for reference: get_available_muts(muted)
available= process_dir(sims_dir)

print(available)
### cleaning data set 
### i.e. accounting for aborted runs.
### each step returns the filtered list plus the entries it rejected.
available, miss_data= check_availability(available, dir_check=sims_dir)
available, empty= clean_empty(available,str_format= '',dir_check= sims_dir,requested= ['.vcf.gz'])

available, miss_count= check_availability(available, str_format= '{}_finescale_mut_spectra_vcf.{}',
                                          dir_check=muted_dir)

available, empty_count= clean_empty(available,str_format= '{}_finescale_mut_spectra_vcf.{}',
                              dir_check= muted_dir,requested= ['mut_type_v'])

### report every filter: the previous four-placeholder message silently dropped
### the fifth value (miss_count) and printed empty_count under 'uncounted'.
ladder= [len(available),len(miss_data),len(empty),len(empty_count),len(miss_count)]
print('available: {}; unbuilt: {}, empty: {}, empty counts: {}, uncounted: {}'.format(*ladder))

### drop empty names.
available= [x for x in available if x]
individually= False
exclude= False
p_value= 1e-5
frequency_range= [0,1]

data= deploy_count(available, frequency_range= frequency_range, p_value= p_value,
                                                muted_dir= muted_dir, sims_dir= sims_dir,extract= extract,ind_file= ind_file)




missing: 0, no vcf: 0
['testC1.110060044', 'testC1.119658411', 'testC1.126355222', 'testC1.138576699', 'testC1.159660841', 'testC1.217523114', 'testC1.26360778', 'testC1.93074015', 'testC1.94137375', 'testC10.46283417', 'testC10.50625623', 'testC10.68662880', 'testC10.90277336', 'testC11.23124600', 'testC11.28124939', 'testC11.28155649', 'testC11.28900312', 'testC11.45989516', 'testC11.47111690', 'testC13.100800371', 'testC13.24774283', 'testC13.5726666', 'testC13.71774375', 'testC14.40926974', 'testC15.34847109', 'testC15.52385076', 'testC15.74479024', 'testC15.92188926', 'testC16.34096255', 'testC16.44310174', 'testC16.76558226', 'testC17.21307626', 'testC18.16180812', 'testC18.19844905', 'testC19.25748761', 'testC19.35803129', 'testC2.101741390', 'testC2.138776720', 'testC2.18183379', 'testC2.194992072', 'testC20.10583029', 'testC20.3289774', 'testC20.33331848', 'testC20.37746023', 'testC3.103266141', 'testC3.115115555', 'testC3.140370500', 'testC3.168650735', 'testC3.181590698', 't

### Mutation count - population comparison

`data` holds matrices of mutation type count comparison _p_-values for every pair across regions.

> i. Average matrix p-values

In [36]:

### 1. collect every grid of every available data set,
### masking infinite values on the way.
grids= [
    np.ma.masked_where(grid == np.inf, grid)
    for sim in available
    for grid in data[sim]['grids']
]

## per-grid mean and standard deviation.
grid_mean= list(map(np.mean, grids))
grid_std= list(map(np.std, grids))

### 2. calculate proportions across simulations
## one population assignment per grid: the assignment of a data set is
## repeated once for each of its grids.
pop_assignment= [
    pops
    for sim in available
    for pops in [pops_from_sim(sim,sims_dir,pop_set=False,ind_file= ind_file)] * len(data[sim]['grids'])
]

## sorted population sizes of each assignment.
pop_proportions= [
    sorted(
        sum(1 for idx in range(len(pops)) if pops[idx] == pop)
        for pop in set(pops)
    )
    for pops in pop_assignment
]

## ratio of the smallest population size to the second smallest.
pop_proportions= [sizes[0] / sizes[1] for sizes in pop_proportions]


> plotting

In [37]:

np.random.seed(1)

## distribution of per-matrix mean p-values; undefined (NaN) means are dropped.
hist_data= np.array(grid_mean)
hist_data= [hist_data[np.logical_not(np.isnan(hist_data))]]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, bin_size=.005)

## title and axis labels set in a single layout update.
fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices.',
    xaxis_title= 'pval',
    yaxis_title= 'density',
)

iplot(fig)

> ii. mutation-type average p-values across regions.

In [38]:

### 1. extract grids
grids= [data[s]['grids'] for s in available]
grids= list(it.chain(*grids))

## mask infinite values.
grids= [np.ma.masked_where(a == np.inf, a) for a in grids]

shape_muts= grids[0].shape

## stack the grids with masked cells written as NaN. Indexing a masked cell
## returns `np.ma.masked`, which is not converted to NaN when placed in a plain
## list, so `np.nanmean` on such a list does not skip it.
## all grids are expected to share the shape of the first one.
grid_stack= np.stack([np.ma.filled(g.astype(float), np.nan) for g in grids])

## one vector of p-values per mutation type, in row-major grid order
## (now numpy arrays rather than lists).
mut_pvals= list(grid_stack.reshape(len(grids), -1).T)

## mean across grids for every mutation type, ignoring NaN / masked cells.
mut_means= np.nanmean(grid_stack, axis= 0).ravel()


In [39]:
## distribution of mutation-type mean p-values; NaN means are dropped.
hist_data= [mut_means[np.logical_not(np.isnan(mut_means))]]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, bin_size=.005)

## the call returns the figure, which the notebook displays as cell output.
fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices',
    xaxis_title= 'pval',
    yaxis_title= 'density',
)


### Extracting simulation data.

We now extract mutation count data for the simulated data sets. 

The `dataSim` dictionary stores grids and respective significance indicators per simulation. 

In [40]:
### set up dirs.
### NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'

### move to the simulation pipeline directory before importing its tools.
### NOTE(review): if a `tools` package was already imported from another
### project in this kernel, these imports presumably reuse it - confirm.
os.chdir(main_dir)

import tools.plot_utilities
import tools.compare_utilities

log_dir= main_dir
sims_dir= main_dir + 'mutation_counter/data/sims/'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'

muted= log_dir + 'muted.log'

### read log of available counts,
### set parameters for comparison.
extract= 'pval'
ind_file= "ind_assignments.txt"
### the list of data sets is built from the contents of `sims_dir`;
### alternative kept for reference: tools.compare_utilities.get_available_muts(muted)
### `process_dir` is the name imported in an earlier cell.
available= process_dir(sims_dir)
### cleaning data set 
### i.e. accounting for aborted runs.
### each step returns the filtered list plus the entries it rejected.
available, miss_data= tools.compare_utilities.check_availability(available, dir_check=sims_dir)
available, empty= tools.compare_utilities.clean_empty(available,str_format= '',dir_check= sims_dir,requested= ['.vcf.gz'])

available, miss_count= tools.compare_utilities.check_availability(available, str_format= '{}_finescale_mut_spectra_vcf.{}',
                                          dir_check=muted_dir)

available, empty_count= tools.compare_utilities.clean_empty(available,str_format= '{}_finescale_mut_spectra_vcf.{}',
                              dir_check= muted_dir,requested= ['mut_type_v'])

### report every filter: the previous four-placeholder message silently dropped
### the fifth value (miss_count) and printed empty_count under 'uncounted'.
ladder= [len(available),len(miss_data),len(empty),len(empty_count),len(miss_count)]
print('available: {}; unbuilt: {}, empty: {}, empty counts: {}, uncounted: {}'.format(*ladder))

### drop empty names.
available= [x for x in available if x]
individually= False
exclude= False
p_value= 1e-5
frequency_range= [0,1]

dataSim= tools.compare_utilities.deploy_count(available, frequency_range= frequency_range, p_value= p_value,
                                                muted_dir= muted_dir, sims_dir= sims_dir,extract= extract,ind_file= ind_file)




missing: 0, no vcf: 80
available: 228; unbuilt: 0, empty: 123, uncounted: 5


In [42]:

### 1. extract grids

def mut_vals(available,data):
    '''
    Average every grid cell (mutation type) across all grids of the
    requested data sets.

    Parameters:
    - available: iterable of keys of `data` to include.
    - data: mapping key -> dict holding a 'grids' entry, an iterable of
      2-D arrays. All grids are expected to share the shape of the first one.

    Returns:
    - mut_means: 1-D numpy array with one mean per grid cell, in row-major
      order. Positive-infinite, masked and NaN cells are ignored; a cell with
      no valid value in any grid yields NaN.
    - shape_muts: shape of the first grid, to reshape `mut_means` back.
    '''
    grids= list(it.chain.from_iterable(data[s]['grids'] for s in available))

    ## mask infinite values.
    grids= [np.ma.masked_where(a == np.inf, a) for a in grids]

    shape_muts= grids[0].shape

    ## stack with masked cells written as NaN. Indexing a masked cell returns
    ## `np.ma.masked`, which is not converted to NaN when placed in a plain
    ## list, so `np.nanmean` on such a list does not skip it.
    grid_stack= np.stack([np.ma.filled(g.astype(float), np.nan) for g in grids])

    ## mean over the grid axis; ravel() gives the row-major cell order.
    mut_means= np.nanmean(grid_stack, axis= 0).ravel()
    
    return mut_means, shape_muts

## mutation-type means across the simulated data sets; note that `shape_muts`
## is rebound here to the grid shape of the simulations.
sim_mut_means, shape_muts= mut_vals(available, dataSim)

### Joint analysis

Looking at mutation count comparisons in simulations and real data together.

> i. Mutation type _p_-values average across windows of simulated and real data.

In [43]:
## real and simulated mutation-type means, NaN entries dropped from each.
hist_data= [
    vals[np.logical_not(np.isnan(vals))]
    for vals in (mut_means, sim_mut_means)
]

group_labels = ['real','sims'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels, bin_size=.005)

## the call returns the figure, which the notebook displays as cell output.
fig.update_layout(
    title_text='mean p-val distribution for mutation count matrices',
    xaxis_title= 'pval',
    yaxis_title= 'density',
)


> ii. standardizing real data mutation-type averages.

Calculate the mean and standard deviation of mutation-type p-values across simulations. Use these values to standardize mutation-type p-values calculated from Rhesus data averaged across windows (10 Mb). Plot as a heatmap and highlight p-values below `threshold`. 

In [44]:

def mut_labels_k3():
    '''
    Build the grid of labels for 3-mer mutation types.

    Returns a list of 24 rows of 4 labels, each formatted as
    '<b1><b2><b3>_<d>': `b2` is the mutated central base, `d` the derived
    base, `b1` the 5' base and `b3` the 3' base.
    Rows are grouped by (b2, d) pair in the order A>T, A>C, A>G, C>T, C>G,
    C>A, with one row per 5' base (A, C, G, T) inside each group; columns
    follow the 3' base (A, C, G, T).
    '''
    mutations= [('A', 'T'), ('A', 'C'), ('A', 'G'),
                ('C', 'T'), ('C', 'G'), ('C', 'A')]
    bases= 'ACGT'

    # only the labels are returned, so the axis-label and index bookkeeping
    # that was computed and then discarded is no longer built.
    return [
        ['{}_{}'.format(b1 + b2 + b3, d) for b3 in bases]
        for b2, d in mutations
        for b1 in bases
    ]

## label grid used as the hover text of the mutation-type heatmap.
labels= mut_labels_k3()


In [45]:
## cutoff on the standardized scale; cells below it are highlighted.
## NOTE(review): -1.96 presumably targets the lower 2.5% tail of a standard
## normal - confirm.
threshold= -1.96

## standardize the real-data means against the simulated distribution.
## NaN-aware statistics: a single NaN among the simulated means would
## otherwise turn every standardized value into NaN. Both statistics are
## computed once instead of once per element.
sim_center= np.nanmean(sim_mut_means)
sim_spread= np.nanstd(sim_mut_means)

real_dists= (np.asarray(mut_means, dtype= float) - sim_center) / sim_spread
real_dist_grid= real_dists.reshape(shape_muts)

## (row, column) indices of the cells below the cutoff.
sig_idx= np.where(real_dist_grid < threshold)

## NOTE(review): neither import is used in this cell, and `tools` rebinds a
## name that other cells use for the project package - confirm before removing.
from plotly.subplots import make_subplots
from plotly import tools

fig = [go.Heatmap(
    z= [list(x) for x in real_dist_grid],
    zmin= -2,
    zmax= 2,
    text= labels,
    type = 'heatmap',
    colorscale= 'RdBu'
)]

sig_fig= sig_idx


## overlay a marker on every cell below the cutoff, if there is any.
if sum([len(x) for x in sig_fig]):
    
    fig.append(go.Scatter(x= [int(x) for x in sig_fig[1]], y= [int(x) for x in sig_fig[0]],
     mode='markers',line =dict(color='white',width=3),showlegend= False))

## horizontal separators every four rows, i.e. between mutation classes.
for line in range(1,6):
    fig.append(go.Scatter(x= [-.5,3.5], y= [-.5 + 4 * line] * 2,
         mode='lines',line =dict(color='black',width=2),showlegend= False))


## axis titles are padded with tabs to spread the class / base names along the axes.
layout= go.Layout(
    yaxis= dict(range=[-.5, 23.5], 
                          title= 'A-T{}A-C{}A-G{}C-T{}C-G{}C-A'.format(*['\t'*12]*5)),
    xaxis= dict(title= 'A{}C{}G{}T'.format(*['\t'*15]*3),
               range= [-.5,3.5])
)

fig= go.Figure(data=fig, layout=layout)

fig.update_layout(title= 'real windows, difference deviation relative to simulated distribution.',width= 500, height= 700)
iplot(fig)