In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip 
import subprocess


## Sim pipeline


In [2]:
from tools.SLiM_pipe_tools import (
    read_chrom_sizes, region_sample, region_samplev2,
    fasta_RextractUnif, return_seqs, write_fastaEx, 
    process_recipe, SLiM_dispenserv1, 
)

>  Establish directories

In [3]:
## Root directories.
## main_dir is the pipeline root; slim_dir and fastas_dir currently resolve
## to the same root and to its Fastas/ sub-directory respectively.
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'
slim_dir= main_dir
fastas_dir= main_dir + 'Fastas/'

## Mutation counter locations, all derived from the launch directory.
dir_launch= main_dir + 'mutation_counter'
dir_data= dir_launch + '/data/sims/'
count_dir= dir_launch + '/count/'

## Glob pattern built under slim_dir.
slim_soft= slim_dir + 'sim*'

## Log file names used by the launchers below.
summary_file= 'sims.log'
mutlog= 'toMut.log'


### Provide SLiM recipe. 

Written to take arguments passed on below.

In [4]:
## SLiM recipe.
## sim_dir is the recipe directory under main_dir; sim_recipe ends up as the
## full path to the recipe file inside it.
sim_dir= main_dir + 'Recipes/Human_sims/'
sim_recipe= sim_dir + 'Gravel_2011_frame_sample.slim'

### Sample Fasta

Choose assembly. The whole-genome fasta file (chr separated) is expected in `fastas_dir`, defined above.

- Determine here the number and length of the segments simulated, using variables `N` and `L` respectively.

In [5]:

## Assembly name; also used below to build the fasta file name.
assembly= 'hg38'

## Sampling design.
## N: number of segments / sims.
## L: length of each fasta segment.
N= 2
L= 5 * 10 ** 6


In [35]:
## Path of the gzipped fasta for the chosen assembly.
fasta= ''.join([fastas_dir, assembly, '.fa.gz'])

## Read the chrom_sizes file for this assembly; it is used to decide where
## to sample from.
chrom_sizes= read_chrom_sizes(assembly)

## Sample the fasta. L and N are the segment length and the number of
## segments set in the previous cell.
rseqs= region_samplev2(L, chrom_sizes, N, fasta)


{'16': 1, '12': 1}
opening fasta chr: 12
opening fasta chr: 16


### Launch Simulations.

> select batch name (suffix tag in files generated).

In [36]:
## Simulation tag name for this batch. It is passed as `batch_name` to the
## cookbook function and to the SLiM dispenser in the launch cell below.
batch_name= 'Gravel'

**Cookbook**

Simulations are launched using the template file `sim_recipe` (see above). These files are prepared to accept certain variables, which are passed to a `SLiM_dispenser` function in dictionary format. `cook_constants_*` functions prepare this dictionary.

- Below, the function `cook_constants_Gravel2sampleRange` samples two populations along a range of relative sizes, based on a recipe that takes 3 population size variables.

In [38]:
from tools.SLiM_pipe_tools import SLiM_dispenserv1
from tools.cookbook import cook_constants_Gravel2sampleRange

## Perform Simulations
print('launch SLiM jobs.')

## Keyword arguments forwarded to the cookbook function.
## NOTE(review): `step` is set to N, the number of sampled segments - confirm
## this coupling is intended.
cookargs= dict(
    nrange= [.05,.5],
    step= N,
    Nmax= 100
)

## Prepare the simulation store and the cook ID from the sampled sequences.
sim_store, cookID= cook_constants_Gravel2sampleRange(
    rseqs,
    dir_data= dir_data,
    slim_dir= slim_dir,
    batch_name= batch_name,
    **cookargs
)

## Hand everything to the dispenser together with the recipe and log names.
## NOTE(review): cookID is passed as both `cookID` and `ID` - verify against
## the SLiM_dispenserv1 signature.
SLiM_dispenserv1(
    sim_store,
    sim_recipe,
    cookID= cookID,
    slim_dir= slim_dir,
    batch_name= batch_name,
    ID= cookID,
    L= L,
    logSims= summary_file,
    mutlog= mutlog
)

launch SLiM jobs.


### Launch Mutation counter. 


In [41]:
from tools.SLiM_pipe_tools import mutation_counter_launch

## Log file handed to the mutation counter (same name as set in the
## directories cell).
mutlog= 'toMut.log'

print('launch mutation counter.')
mutation_counter_launch(
    mutlog,
    count_dir= count_dir,
    dir_launch= dir_launch,
    main_dir= main_dir
)


launch mutation counter.


## Data analysis

So, we calculated mutation type counts for each population and compared them across simulations. We are interested to see if sampling could have an impact on the variance of count differences. 

The function `heatmap` returns a matrix of count proportions across types for each pairwise comparison. We will calculate the variance of each matrix, and plot it against the relative sampling across populations.

### Mutation profile 

- Extract mutation counts from mutation counter output
- Compare populations across mutation types using Chi2
- return matrix of **proportions or p-vals** (*) _per_ population comparison _per_ simulation.

The `data` dictionary below stores grids and respective significance indicators per simulation. 

In [8]:
from tools.plot_utilities import Population, frequency_breakdown, heatmap
from tools.compare_utilities import get_available_muts, deploy_count,pops_from_sim

### Directories used by the analysis.
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'
log_dir= main_dir
sims_dir= main_dir + 'mutation_counter/data/sims/'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'

### Parameters for the comparison.
### `individually` and `exclude` are not referenced in the cells shown here.
extract= 'pval'
individually= False
exclude= False
p_value= 1e-5
frequency_range= [0,1]

### Read the log of available counts.
muted= log_dir + 'muted.log'
available= get_available_muts(muted)

### Build the `data` dictionary, keyed by the entries of `available`.
data= deploy_count(
    available,
    frequency_range= frequency_range,
    p_value= p_value,
    muted_dir= muted_dir,
    sims_dir= sims_dir,
    extract= extract
)


In [9]:

### 1. Extract grids: one flat list holding every grid of every simulation.
grids= [grid for sim in available for grid in data[sim]['grids']]

## Mask entries equal to +inf, then take the mean and std of each grid.
grids= [np.ma.masked_where(grid == np.inf, grid) for grid in grids]
grid_mean= list(map(np.mean, grids))
grid_std= list(map(np.std, grids))


### 2. Calculate proportions across simulations.
def _label_counts(labels):
    """Return a dict mapping each distinct label to its number of occurrences."""
    tally= {}
    for label in labels:
        tally[label]= tally.get(label, 0) + 1
    return tally


## Repeat each simulation's population assignment once per grid of that
## simulation, so this list lines up with `grids`.
pop_assignment= [
    pops
    for sim in available
    for pops in [pops_from_sim(sim,sims_dir,pop_set=False)] * len(data[sim]['grids'])
]

## Sorted label counts per entry, then the ratio of the two smallest counts.
pop_proportions= [sorted(_label_counts(pops).values()) for pops in pop_assignment]
pop_proportions= [sizes[0] / sizes[1] for sizes in pop_proportions]


> plotting

In [None]:
from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

## Scatter of the mean of each grid against the relative sampling, one marker
## per grid. The error bars carry grid_std but are hidden (visible is False).
fig= [go.Scatter(
    x= pop_proportions,
    y= grid_mean,
    error_y= {
        'type': 'data',
        'array': grid_std,
        'visible': False
    },
    mode= 'markers'
)]

## Titles, fixed [0,1] y-range and font settings.
layout= go.Layout(
    title= 'Mutation spectrum divergence and relative sampling',
    xaxis= {'title': 'relative sampling'},
    yaxis= {
        'title': 'mean  matrix p-val',
        'range': [0,1]
    },
    font= {
        'family': "Courier New, monospace",
        'size': 15,
        'color': "#7f7f7f"
    }
)

figure= go.Figure(data= fig,layout= layout)

iplot(figure)