In [5]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)



# Mutation study

### Population pairs and other arguments


In [6]:

# Watson-Crick complement of each base; used below to register the
# reverse-complement form of every (3-mer, derived base) mutation.
comp = dict(zip('ACGT', 'TGCA'))

# (central ancestral base, derived base) for the six mutation classes,
# in the order they are stacked along the rows of the grid.
mutation_classes = [('A', 'T'), ('A', 'C'), ('A', 'G'),
                    ('C', 'T'), ('C', 'G'), ('C', 'A')]
bases = 'ACGT'

ypos, ylabel = [], []   # row centres and the text label of each row
mut_index = {}          # (3-mer, derived base) -> (row, col) grid cell
labels = []             # labels[row][col] -> 'KMER_derived' text of the cell
row, col = 0, 0

for center, derived in mutation_classes:
    for five_prime in bases:
        # One grid row per (mutation class, 5' flanking base).
        ypos.append(row + 0.5)
        if (five_prime, center, derived) == ('T', 'C', 'A'):
            ylabel.append('5\'-' + five_prime)
        elif five_prime == 'C':
            ylabel.append(center + r'$\to$' + derived + r'  ' + five_prime)
        else:
            ylabel.append(five_prime)

        # One grid column per 3' flanking base.
        col = 0
        row_lab = []
        for three_prime in bases:
            kmer = five_prime + center + three_prime
            revcomp = comp[three_prime] + comp[center] + comp[five_prime]

            # A mutation and its reverse complement share the same cell.
            mut_index[(kmer, derived)] = (row, col)
            mut_index[(revcomp, comp[derived])] = (row, col)
            row_lab.append('_'.join([kmer, derived]))

            col += 1

        labels.append(row_lab)
        row += 1



## Analysis parameters and input 

In [7]:
from plot_utilities import Population, frequency_breakdown, heatmap, make_titles

# --- Input locations -------------------------------------------------------
# NOTE(review): absolute, machine-specific path; edit it here when running
# the notebook on another machine.
main_dir= '/mnt/d/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/'
log_dir= main_dir
sims_dir= main_dir + 'mutation_counter/data/sims/'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'

muted= log_dir + 'muted.log'

# --- Pick the simulation to analyse ----------------------------------------
# muted.log is read as one simulation name per line; blank lines are ignored.
with open(muted,'r') as fp:
    available= [x.strip() for x in fp if x.strip()]

if not available:
    raise ValueError('no simulation listed in {}'.format(muted))

sim= available[0]

sim_dir= sims_dir + '{}/'.format(sim)
ID_file= sim_dir + "ind_assignments.txt"

# --- Populations present in the simulation ---------------------------------
# The second whitespace-separated field of each line is taken as the
# population label; lines with fewer than two fields are skipped.
pops= set()
with open(ID_file,'r') as sample_id_lines:
    for line in sample_id_lines:
        fields= line.split()
        if len(fields) < 2:
            continue
        pops.add(fields[1])

if not pops:
    raise ValueError('no population labels found in {}'.format(ID_file))

# Sorted so that `focus` (and therefore every comparison below) is the same
# on each run; iteration order of a set of strings changes between kernels.
pops= sorted(pops)

focus= pops[0]

# Flat list [focus, p0, focus, p1, ...]; consecutive entries form one pair.
# NOTE(review): this includes the (focus, focus) self-comparison; add
# `if x != focus` to the comprehension if that panel is not wanted.
population_pairs= [name for x in pops for name in (focus, x)]

#population_pairs= ['k3', 'k1']

align= sim
vcf_data= 'vcf_data'

# Text between the first 'C' and the next 'C' (if any) of the part of the
# simulation name that precedes the first '.'.
# presumably names look like 'C<chromosome>.<...>' -- TODO confirm.
chromosomes= [sim.split('.')[0].split('C')[1]]

individually= False       # True: one chromosome group per chromosome
exclude= False
p_value= 1e-5             # significance threshold passed to heatmap()
frequency_range= [0,1]


### Processing parameters

In [8]:

# Turn the flat [a0, b0, a1, b1, ...] list into [(a0, b0), (a1, b1), ...].
# The result is stored as a list, not a bare zip(): a zip object can only be
# iterated once, and population_pairs is read more than once further down.
# The guard on the element type keeps the cell safe to re-run after
# population_pairs has already been paired.
if population_pairs and isinstance(population_pairs[0], str):
    population_pairs= list(zip(population_pairs[::2],
                               population_pairs[1::2]))
else:
    population_pairs= [tuple(x) for x in population_pairs]

# 'popA-popB' label for each pair, in the same order as population_pairs.
pop_pair_names= ['-'.join(x) for x in population_pairs]

# Only 'X' or an autosome number 1..22 is accepted; anything else fails here
# with the offending label in the message.
for chromosome in chromosomes:
    assert chromosome == 'X' or (
        str(chromosome).isdigit() and 1 <= int(chromosome) <= 22
    ), 'unexpected chromosome label: {!r}'.format(chromosome)

# Either one group per chromosome, or a single group holding all of them.
if individually:
    chromosome_groups = [[chromosome] for chromosome in chromosomes]
else:
    chromosome_groups = [chromosomes]


### Extract counts by kmer, compare across population pairs

In [9]:
from plot_utilities import read_exclude

# population_pairs may be a one-shot zip iterator; materialise it once so the
# same pairs are available to both product() and make_titles() below.
population_pairs= list(population_pairs)

# Every (chromosome group, population pair) combination, group-major order.
chrom_pop= list(product(chromosome_groups, population_pairs))

if not chrom_pop:
    raise ValueError('no (chromosome group, population pair) combination to compare')

# heatmap() is called once per combination and returns a pair that is
# unpacked below as (ratio grid, significant indices).
heatmaps = [
    heatmap(
        chrom_group, population_pair, frequency_range, exclude,
        p_value, align, muted_dir
    ) for chrom_group, population_pair in chrom_pop
]

# Transpose [(grid, sig), ...] into two parallel tuples.
ratio_grids, significant_indices = zip(*heatmaps)

plot_title, column_titles = make_titles(
    chromosome_groups, population_pairs, frequency_range, exclude, p_value
)

files= read_exclude()
print(files)

['cpgIslandExtUnmasked.txt.gz', 'nestedRepeats.txt.gz', 'microsat.txt.gz']


## Plot

In [17]:
from plotly.subplots import make_subplots
from plotly import tools

# One panel per population pair, laid out row by row on an Ncols-wide grid.
# NOTE(review): panels are indexed by population pair only, so when
# `individually` yields several chromosome groups only the first group's
# heatmaps are shown -- confirm this is intended.
titles= pop_pair_names

Ncols= 3
# Ceiling division; at least one row so the subplot grid is always valid.
Nrows= max(1, -(-len(titles) // Ncols))

fig_subplots = make_subplots(rows= Nrows, cols=Ncols,
                             subplot_titles=tuple(titles))

# Axis titles double as tick labels: tab characters space the base / class
# names along the axis.
x_title= 'A{}C{}G{}T'.format(*['\t'*8]*3)
y_title= 'A-T{}A-C{}A-G{}C-T{}C-G{}C-A'.format(*['\t'*6]*5)

#####
for gp in range(len(titles)):

    # 1-based (row, column) of this panel in the subplot grid.
    pos1, pos2 = divmod(gp, Ncols)
    pos1 += 1
    pos2 += 1

    ####
    # Ratio grid, colour scale clipped to [0.85, 1.15]; hover text gives the
    # 'KMER_derived' label of each cell.
    traces = [go.Heatmap(
        z= [list(x) for x in ratio_grids[gp]],
        zmin= 0.85,
        zmax= 1.15,
        text= labels,
        colorscale= 'RdBu'
    )]

    sig_fig= significant_indices[gp]

    # White markers on the cells listed in significant_indices.
    # NOTE(review): sig_fig[0] is used as x (column) and sig_fig[1] as y (row);
    # verify this matches the order returned by plot_utilities.heatmap.
    if sum(len(x) for x in sig_fig):

        traces.append(go.Scatter(x= [int(x) for x in sig_fig[0]], y= [int(x) for x in sig_fig[1]],
         mode='markers',line =dict(color='white',width=3),showlegend= False))

    # Black separators every 4 rows, i.e. between the six mutation classes.
    for line in range(1,6):
        traces.append(go.Scatter(x= [-.5,3.5], y= [-.5 + 4 * line] * 2,
             mode='lines',line =dict(color='black',width=2),showlegend= False))

    ####

    for trace1 in traces:

        fig_subplots.append_trace(trace1, pos1, pos2)

    # Only the left-most panel of each row carries the y-axis title.
    if pos2 == 1:
        fig_subplots.update_yaxes(range=[-.5, 24], row=pos1, col=pos2,
                                  title= y_title)

# Applies to every x-axis of the grid, so a single call after the loop suffices.
fig_subplots.update_xaxes(title= x_title)

# The subplot figure already carries its own layout (axes, titles); a separate
# go.Layout passed next to it is not used, so none is built here.
fig= go.Figure(data=fig_subplots)
fig['layout'].update(
    yaxis=dict(range=[-.5,24]),
    height= Nrows * 550,
    width= 900,
    title= plot_title
)
iplot(fig)

**Fig. Heatmap** Chi-squared test of k-mer counts between one focal population and all others. The focal population, the frequency range, and the significance threshold applied to individual p-values are set in the section **Analysis parameters and input** at the top of the page.