In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *


from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

In [2]:


# Complementary base for each nucleotide; used to fold a mutation on one
# strand onto its reverse-complement representation.
comp = dict(zip('ACGT', 'TGCA'))

# Per-row tick positions / tick labels, and per-cell text labels of the grid.
ypos, ylabel = [], []
labels = []

# Maps (triplet, derived_base) -> (row, col) in the 24 x 4 grid.
# Both a mutation and its reverse complement map to the same cell.
mut_index = {}

# NOTE: `row` and `col` are read after this cell as the grid dimensions,
# so they must end up holding the row count and column count.
row, col = 0, 0

# (central ancestral base, derived base) for the six substitution classes.
substitutions = [('A', 'T'), ('A', 'C'), ('A', 'G'),
                 ('C', 'T'), ('C', 'G'), ('C', 'A')]

for b2, d in substitutions:
    for b1 in 'ACGT':                      # 5' flanking base -> one grid row
        ypos.append(row + 0.5)

        if (b1, b2, d) == ('T', 'C', 'A'):
            # Very last row: annotate which flank the row letters refer to.
            tick = '5\'-' + b1
        elif b1 == 'C':
            # One row per substitution block carries the block name.
            tick = b2 + r'$\to$' + d + r'  ' + b1
        else:
            tick = b1
        ylabel.append(tick)

        row_lab = []
        for col, b3 in enumerate('ACGT'):  # 3' flanking base -> one grid column
            triplet = b1 + b2 + b3
            revcomp = comp[b3] + comp[b2] + comp[b1]

            mut_index[(triplet, d)] = (row, col)
            mut_index[(revcomp, comp[d])] = (row, col)
            row_lab.append('_'.join([triplet, d]))
        col += 1                           # leave `col` equal to the column count

        labels.append(row_lab)
        row += 1



In [38]:
from plot_utilities import Population, frequency_breakdown, heatmap, make_titles

# --- Analysis configuration -------------------------------------------------

# Populations available in the data set and the one every comparison is
# anchored on.
pops = ['littoralis', 'brevicaudus', 'tcheliensis', 'lasiotis', 'mulatta', 'CH']
focus = 'tcheliensis'

# Flat list [focus, pop0, focus, pop1, ...]; consecutive entries form a pair.
# (Manual alternative: population_pairs = ['k3', 'k1'])
population_pairs = [name for other in pops for name in (focus, other)]

# Short tag and VCF tag; both are substituted into the input directory name.
align = 'rheMac10'
vcf_data = 'vcf_data'

# Chromosomes 1..20 as strings. (For a quick run: chromosomes = ["1"])
chromosomes = [str(number) for number in range(1, 21)]

individually = False       # True -> one chromosome group per chromosome
exclude = False
p_value = 1e-5             # significance threshold for the chi-square tests
frequency_range = [0, 1]   # allele-frequency window [low, high]


In [39]:


# Turn the flat list [a1, b1, a2, b2, ...] into a list of (a, b) tuples.
#
# The result is materialised with list(): a bare zip() object in Python 3 is
# a one-shot iterator that would be empty for any consumer after the first.
# The isinstance guard makes the cell idempotent: when it is re-run,
# `population_pairs` already holds tuples and must not be paired up again.
if all(isinstance(name, str) for name in population_pairs):
    population_pairs = list(zip(population_pairs[::2],
                                population_pairs[1::2]))

# Human-readable "popA-popB" name for every pair.
pop_pair_names = ['-'.join(pair) for pair in population_pairs]

# Sanity-check the chromosome identifiers: 'X' or an integer in 1..22.
for chromosome in chromosomes:
    assert chromosome == 'X' or int(chromosome) in range(1, 23), (
        'unexpected chromosome identifier: %r' % (chromosome,))

# Either analyse every chromosome on its own, or pool all of them in one group.
if individually:
    chromosome_groups = [[chromosome] for chromosome in chromosomes]
else:
    chromosome_groups = [chromosomes]



In [40]:
def read_exclude(dirf='../data/bed_files/',filename='files_Regexclude.txt'):
    '''
    Read the list of region files to exclude in the mutation analysis.

    The list file is plain text with one file name per line.

    Parameters
    ----------
    dirf : str
        Directory holding the list file. It is concatenated with `filename`
        as-is, so it must end with a path separator.
    filename : str
        Name of the list file inside `dirf`.

    Returns
    -------
    list of str
        The listed file names with surrounding whitespace removed. Blank
        lines are skipped, so a trailing empty line in the list file does
        not produce an empty entry.
    '''
    list_path = dirf + filename

    with open(list_path) as handle:
        stripped = (line.strip() for line in handle)
        return [entry for entry in stripped if entry]


def frequency_breakdown(path, chromosomes, frequency_range):
    '''
    Accumulate mutation-type counts over chromosomes into a (row, col) grid.

    Relies on the notebook-level globals `row`, `col` (grid dimensions) and
    `mut_index` (maps (triplet, derived_base) to a grid cell).

    Parameters
    ----------
    path : str
        Path template containing one `%s`, filled with each chromosome name.
    chromosomes : iterable
        Chromosome identifiers; each is converted with `str()`.
    frequency_range : sequence of two numbers
        `[low, high]` fractions used to select which count columns are summed.

    Returns
    -------
    numpy.ndarray
        Float array of shape `(row, col)` holding the summed counts.

    Notes
    -----
    Each input file is split on single spaces. The first line is skipped.
    In every other line, characters 0-2 of the first field are the triplet
    and character 4 is the derived base; the fields at the selected column
    indices are parsed as integers and added to that mutation's cell.
    The column window is derived from the first data line only and reused
    for all lines of the file.
    '''
    count_array = np.zeros((row, col))
    for chrom in chromosomes:
        # Context manager so the handle is released even if reading fails.
        with open(path % str(chrom)) as infile:
            lines = infile.readlines()

        s = lines[1].strip('\n').split(' ')
        # TODO(mason) do these better.
        # First column index (>= 2) whose relative position reaches the
        # lower bound of the frequency range.
        start_index = 2
        while float(start_index - 2) / (len(s) - 2) < frequency_range[0]:
            start_index += 1

        # Walk back from the last index until its relative position no
        # longer exceeds the upper bound. `end_index` is exclusive below.
        end_index = len(s) - 1
        while float(end_index) / (len(s)-2) > frequency_range[1]:
            end_index -= 1

        for line in lines[1:]:
            s = line.strip('\n').split(' ')
            for i in range(start_index, end_index):
                count_array[mut_index[(s[0][:3], s[0][4])]] += int(s[i])
    return count_array



def compartment_heatmap(chromosomes, pop, frequency_range, exclude, p_value, short,vcf_data):
    '''
    Compare the mutation spectrum inside each listed region set with the
    spectrum of the remaining variants of one population.

    Region file names come from `read_exclude()`; the part before the first
    '.' of each name is used as the region label. Counts are loaded with
    `frequency_breakdown`, and every region's counts are subtracted from the
    population-wide counts, so the comparison background is what is left
    after removing all listed regions.

    Parameters
    ----------
    chromosomes : iterable
        Chromosome identifiers forwarded to `frequency_breakdown`.
    pop : str
        Population name; part of every input file name.
    frequency_range : sequence of two numbers
        Forwarded to `frequency_breakdown`.
    exclude :
        Accepted for interface compatibility; not used in this function.
    p_value : float
        A cell is flagged as significant when its chi-square p-value is
        strictly below this threshold.
    short, vcf_data : str
        Substituted into the input directory name.

    Returns
    -------
    dict
        region label -> {'grid': (row, col) array of ratios,
                         'sig': (x_list, y_list)}, where the ratio of a cell
        is (region count / region total) / (background count / background
        total) and the significant cells are reported as column + 0.5 and
        row + 0.5.
    '''
    outdir = '../{}_finescale_mut_spectra_vcf.{}/'.format(short, vcf_data)

    # Region labels: file name up to the first dot.
    region_names = [listed.split('.')[0] for listed in read_exclude()]

    genome_path = (outdir + 'mut_type_v_allele_freq_' +
                   pop + '_chr%s_nosingle.txt')
    compartments = {
        pop: frequency_breakdown(genome_path, chromosomes, frequency_range),
    }
    num_variants = {}

    for region in region_names:
        region_path = (outdir + region + '_mut_type_v_allele_freq_' +
                       pop + '_chr%s_nosingle.txt')
        region_counts = frequency_breakdown(region_path, chromosomes,
                                            frequency_range)

        compartments[region] = region_counts
        # Remove this region's variants from the background, in place.
        compartments[pop] -= region_counts
        num_variants[region] = region_counts.sum()

    num_variants[pop] = compartments[pop].sum()

    pop_innerds = {}
    for region in region_names:
        inside = compartments[region]
        background = compartments[pop]
        inside_total = num_variants[region]
        background_total = num_variants[pop]

        ratio_grid = np.zeros((row, col))
        sig_x, sig_y = [], []

        for i, j in np.ndindex(row, col):
            table = np.array([
                [inside[i][j], inside_total],
                [background[i][j], background_total],
            ])
            _, this_pval, _, _ = chi2_contingency(table)

            ratio_grid[i][j] = (inside[i][j] * background_total /
                                (inside_total * background[i][j]))

            if this_pval < p_value:
                sig_x.append(j + 0.5)
                sig_y.append(i + 0.5)

        pop_innerds[region] = {'grid': ratio_grid, 'sig': (sig_x, sig_y)}

    return pop_innerds


In [41]:

# Ratio grids and significant cells for the focal population, one entry per
# region set returned by compartment_heatmap.
heat_dict = compartment_heatmap(
    chromosomes, focus, frequency_range, exclude, p_value, align, vcf_data
)

# Fix the iteration order once so grids, significance lists and subplot
# titles stay aligned by position.
titles = [region for region in heat_dict]
ratio_grids = [heat_dict[region]['grid'] for region in titles]
significant_indices = [heat_dict[region]['sig'] for region in titles]

plot_title, column_titles = make_titles(
    chromosome_groups, population_pairs, frequency_range, exclude, p_value
)

# NOTE: from here on `read_exclude` refers to the plot_utilities version,
# not the function defined earlier in this notebook.
from plot_utilities import read_exclude

files = read_exclude()
print(files)


['cpgIslandExtUnmasked.txt.gz', 'nestedRepeats.txt.gz', 'microsat.txt.gz']


In [42]:
from plotly.subplots import make_subplots
from plotly import tools

# One heatmap panel per region set, laid out on a grid with Ncols columns.
titles = list(heat_dict.keys())

Ncols = 3
# Ceiling division: enough rows to hold every panel.
Nrows = (len(titles) + Ncols - 1) // Ncols

fig_subplots = tools.make_subplots(rows=Nrows, cols=Ncols,
                                   subplot_titles=tuple(titles))

for gp, title in enumerate(titles):

    # 1-based (row, column) of this panel in the subplot grid.
    grid_row, grid_col = divmod(gp, Ncols)
    pos1 = grid_row + 1
    pos2 = grid_col + 1

    sig_idx = gp

    # Ratio heatmap; the colour scale is clipped to [0.85, 1.15].
    traces = [go.Heatmap(
        z=[list(grid_line) for grid_line in ratio_grids[sig_idx]],
        zmin=0.85,
        zmax=1.15,
        text=labels,
        type='heatmap'
    )]

    sig_fig = significant_indices[sig_idx]

    # Overlay a marker on every cell flagged as significant. The stored
    # coordinates are index + 0.5; int() brings them back to the cell index.
    if any(len(coords) for coords in sig_fig):
        traces.append(go.Scatter(
            x=[int(x) for x in sig_fig[0]],
            y=[int(x) for x in sig_fig[1]],
            mode='markers',
            line=dict(color='white', width=3),
            showlegend=False,
        ))

    # Horizontal separators between the six blocks of four rows.
    traces.extend(
        go.Scatter(x=[-.5, 3.5], y=[-.5 + 4 * block] * 2,
                   mode='lines', line=dict(color='black', width=2),
                   showlegend=False)
        for block in range(1, 6)
    )

    for trace1 in traces:
        fig_subplots.append_trace(trace1, pos1, pos2)

    # Only the left-most panel of each row gets the substitution-class title.
    if pos2 == 1:
        fig_subplots.update_yaxes(
            range=[-.5, 24], row=pos1, col=pos2,
            title='A-T{}A-C{}A-G{}C-T{}C-G{}C-A'.format(*['\t'*6]*5))

    fig_subplots.update_xaxes(title='A{}C{}G{}T'.format(*['\t'*8]*3))


layout = go.Layout(
    xaxis=dict(range=[1.5, 4.5]),
    yaxis=dict(range=[1.5, 4.5])
)

fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(
    yaxis=dict(range=[-.5, 24]),
    height=Nrows * 550,
    width=900,
    title=plot_title,
)
iplot(fig)