In [8]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *


from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

In [9]:
# Watson-Crick complement of each base.
comp = dict(zip('ACGT', 'TGCA'))

# Row centres and row labels for a 24 x 4 grid of mutation types.
ypos, ylabel = [], []

# (trinucleotide, derived base) -> (row, col); both strands share one cell.
mut_index = {}
row, col = 0, 0

_substitutions = (('A', 'T'), ('A', 'C'), ('A', 'G'),
                  ('C', 'T'), ('C', 'G'), ('C', 'A'))

# One grid row per (substitution, 5' base); one column per 3' base.
for (b2, d), b1 in product(_substitutions, 'ACGT'):
    ypos.append(row + 0.5)

    if (b1, b2, d) == ('T', 'C', 'A'):
        # Last row of the grid carries the "5'-" marker.
        label = '5\'-' + b1
    elif b1 == 'C':
        # One row per substitution group also names the substitution.
        label = b2 + r'$\to$' + d + r'  ' + b1
    else:
        label = b1
    ylabel.append(label)

    for col, b3 in enumerate('ACGT'):
        cell = (row, col)
        forward = (b1 + b2 + b3, d)
        reverse = (comp[b3] + comp[b2] + comp[b1], comp[d])
        mut_index[forward] = cell
        mut_index[reverse] = cell

    # Leave both counters one past the position just filled.
    row, col = row + 1, col + 1


def frequency_muts(path, chromosomes):
    """Sum per-mutation count vectors across several per-chromosome files.

    Each file is opened at ``path % str(chrom)``. The first line of every
    file is skipped; every following non-blank line is split on whitespace
    into a mutation label (first field) and integer counts (remaining
    fields). Counts for the same label are added element-wise across files.

    Parameters
    ----------
    path : str
        Filename template containing one ``%s`` placeholder for the
        chromosome identifier.
    chromosomes : iterable
        Chromosome identifiers; each is passed through ``str`` before being
        substituted into ``path``.

    Returns
    -------
    dict
        Maps mutation label -> ``numpy.ndarray`` of summed integer counts.
        Empty if no file contains a data line.

    Raises
    ------
    OSError
        If one of the files cannot be opened.
    ValueError
        If a count field is not an integer, or if two lines for the same
        label carry count vectors that cannot be added together.
    """
    mut_lib = {}

    for chrom in chromosomes:
        # The context manager closes the file even if parsing fails.
        with open(path % str(chrom)) as infile:
            # Skip the first line; a default avoids failing on an empty file.
            next(infile, None)

            for line in infile:
                fields = line.split()
                if not fields:
                    # Blank lines would otherwise create a '' entry.
                    continue

                mut = fields[0]
                counts = np.array(fields[1:], dtype=int)
                if mut in mut_lib:
                    mut_lib[mut] += counts
                else:
                    mut_lib[mut] = counts

    return mut_lib



## Frequency spectrum

Frequency spectrum as produced by the mutation counter pipeline.

> 1. Parameters and input data

In [23]:
from plot.plot_utilities import Population, frequency_breakdown, heatmap, make_titles

# Population labels (only `focus` is used in this cell).
pops = [
    'littoralis',
    'brevicaudus',
    'tcheliensis',
    'lasiotis',
    'mulatta',
    'CH',
]

# Run parameters.
focus = 'littoralis'
align = 'rheMac10'
vcf_data = 'vcf_data'
chromosomes = [str(number) for number in range(1, 21)]

# Input location: one count file per chromosome, '%s' is filled in by
# frequency_muts.
outdir = '{}_finescale_mut_spectra_vcf.{}/'.format(align, vcf_data)
path = ''.join([outdir, 'mut_type_v_allele_freq_', focus, '_chr%s_nosingle.txt'])

# Summed counts per mutation type across all chromosomes.
count_array = frequency_muts(path, chromosomes)

# Rescale every spectrum to a total of 1000; the integer cast truncates
# the fractional part rather than rounding.
count_norm = {
    mut: np.array(counts * 1000 / sum(counts), dtype=int)
    for mut, counts in count_array.items()
}

# Matrix form (one row per mutation type) and the matching row labels.
count_norm_array = np.array(list(count_norm.values()))
mut_list = list(count_norm)

## Analysis 

> 1. Comparison of kmer frequency spectra using PCA

In [24]:
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots
from plotly import tools

n_comp = 3

# The randomized SVD solver is stochastic: a fixed seed makes the embedding,
# and therefore the figure, identical on every Restart & Run All.
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized',
          random_state=0)
features = pca.fit_transform(count_norm_array)

var_comps = pca.explained_variance_ratio_

# Row indices of `mut_list`, grouped by whether the second character of the
# mutation label is one of A/T or one of C/G.
mut_indices = {
    group: [idx for idx, mut in enumerate(mut_list) if mut[1] in group]
    for group in ['AT', 'CG']
}

col_mut = {
    'AT': 'blue',
    'CG': 'red'
}

titles = ['PC1-2', 'PC1-3']
fig_subplots = make_subplots(rows=1, cols=2,
                             subplot_titles=tuple(titles))

# Panel `col` plots PC1 (x) against PC(col + 1) (y).
for col in range(1, 3):
    fig_data = [go.Scatter(
        x=features[mut_indices[i], 0],
        y=features[mut_indices[i], col],
        mode="markers",
        text=[mut_list[x] for x in mut_indices[i]],
        marker={
            'color': col_mut[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
            "opacity": 1
        },
        name=str(i),
        # Same group in both panels: toggle together, list once in the legend.
        legendgroup=str(i),
        showlegend=(col == 1)
    ) for i in mut_indices]

    for trace1 in fig_data:
        fig_subplots.add_trace(trace1, row=1, col=col)

    # Label each panel's own axes with the explained variance ratio, so both
    # panels are annotated instead of only the first xaxis/yaxis pair.
    fig_subplots.update_xaxes(
        title='PC1: {}'.format(round(var_comps[0], 3)), row=1, col=col)
    fig_subplots.update_yaxes(
        title='PC{}: {}'.format(col + 1, round(var_comps[col], 3)),
        row=1, col=col)

# Not used for the subplot figure (its axes are labelled above); retained so
# that any later cell referring to `layout` keeps working.
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1], 3))),
    xaxis=dict(
        title='PC1: {}'.format(round(var_comps[0], 3)))
)

fig = go.Figure(data=fig_subplots)
fig['layout'].update(title='kmers')
iplot(fig)


> 2. Visualizing frequency spectrum across kmers

In [25]:
# Scale every row of the count matrix so that it sums to 1.
mut_counts_norm = (count_norm_array.T / count_norm_array.sum(axis=1)).T

X = []
Y = []
Z = []
labs = []

# One colour per row of `mut_list`: red when the second character of the
# label is A or T, blue otherwise.
# NOTE: this rebinds `mut_indices`, which the PCA cell defines as a dict.
mut_indices = [['blue', 'red'][int(x[1] in 'AT')] for x in mut_list]
col_pl = []

n_muts, n_bins = mut_counts_norm.shape

# One marker per (mutation type, column) cell of the matrix.
for row in range(n_muts):
    for col in range(n_bins):
        X.append(row * 5)             # spread the rows out along x
        Y.append(col * 1 / n_bins)    # column position rescaled to [0, 1)
        Z.append(mut_counts_norm[row, col])
        col_pl.append(mut_indices[row])
        labs.append(mut_list[row])

fig_data = [go.Scatter3d(
    x=X,
    y=Y,
    z=Z,
    mode="markers",
    text=labs,
    marker={
        'color': col_pl,
        'line': {'width': 1},
        'size': 3,
        'symbol': 'circle',
        "opacity": 1
    }
)]

# Dedicated layout for the 3D plot: 3D axes are configured under `scene`,
# so the xaxis/yaxis titles of a 2D layout would not label them. Building it
# here also removes the dependency on a `layout` object from another cell.
# Axis wording assumes the matrix columns are allele-frequency classes, as
# the input file name suggests - TODO confirm.
layout_3d = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    scene=dict(
        xaxis=dict(title='mutation type (index x 5)'),
        yaxis=dict(title='frequency bin (scaled to [0, 1))'),
        zaxis=dict(title='proportion of counts')
    )
)

fig = go.Figure(data=fig_data, layout=layout_3d)
iplot(fig)

**Fig. kmer SFS** Frequency spectrum across uncollapsed kmers. Colours indicate whether the middle ancestral SNP was A/T (red) or C/G (blue).


The observation is roughly the same across other populations, but littoralis has the most individuals, which makes the pattern more obvious.