In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *


from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

# kmer PCA 

> 1. Establish the kmer list, together with the mutations and populations studied.

In [2]:
# Complement of each nucleotide (A<->T, C<->G).
comp = dict(zip('ACGT', 'TGCA'))

# One entry per table row: the row's vertical centre and its text label
# (presumably consumed as y-axis ticks by a plot elsewhere -- not shown here).
ypos, ylabel = [], []

# mut_index:     (3-mer, derived base) -> (row, col). The reverse-complement
#                spelling of every mutation is stored too and points at the same cell.
# inv_mut_index: (row, col) -> '<3-mer>_<derived base>', forward spelling only.
mut_index = {}
inv_mut_index = {}

# (centre base, derived base) pairs, in the order their row groups are laid out.
substitutions = [('A', 'T'), ('A', 'C'), ('A', 'G'),
                 ('C', 'T'), ('C', 'G'), ('C', 'A')]

row, col = 0, 0
for b2, d in substitutions:
    for b1 in 'ACGT':
        ypos.append(row + 0.5)

        if (b1, b2, d) == ('T', 'C', 'A'):
            # Final row of the table: its label carries the 5' marker.
            label = '5\' -' + b1
        elif b1 == 'C':
            # One row per substitution group also names the substitution itself.
            label = b2 + r'$\to$' + d + r'  ' + b1
        else:
            label = b1
        ylabel.append(label)

        # Walk the four possible 3' bases; col ends at 4, as a plain counter would.
        col = 0
        while col < 4:
            b3 = 'ACGT'[col]
            kmer = b1 + b2 + b3
            revcomp_kmer = comp[b3] + comp[b2] + comp[b1]

            mut_index[(kmer, d)] = (row, col)
            inv_mut_index[(row, col)] = kmer + '_' + d
            mut_index[(revcomp_kmer, comp[d])] = (row, col)
            col += 1
        row += 1

# Group labels and the population names belonging to each group.
groups = ['EUR', 'EAS', 'SAS', 'AFR', 'AMR']
pops = {
    'EAS': ['littoralis', 'brevicaudus', 'tcheliensis', 'lasiotis', 'mulatta', 'CH'],
}


> 2. Parameters and Input

In [3]:

# Labels interpolated into the input directory name by the read-data cell.
align = 'rheMac10'
vcf_data = 'vcf_data'

# Chromosome names '1' .. '20', kept as strings because they are formatted into file names.
chromosomes = [str(number) for number in range(1, 21)]

# Analysis switches and thresholds (their consumers are not visible in this part of the notebook).
individually = False
exclude = True
p_value = 1e-4
frequency_range = [0, 1]



> 3. Read data

In [4]:
# Read the per-chromosome count tables and sum them into one matrix.
#
# Produces:
#   indices    -- header label -> list of 0-based header column positions carrying that label
#   mut_list   -- first field of every data row, taken from the first chromosome only
#   mut_counts -- float array, one row per data column and one column per data row,
#                 summed over all chromosomes
# NOTE: `s` is left bound to the last data row that was parsed, as before.

outdir= '../{}_finescale_mut_spectra_vcf.{}/'.format(align,vcf_data)
file_template= outdir + 'derived_each_lineage_chr{}_nosingle.txt'

# The first configured chromosome supplies the header and the row labels.
# This used to be hard-coded to '1', which left mut_list empty whenever
# `chromosomes` did not contain '1'.
first_chrom= chromosomes[0]
chrom= first_chrom

filename= file_template.format(chrom)
# `with` guarantees the handle is closed even if reading fails.
with open(filename,'r') as infile:
    lines=infile.readlines()

s=lines[0].strip('\n').split(' ')

# Group header positions by their label; position 0 of the header is skipped.
indices = {}
for i in range(1,len(s)):
    indices.setdefault(s[i], []).append(i-1)

# Twice as many rows as header labels: presumably two haplotype columns per
# individual in the data rows -- confirm against the input file format.
mut_counts=np.zeros((2*(len(s)-1),len(lines)-1))
mut_list=[]

for chrom in chromosomes:
    filename= file_template.format(chrom)
    with open(filename,'r') as infile:
        lines=infile.readlines()

    for i in range(len(lines)-1):
        s=lines[i+1].strip('\n').split(' ')
        if chrom==first_chrom:
            # Row labels are assumed identical across chromosomes, so collect them once.
            mut_list.append(s[0])
        for j in range(len(s)-1):
            mut_counts[j][i]+=int(s[j+1])

### 


> 4. Process data

In [5]:
# Turn the raw count matrix into one normalised mutation profile per individual.
#
# Steps: (1) divide every row by its sum, (2) average each consecutive pair of
# rows (2k, 2k+1), (3) rescale every resulting row with sklearn's `normalize`.
#
# NOTE: this cell rebinds `mut_counts` (rows go from 2N to N), so it must run
# exactly once after the read-data cell. The check below catches a second run
# instead of silently averaging already-averaged rows.
n_individuals = sum(len(members) for members in indices.values())
if mut_counts.shape[0] != 2 * n_individuals:
    raise ValueError(
        'mut_counts has {} rows but {} were expected; re-run the read-data cell '
        'before this one'.format(mut_counts.shape[0], 2 * n_individuals))

# (1) Divide by row sums. Sizes come from the array itself rather than from
# the leftover loop variable `s` of the read-data cell.
row_sums = mut_counts.sum(axis=1, keepdims=True)
# Rows without any counts stay all-zero instead of turning into NaN.
row_scale = np.divide(1.0, row_sums, out=np.zeros_like(row_sums), where=row_sums != 0)
row_freqs = mut_counts * row_scale

# (2) Average individual counts over their two rows. The original author's note
# says the pairs are an individual's haplotypes, which appear to have been phased.
averaged_mut_counts = 0.5 * (row_freqs[0::2] + row_freqs[1::2])

# (3) Scale data (unit L2 norm per row with sklearn's defaults).
from sklearn.preprocessing import normalize
mut_counts = normalize(averaged_mut_counts)

## Principal component analysis

> 1. kmers as features, samples as observations

In [6]:
from sklearn.decomposition import PCA

# Number of principal components to keep.
n_comp= 3

# PCA with rows of mut_counts as observations and its columns as features.
# The randomized SVD solver is stochastic; pinning random_state makes the
# coordinates identical on every Restart & Run All.
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized', random_state=0)
features = pca.fit_transform(mut_counts)

# Fraction of the total variance explained by each kept component.
var_comps= pca.explained_variance_ratio_


In [7]:
from plotly.subplots import make_subplots
from plotly import tools  # not needed by this cell; kept so later cells that call `tools.*` still find it

# Scatter plots of the sample PCA: PC1 vs PC2 (left) and PC1 vs PC3 (right),
# one trace per population label found in `indices`.

# Marker colour for each population label.
colors_pres= {
'littoralis': 'blue',
'brevicaudus': 'green',
'tcheliensis': 'orange',
'lasiotis': 'brown',
'mulatta': 'purple',
'CH': 'red'
}

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated tools.make_subplots.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    # `col` is both the subplot column and the index of the component drawn on y.
    for pop, members in indices.items():
        fig_subplots.add_trace(
            go.Scatter(
                x= features[members,0],
                y= features[members,col],
                mode= "markers",
                marker= {
                    'color': colors_pres[pop],
                    'line': {'width': 1},
                    'size': 8,
                    'symbol': 'circle',
                    "opacity": 1
                },
                name= str(pop),
                # One legend entry per population; it toggles the points in both panels.
                legendgroup= str(pop),
                showlegend= (col == 1)
            ),
            row=1, col=col)

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)

# Both panels share PC1 on the x axis; set it once for every x axis.
fig_subplots.update_xaxes(title= 'PC1')

# A separate go.Layout used to be passed next to the figure here. When `data`
# is itself a Figure, plotly takes data and layout from that figure, so the
# extra layout never took effect and has been removed.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'individuals')
iplot(fig)


> 2. kmers as observations, samples as features.

In [8]:
from sklearn.decomposition import PCA
from plotly.subplots import make_subplots

# PCA with mutation types as observations and samples as features, followed by
# scatter plots of PC1 vs PC2 (left) and PC1 vs PC3 (right).

# Split mutation types in two groups by the second character of their label
# (presumably the centre base of the 3-mer -- confirm against the input files):
# 'A' or 'T' on one side, 'C' or 'G' on the other.
mut_indices= {
    z:[x for x in range(len(mut_list)) if mut_list[x][1] in z] for z in ['AT','CG']
}

# Marker colour for each group.
col_mut= {
    'AT': 'blue',
    'CG': 'red'
}

n_comp= 3

# The randomized SVD solver is stochastic; pinning random_state makes the
# coordinates identical on every run.
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized', random_state=0)
Comps = pca.fit_transform(mut_counts.T)

# Fraction of the total variance explained by each kept component.
var_comps= pca.explained_variance_ratio_

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated tools.make_subplots.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    # `col` is both the subplot column and the index of the component drawn on y.
    for group, members in mut_indices.items():
        fig_subplots.add_trace(
            go.Scatter(
                x= Comps[members,0],
                y= Comps[members,col],
                mode= "markers",
                text= [mut_list[x] for x in members],
                marker= {
                    'color': col_mut[group],
                    'line': {'width': 1},
                    'size': 8,
                    'symbol': 'circle',
                    "opacity": 1
                },
                name= group,
                # One legend entry per group; it toggles the points in both panels.
                legendgroup= group,
                showlegend= (col == 1)
            ),
            row=1, col=col)

    fig_subplots.update_yaxes(
        title= 'PC{}: {}'.format(col + 1, round(var_comps[col],3)),
        row=1, col=col)

# Both panels share PC1 on the x axis; set it once for every x axis.
fig_subplots.update_xaxes(title= 'PC1: {}'.format(round(var_comps[0],3)))

# The explained-variance axis titles used to live in a separate go.Layout that
# was passed next to the figure. When `data` is itself a Figure, plotly takes
# data and layout from that figure, so those titles were never shown; they are
# now set on the axes directly. The zero margins from that unused layout are
# not applied, which keeps the margins the figure has always been drawn with.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'kmers')
iplot(fig)
