In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import scale

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it
from itertools import product

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
    """Return a defaultdict whose missing values are further such defaultdicts.

    Nested levels are created on first access, so ``d['a']['b'] = 1`` works
    without initialising ``d['a']`` beforehand.
    """
    nested_factory = recursively_default_dict
    return collections.defaultdict(nested_factory)

from matplotlib.collections import BrokenBarHCollection
import re

import gzip

import tempfile

from structure_tools.Modules_tools import return_fsts

# Reference palette of named colours.
# NOTE(review): 'darkorange' and 'darkseagreen' each appear twice, and
# 'darkgray'/'darkgrey' and 'darkslategray'/'darkslategrey' are spelling variants,
# so the list has fewer distinct colours than entries.
# NOTE(review): not referenced by any cell visible in this notebook.
PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

### Read VCF

In [2]:
from structure_tools.vcf_geno_tools import read_geno_nanumv2

# Directory holding the VCF, relative to the notebook's working directory.
Home= 'vcf/'

# Region covered by the VCF. `start` and `end` are kept as strings because
# they are interpolated into the file name; later cells convert them with int().
Chr= '8'
start= '3000000'
end= '5000000'
# NOTE(review): the 'chrchr' prefix looks like a doubled 'chr' - confirm it
# matches the file name on disk before changing it.
filename= Home + 'chrchr{}.{}-{}.chr{}.vcf.gz'.format(Chr,start,end,Chr)


# NOTE(review): row_info is not passed to read_geno_nanumv2 and is not used
# by any visible cell.
row_info= 6
header_info= 9
phased= False


# genotype is indexed [individual, marker] (see the prints below); summary is
# the per-variant table; Names is presumably the sample names - TODO confirm.
genotype, summary, Names= read_geno_nanumv2(filename, header_info= header_info,phased= phased)


print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))
summary.head()

Number of markers: 46437
Number of individuals: 81


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,8,3000011,.,G,A,202.59,PASS,.,GT:AD:DP:FT:GQ:PL
1,8,3000013,.,G,C,417.6,PASS,.,GT:AD:DP:FT:GQ:PL
2,8,3000137,.,C,T,60.79,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL
3,8,3000144,.,G,A,31165.8,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL
4,8,3000164,.,C,T,9845.79,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL


## Read fasta



In [3]:
from fasta_utilities import (
    reference_sequence, get_mutations, get_by_path, 
    set_by_path, fasta_get_freq, get_complement,
    complement_dicts,collapse_freqs, kmer_dict_init,
    collapse_freqs, vcf_kmers, geno_kmers, vcf_muts_matrix,
    geno_muts_v2, kmer_comp_index
)

In [4]:

# Load the reference sequence for one chromosome of the chosen assembly.
# NOTE(review): dir_fasta is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
dir_fasta= 'D:/GitHub/fine-scale-mutation-spectrum-master/Subset_explore/data/'
# NOTE(review): `chrom` repeats the value of `Chr` from the VCF cell; the two
# must be kept in sync by hand.
chrom= '8'
reference= 'rheMac10'

refseq= reference_sequence(chrom,reference,dir_launch= dir_fasta)
# Decode to str; reference_sequence presumably returns bytes - TODO confirm.
refseq= refseq.decode()


> Get mutation indices.

In [5]:
# Build `indices`: label -> list of 0-based positions, taken from the header
# (first) line of the per-lineage derived-allele table. The first header
# field is skipped, so position 0 is the second field.
# Later cells use these lists as row indices into the per-individual PCA
# coordinates, i.e. they presumably identify individuals - TODO confirm.
align= 'rheMac10'
vcf_data= 'vcf_data'
# The backslash is escaped explicitly: '\R' is not a valid escape sequence
# and only worked because Python leaves unknown escapes untouched (it warns
# about them in recent versions). The resulting string is unchanged.
# NOTE(review): absolute, machine-specific path.
outdir= 'D:\\Rhesus_test/fine-scale-mutation-spectrum-master/{}_finescale_mut_spectra_vcf.{}/'.format(align,vcf_data)
# NOTE(review): this rebinds `filename`, which previously held the VCF path.
filename= outdir + 'derived_each_lineage_chr{}_nosingle.txt'.format(chrom)

# Context manager closes the handle even if reading fails.
with open(filename,'r') as infile:
    lines= infile.readlines()

if not lines:
    raise ValueError('{} is empty: expected a header line'.format(filename))

s= lines[0].strip('\n').split(' ')

indices= {}
for i in range(1,len(s)):
    # Column i of the header corresponds to position i - 1.
    indices.setdefault(s[i], []).append(i - 1)

RIP

### Window analysis

Estimate the mutation profile within a local genomic window. The window is delimited by `wstart` and `wend`.

Choose the k-mer size used to construct the profile with `ksize` (int), and whether to collapse complementary mutations with `collapsed` (boolean).

In [6]:
### Start playing
# Window bounds. 1e5 and 1.5e6 are float literals, so wstart, wend and Wlen
# are floats; they are passed on unchanged to vcf_muts_matrix below.
wstart= int(start) + 1e5
wend= wstart + 1.5e6
Wlen= wend - wstart
ksize= 3 # odd.
bases = 'ACGT'
# NOTE(review): `collapsed` is not referenced by any visible cell.
collapsed= True

# Variants whose 0-based position (POS - 1) lies in [wstart, wend], selected
# with one vectorised comparison instead of a per-row pandas lookup.
# The resulting positions are also used as row labels for `summary`, which
# (like the range()-based lookup this replaces) assumes a default 0..n-1 index.
positions= summary.POS.astype(int).values - 1
genotype_parse= np.flatnonzero((positions >= wstart) & (positions <= wend)).tolist()
Window= genotype[:,genotype_parse]
subset_summary= summary.loc[genotype_parse,:].reset_index()

print(Window.shape)

##
# Per-variant mutation matrix for the window, then per-individual counts;
# the result has one row per individual (see the displayed shape).
mut_matrix= vcf_muts_matrix(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases)
ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)
ind_collapsed_mat.shape


(81, 33186)


(81, 192)

In [7]:
from sklearn.decomposition import PCA
n_comp= 3

# PCA of the per-individual mutation profile matrix (rows are individuals).
# random_state pins the randomized SVD solver so that re-running the cell
# yields the same coordinates.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)
features = pca.fit_transform(ind_collapsed_mat)

# Fraction of variance explained by each retained component.
var_comps= pca.explained_variance_ratio_


In [8]:
from plotly.subplots import make_subplots
from plotly import tools

# Marker colour for each label found in `indices`.
# NOTE(review): a label missing from this dict raises KeyError below.
colors_pres= {
'littoralis': 'blue',
'brevicaudus': 'green',
'tcheliensis': 'orange',
'lasiotis': 'brown',
'mulatta': 'purple',
'CH': 'red'
}


# Two panels sharing PC1 on the x axis: PC1 vs PC2 and PC1 vs PC3.
titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots (imported above) replaces the deprecated
# plotly.tools wrapper.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[indices[i],0],
            y= features[indices[i],col],
            mode= "markers",
            marker= {
            'color': colors_pres[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i),
          # One legend entry per label; it toggles the label in both panels.
          legendgroup= str(i),
          showlegend= col == 1
        ) for i in list(indices.keys())]

    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)

# Applies to every x axis, so it only needs to run once.
fig_subplots.update_xaxes(title= 'PC1')

# When go.Figure is given a Figure as `data` it takes that figure's layout and
# ignores a separate `layout=` argument, so no extra Layout is built here.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'individuals')
iplot(fig)


In [9]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Cluster individuals in PCA space with MeanShift.
bandwidth = estimate_bandwidth(features, quantile=0.3, n_samples=features.shape[0])

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
ms.fit(features)
labels = ms.labels_
# Cluster label -> row positions in `features`.
# NOTE(review): the name `zlib` shadows the standard-library module of the
# same name; it is kept because other cells read it.
zlib= {
    z: np.flatnonzero(labels == z).tolist() for z in sorted(set(labels))
}

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated plotly.tools wrapper.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[zlib[i],0],
            y= features[zlib[i],col],
            mode= "markers",
            marker= {
            # Explicit colour so a cluster looks the same in both panels;
            # default colours would otherwise cycle across all traces.
            'color': PCA_color_ref[int(i) % len(PCA_color_ref)],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i),
          # One legend entry per cluster; it toggles the cluster in both panels.
          legendgroup= str(i),
          showlegend= col == 1
        ) for i in list(zlib.keys())]

    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)

# Applies to every x axis, so it only needs to run once.
fig_subplots.update_xaxes(title= 'PC1')

# When go.Figure is given a Figure as `data` it takes that figure's layout and
# ignores a separate `layout=` argument, so no extra Layout is built here.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'individuals')
iplot(fig)

In [16]:
from sklearn.decomposition import PCA

mutations= get_mutations(bases= bases,ksize= ksize)
kmers, kmer_idx= kmer_comp_index(mutations)

# One text label per mutation type, used as hover text.
mut_list= ['_'.join(x) for x in kmer_idx.values()]

# Group mutation types by the second character of their label.
# NOTE(review): index 1 is presumably the central base of a 3-mer; if so it
# should follow ksize (ksize // 2) - confirm against kmer_comp_index.
mut_indices= {
    z:[x for x in range(len(mut_list)) if mut_list[x][1] in z] for z in ['AT','CG']
}

col_mut= {
    'AT': 'blue',
    'CG': 'red'
}

n_comp= 3

# PCA with mutation types as observations (transpose of the individual matrix).
# random_state pins the randomized SVD solver for reproducible coordinates.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)
Comps = pca.fit_transform(ind_collapsed_mat.T)

var_comps= pca.explained_variance_ratio_

# Axis titles carrying the explained-variance ratio of each component.
pc_titles= ['PC{}: {}'.format(k + 1, round(var_comps[k],3)) for k in range(n_comp)]

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated plotly.tools wrapper.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= Comps[mut_indices[i],0],
            y= Comps[mut_indices[i],col],
            mode= "markers",
            text= [mut_list[x] for x in mut_indices[i]],
            marker= {
            'color': col_mut[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
            name= i,
            # One legend entry per group; it toggles the group in both panels.
            legendgroup= i,
            showlegend= col == 1
        ) for i in list(mut_indices.keys())]

    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)

    fig_subplots.update_yaxes(title= pc_titles[col],row=1, col=col)

# Applies to every x axis, so it only needs to run once.
fig_subplots.update_xaxes(title= pc_titles[0])

# The variance-annotated titles are set on the subplot axes above. They used
# to live in a go.Layout passed next to a Figure, which go.Figure ignores
# (it takes the layout of the Figure given as `data`), so they never showed.
# The zero margins from that ignored layout are not applied, since they would
# leave no room for the figure and subplot titles.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'kmers')
iplot(fig)


#### Across windows



In [17]:
# For Nl randomly placed windows of length Wlen: build the per-individual
# mutation profile, cluster individuals with MeanShift in PCA space, and store
# the mean profile of every cluster together with the window start.
# NOTE(review): the loop rebinds Window, ind_collapsed_mat, features, pca,
# labels and zlib, which earlier cells also define.
Wlen= 1e4
ksize= 3
n_comp= 4

Nl= 200
# Seeded generator so the sampled windows, and everything derived from them,
# are the same on every run.
rng= np.random.RandomState(0)
start_list= rng.uniform(int(start),int(end) - Wlen,Nl)
start_list= np.array(start_list,dtype= int)
#start_list= np.arange(int(start),int(end) - Wlen,1e4)
comp_dict, comp_index= complement_dicts(bases= bases,ksize= ksize)

window_muts= []
window_list= []

# 0-based variant positions, computed once instead of once per window.
# Positions are reused as row labels of `summary`, which assumes a default
# 0..n-1 index.
positions= summary.POS.astype(int).values - 1

for wstart in start_list:
    wstart= int(wstart)
    wend= wstart + Wlen

    genotype_parse= np.flatnonzero((positions >= wstart) & (positions <= wend)).tolist()
    Window= genotype[:,genotype_parse]
    subset_summary= summary.loc[genotype_parse,:].reset_index()

    # Skip windows with too few variants to build a profile from.
    if Window.shape[1] > 10:

        mut_matrix= vcf_muts_matrix(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases)
        ind_collapsed_mat= geno_muts_v2(np.array(Window), mut_matrix)

        # Replace NaN/inf so that scaling and PCA can run.
        ind_collapsed_muts= np.nan_to_num(ind_collapsed_mat)
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)
        features = pca.fit_transform(scale(ind_collapsed_muts))

        bandwidth = estimate_bandwidth(features, quantile=0.2, n_samples=features.shape[0])

        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
        ms.fit(features)
        labels = ms.labels_

        # Cluster label -> row positions of the individuals in that cluster.
        zlib= {
            z: np.flatnonzero(labels == z).tolist() for z in sorted(set(labels))
        }

        # Mean (unscaled) profile per cluster.
        average_muts= [np.mean(ind_collapsed_muts[zlib[i]],axis= 0) for i in zlib.keys()]

        # window_list stays aligned with window_muts: one start per cluster row.
        window_list.extend([wstart] * len(zlib))
        window_muts.extend(average_muts)


window_muts= np.array(window_muts)
print(window_muts.shape)



invalid value encountered in true_divide



(1007, 192)


In [18]:

n_comp= 3

# Random sub-sample of the window/cluster profiles. The sample size is capped
# at the number of available rows: that number changes between runs, and
# asking for more rows than exist without replacement raises ValueError.
n_sub= min(890, window_muts.shape[0])
# Seeded generator so the same rows are drawn on every run.
rng= np.random.RandomState(0)
subsel= list(rng.choice(list(range(window_muts.shape[0])),n_sub,replace= False))

data= scale(window_muts[subsel])
# random_state pins the randomized SVD solver for reproducible coordinates.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)
features = pca.fit_transform(data)

var_comps= pca.explained_variance_ratio_

####

from sklearn.cluster import MeanShift, estimate_bandwidth
bandwidth = estimate_bandwidth(features, quantile=0.3, n_samples=features.shape[0])

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
ms.fit(features)
labels = ms.labels_

# Alternative kept for reference (previously a disabled string-literal block):
#   from sklearn.cluster import KMeans
#   kmeans = KMeans(n_clusters=2, random_state=0).fit(features)
#   labels= kmeans.labels_

###
# Cluster label -> row positions in `features`, i.e. positions within the
# sub-sample; map through `subsel` to get rows of window_muts / window_list.
zlib= {
    z: np.flatnonzero(labels == z).tolist() for z in sorted(set(labels))
}

# Drop singleton clusters.
zlib= {z:g for z,g in zlib.items() if len(g) > 1}

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated plotly.tools wrapper.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[zlib[i],0],
            y= features[zlib[i],col],
            mode= "markers",
            marker= {
            # Explicit colour so a cluster looks the same in both panels;
            # default colours would otherwise cycle across all traces.
            'color': PCA_color_ref[int(i) % len(PCA_color_ref)],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i),
          # One legend entry per cluster; it toggles the cluster in both panels.
          legendgroup= str(i),
          showlegend= col == 1
        ) for i in list(zlib.keys())]

    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)

# Applies to every x axis, so it only needs to run once.
fig_subplots.update_xaxes(title= 'PC1')

# When go.Figure is given a Figure as `data` it takes that figure's layout and
# ignores a separate `layout=` argument, so no extra Layout is built here.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'individuals')
iplot(fig)

In [19]:
import plotly.figure_factory as ff

# Distribution of window start positions per cluster.
# `zlib` holds row positions within the sub-sampled matrix window_muts[subsel],
# so each position is mapped back through `subsel` before indexing
# `window_list`. Indexing window_list with the raw position would pick the
# start of an unrelated window.
hist_data= [[window_list[subsel[x]] for x in zlib[z]] for z in zlib.keys()]
group_labels= ['gp{}'.format(str(z)) for z in zlib.keys()]

fig = ff.create_distplot(hist_data, group_labels, bin_size=1e4)
iplot(fig)

In [20]:
from sklearn.decomposition import PCA

# NOTE(review): `mutations` is not defined in this cell; it must already
# exist from the earlier k-mer PCA cell.
kmers, kmer_idx= kmer_comp_index(mutations)

# One text label per mutation type, used as hover text.
mut_list= ['_'.join(x) for x in kmer_idx.values()]

# Group mutation types by the second character of their label.
# NOTE(review): index 1 is presumably the central base of a 3-mer; if so it
# should follow ksize (ksize // 2) - confirm against kmer_comp_index.
mut_indices= {
    z:[x for x in range(len(mut_list)) if mut_list[x][1] in z] for z in ['AT','CG']
}

col_mut= {
    'AT': 'blue',
    'CG': 'red'
}

n_comp= 3

# PCA with mutation types as observations (transpose of the window/cluster
# profile matrix). random_state pins the randomized SVD solver.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized', random_state=0)
Comps = pca.fit_transform(window_muts.T)

var_comps= pca.explained_variance_ratio_

# Axis titles carrying the explained-variance ratio of each component.
pc_titles= ['PC{}: {}'.format(k + 1, round(var_comps[k],3)) for k in range(n_comp)]

titles= ['PC1-2','PC1-3']
# plotly.subplots.make_subplots replaces the deprecated plotly.tools wrapper.
fig_subplots = make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= Comps[mut_indices[i],0],
            y= Comps[mut_indices[i],col],
            mode= "markers",
            text= [mut_list[x] for x in mut_indices[i]],
            marker= {
            'color': col_mut[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
            name= i,
            # One legend entry per group; it toggles the group in both panels.
            legendgroup= i,
            showlegend= col == 1
        ) for i in list(mut_indices.keys())]

    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)

    fig_subplots.update_yaxes(title= pc_titles[col],row=1, col=col)

# Applies to every x axis, so it only needs to run once.
fig_subplots.update_xaxes(title= pc_titles[0])

# The variance-annotated titles are set on the subplot axes above. They used
# to live in a go.Layout passed next to a Figure, which go.Figure ignores
# (it takes the layout of the Figure given as `data`), so they never showed.
# The zero margins from that ignored layout are not applied, since they would
# leave no room for the figure and subplot titles.
fig = go.Figure(fig_subplots)
fig['layout'].update(title= 'kmers')
iplot(fig)
