In [1]:
import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.preprocessing import scale

import pandas as pd

from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint

import matplotlib.pyplot as plt
import itertools as it
from itertools import product

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)

import collections

def recursively_default_dict():
        return collections.defaultdict(recursively_default_dict)

from matplotlib.collections import BrokenBarHCollection
import re

import gzip

import tempfile

from structure_tools.Modules_tools import return_fsts

PCA_color_ref= ['darkseagreen','crimson', 'darkorange', 'darkblue', 'darkcyan',
            'darkgoldenrod', 'darkgray', 'darkgrey', 'darkgreen',
            'darkkhaki', 'darkmagenta', 'darkolivegreen', 'darkorange',
            'darkorchid', 'darkred', 'darksalmon', 'darkseagreen',
            'darkslateblue', 'darkslategray', 'darkslategrey',
            'darkturquoise', 'darkviolet', 'deeppink']

In [2]:

def read_geno_nanum(filename, row_info= 6,header_info= 9,phased= False):

    info_summ= {}
    info_save= list(range(row_info))

    header_len= header_info
    summary= []

    Miss= recursively_default_dict()
    
    Input= gzip.open(filename,'r')

    genotype= []
    d= 0

    for line in Input:    
        line= line.decode().strip().split()
        
        if line[0] != "#CHROM" and d == 0:
            continue

        if line[0] == "#CHROM":
            #print(info_summ)
            line= '\t'.join(line)
            line= ''.join(filter(lambda ch: ch not in "#", line))
            line= line.split()
            columns= line[:header_len]
            Names= line[header_len:]
            d += 1
            continue
        
        if d > 0:
            #line= line.split()
            seq= []
            #print(line)
            info= line[:header_len]
            chrom= re.search(r'\d+', line[0]).group()
            info[0]= chrom
            summary.append(info)
            #print(chrom)
            #print(line)
            
            for ind in range(header_len,len(line)):
                
                locus= line[ind]
                #print(locus)
                alleles= locus.split(':')[0]
                
                #print(alleles)
                
                if '.' in alleles:
                    alleles= ''.join([[x,'0'][int(x == '.')] for x in list(alleles)])
                alleles= list(map(int, re.findall(r'\d+', alleles)))
                if len(alleles) != 2:
                    print(alleles)
                if phased:
                    seq.extend(alleles)
                else:
                    seq.append(sum(alleles))

            genotype.append(seq)
            d += 1

    Input.close()

    summary= np.array(summary)
    summary= pd.DataFrame(summary,columns= columns)
    genotype= np.array(genotype).T

    return genotype, summary, Names



### Read VCF

In [3]:
#from structure_tools.vcf_geno_tools import read_geno_nanum

Home= 'vcf/'

Chr= '8'
start= '3000000'
end= '5000000'
filename= Home + 'chrchr{}.{}-{}.chr{}.vcf.gz'.format(Chr,start,end,Chr)


row_info= 6
header_info= 9
phased= False


genotype, summary, Names= read_geno_nanum(filename, row_info= row_info, header_info= header_info,phased= phased)


print('Number of markers: {}'.format(genotype.shape[1]))
print('Number of individuals: {}'.format(genotype.shape[0]))
summary.head()

Number of markers: 46437
Number of individuals: 81


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT
0,8,3000011,.,G,A,202.59,PASS,.,GT:AD:DP:FT:GQ:PL
1,8,3000013,.,G,C,417.6,PASS,.,GT:AD:DP:FT:GQ:PL
2,8,3000137,.,C,T,60.79,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL
3,8,3000144,.,G,A,31165.8,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL
4,8,3000164,.,C,T,9845.79,PASS,.,GT:AD:DP:FT:GQ:PGT:PID:PL


## Read fasta



In [4]:

dir_fasta= 'D:/GitHub/Tools_and_toys/VCF_analysis/Subset_explore/data/'
chrom= '8'
reference= 'rheMac10'

refseq= reference_sequence(chrom,reference,dir_launch= dir_fasta)
refseq= refseq.decode()


NameError: name 'reference_sequence' is not defined

In [None]:
from fasta_utilities import (
    reference_sequence, get_mutations, get_by_path, 
    set_by_path, fasta_get_freq, get_complement,
    complement_dicts,collapse_freqs, kmer_dict_init
)

In [None]:
align= 'rheMac10'
vcf_data= 'vcf_data'
outdir= 'D:\Rhesus_test/fine-scale-mutation-spectrum-master/{}_finescale_mut_spectra_vcf.{}/'.format(align,vcf_data)
filename= outdir + 'derived_each_lineage_chr{}_nosingle.txt'.format(chrom)
infile=open(filename,'r')
lines=infile.readlines()
infile.close()

s=lines[0].strip('\n').split(' ')

indices = {}
for i in range(1,len(s)):
    try:
        indices[s[i]].append(i-1)
    except KeyError:
        indices[s[i]] = [i - 1]

In [351]:
### vcf - fasta functions

def vcf_kmers(refseq, summary, comp_dict, ksize=3,start= 0,end= 0):
    '''
    get kmers in vcf.
    returns list of reference and alternative kmer indices in comp_dict.
    '''
    if end == 0:
        end= max(summary.POS)
        
    ###
    kmer_dict= {}
    kmer_list= []
    d= 0
    base_set= [bases] * ksize
    mers= product(*base_set)
    
    for kmer in mers:
        kmer= ''.join(kmer)
        kmer_dict[kmer]= d

    ###
    k5= int(ksize/2)
    k3= ksize - k5
    posKmer_ref= []
    posKmer_alt= []
    
    for x in range(summary.shape[0]):
        pos= int(summary.POS[x]) - 1
        if pos >=  (start + k5) and pos <= (end - k3):
            kmer= refseq[pos-k5: pos + k3]
            mut= kmer[:k5] + summary.ALT[x] + kmer[k3:]
            
            posKmer_ref.append(comp_dict[kmer])
            posKmer_alt.append(comp_dict[mut])
    
    return posKmer_ref, posKmer_alt


In [390]:
###########
########### get individual kmer frequencies following mutation.

def collapse_freqs(kmer_dict,comp_index):
    '''return vector of collapsed counts by kmer'''
    counts= []
    
    for kdx in comp_index.keys():
        total= [get_by_path(kmer_dict, comp) for comp in comp_index[kdx]]
        total= sum(total)
        counts.append(total)
        
    return counts


def geno_kmers(genotype, summary, refseq,comp_index,ksize= 3,bases= 'ATCG', start= 0, end= 0):
    '''get individual collapsed mutation arrays across data set.'''
    
    k5= int(ksize/2)
    k3= ksize - k5
    
    ind_dicts= {ind: fasta_get_freq(refseq,start= int(start),end= int(end),step= 1,ksize=ksize,bases= bases) for ind in range(genotype.shape[0])}
    collapsed= []
    
    for snp in range(genotype.shape[1]):
        alleles= genotype[:,snp]
        
        changes= [x for x in range(len(alleles)) if alleles[x] > 0]
        pos= int(summary.POS[snp]) - 1
        
        kmer= refseq[pos-k5: pos+k3]
        kmer_comp= get_complement(kmer)
        mut= kmer[:k5] + summary.ALT[snp] + kmer[k3:]
        mut_comp= get_complement(mut)
        
        for ind in changes:
            get_by_path(ind_dicts[ind], kmer[:-1])[kmer[-1]]-=  alleles[ind]
            get_by_path(ind_dicts[ind], mut[:-1])[mut[-1]]+= alleles[ind]
    
    for ind in range(genotype.shape[0]):
        collapsed_freqs= collapse_freqs(ind_dicts[ind],comp_index)
        collapsed.append(collapsed_freqs)
    
    collapsed= np.array(collapsed)
    collapsed= (collapsed.T/collapsed.sum(axis=1)).T
    
    return collapsed


def complement_dicts(bases= 'ACGT',ksize= 3):
    '''return dict of comp kmers + index dict to parse with'''
    comp_dict= {}
    comp_index= {}
    d= 0
    base_set= [bases] * ksize
    mers= product(*base_set)
    
    for kmer in mers:
        kmer= ''.join(kmer)
        if kmer not in comp_dict.keys():
            comp_dict[kmer]= d
            
            comp= get_complement(kmer)
            comp= ''.join(comp)
            comp_dict[comp]= d
            
            comp_index[d]= (kmer,comp)
            d += 1
    
    return comp_dict,comp_index


############################################
############################################


def kmer_mut_init(mutations):
    '''produce nested dictionary of nucs for a particular kmer size'''
    
    mut_lib= recursively_default_dict()
    
    d= 0
    for mut in range(len(mutations)):
        trimer= mutations[mut]
        trimer= ''.join(trimer)
        get_by_path(mut_lib, trimer[:-1])[trimer[-1]]= 0
    
    return mut_lib


def kmer_mut_index(mutations):
    '''produce nested dictionary of nucs for a particular mutation list'''
    mut_lib= recursively_default_dict()
    
    for mut in range(len(mutations)):
        trimer= ''.join(mutations[mut])
        get_by_path(mut_lib, trimer[:-1])[trimer[-1]]= mut
    
    return mut_lib


def vcf_muts(refseq,summary,start= 0,end= 0,ksize= 3,bases='ATCG'):
    ''' return vector of mutation contexts by SNP in vcf. '''
    
    mut_lib= kmer_mut_index(mutations)
    
    if end == 0:
        end= max(summary.POS)
    
    k5= int(ksize/2)
    k3= ksize - k5
    pos_mut= []
    
    for x in range(summary.shape[0]):
        pos= int(summary.POS[x]) - 1
        if pos >=  start and pos <= end:
            kmer= refseq[pos-k5: pos + k3]
            mut= kmer + summary.ALT[x]
            mut_index= get_by_path(mut_lib, list(mut))
            
            pos_mut.append(mut_index)
            
    return pos_mut


def kmer_comp_index(mutations):
    ''' return nested dictionaries of kmer mutations w/ index'''
    kmers= {}
    kmer_idx= {}
    d= 0
    for kmer in mutations:

        comp= get_complement(kmer[0]) + get_complement(kmer[1])
        comp= ''.join(comp)
        kmer= ''.join(kmer)
        
        if comp in kmers.keys():
            idx= kmers[comp]
            kmers[kmer]= idx
            kmer_idx[idx].append(kmer)
        else:
            kmers[kmer]= d
            kmer_idx[d]= [kmer]

        d += 1
    
    return kmers, kmer_idx


def collapse_muts(kmer_dict,mutations, collapse= True):
    '''return vector of counts by kmer, optional collapse by complement'''
    counts= []
    
    if collapse:
        kmers, kmer_idx= kmer_comp_index(mutations)

        for kdx in kmer_idx.keys():
            count= [get_by_path(kmer_dict, list(comp)) for comp in kmer_idx[kdx]]
            count= sum(count)
            ##
            counts.append(count)
    
    else:
        for comp in mutations:
            comp= ''.join(comp)
            kdx= get_by_path(kmer_dict, list(comp))
            counts.append(kdx)
        
    return counts


def geno_muts_v1(geno_array, kmer_dict, vcf_muts_vector, mutations, 
                 bases= 'ACGT', ksize= 3, Wl= 0, collapse= True):
    ''' return mutation spectrum'''
    
    ind_dict_store= {
        i: kmer_mut_init(mutations) for i in range(geno_array.shape[0])
    }
    collapsed= []
    
    for snp in range(geno_array.shape[1]):
        
        alleles= geno_array[:,snp]
        changes= [x for x in range(len(alleles)) if alleles[x] > 0]
        
        minus= mutations[vcf_muts_vector[snp]]
        
        prop_kmer= get_by_path(kmer_dict, minus[0]) / (Wl - ksize + 1)
        minus= ''.join(minus)        
        
        for ind in changes:
            get_by_path(ind_dict_store[ind], minus[:-1])[minus[-1]]+= alleles[ind] #/ prop_kmer
    
    for ind in range(geno_array.shape[0]):
        collapsed_freqs= collapse_muts(ind_dict_store[ind],mutations,collapse= collapse)
        collapsed.append(collapsed_freqs)
    
    collapsed= np.array(collapsed)
    collapsed= (collapsed.T/collapsed.sum(axis=1)).T
    
    return collapsed



In [391]:
### Start playing
wstart= int(start) + 9e5
wend= wstart + 5e3
Wlen= wend - wstart
ksize= 3 # odd.
bases = 'ACGT'
collapsed= True

mutations= get_mutations(bases= bases,ksize= ksize)

genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
Window= genotype[:,genotype_parse]
subset_summary= summary.loc[genotype_parse,:].reset_index()

print(Window.shape)

## fasta lib
kmer_dict= fasta_get_freq(refseq,start= int(wstart),end= int(wend),step= 1,ksize=ksize,bases= bases)
comp_dict, comp_index= complement_dicts(bases= bases,ksize= ksize)
collapsed_freqs= collapse_freqs(dict(kmer_dict),comp_index)
collapsed_freqs= np.array(collapsed_freqs) / sum(collapsed_freqs)

##
ind_collapsed_freqs= geno_kmers(Window, subset_summary, refseq, comp_index, 
                                start= wstart, end= wend, ksize= ksize , bases= bases)
##
vcf_muts_vector= vcf_muts(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases)
ind_collapsed_muts= geno_muts_v1(Window, kmer_dict, vcf_muts_vector,mutations, 
                                 bases= bases, ksize= ksize, Wl= Wlen, collapse= collapsed)

ind_collapsed_muts.shape


(81, 114)


(81, 96)

In [392]:
from sklearn.decomposition import PCA
n_comp= 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
features = pca.fit_transform(ind_collapsed_muts)

var_comps= pca.explained_variance_ratio_


In [393]:
from plotly.subplots import make_subplots
from plotly import tools

colors_pres= {
'littoralis': 'blue',
'brevicaudus': 'green',
'tcheliensis': 'orange',
'lasiotis': 'brown',
'mulatta': 'purple',
'CH': 'red'
}


titles= ['PC1-2','PC1-3']
fig_subplots = tools.make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[indices[i],0],
            y= features[indices[i],col],
            mode= "markers",
            marker= {
            'color': colors_pres[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i)
        ) for i in list(indices.keys())]
    
    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)
    
    fig_subplots.update_xaxes(title= 'PC1')

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)


layout = go.Layout(
    title= 'mutation profile PCA'
)


fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(title= 'individuals')
iplot(fig)


In [394]:
from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(features, quantile=0.3, n_samples=features.shape[0])

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
ms.fit(features)
labels = ms.labels_
zlib= {
    z: [x for x in range(len(labels)) if labels[x] == z] for z in list(set(labels))
}

titles= ['PC1-2','PC1-3']
fig_subplots = tools.make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[zlib[i],0],
            y= features[zlib[i],col],
            mode= "markers",
            marker= {
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i)
        ) for i in list(zlib.keys())]
    
    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)
    
    fig_subplots.update_xaxes(title= 'PC1')

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)


layout = go.Layout(
    title= 'mutation profile PCA'
)


fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(title= 'individuals')
iplot(fig)

In [396]:
from sklearn.decomposition import PCA

kmers, kmer_idx= kmer_comp_index(mutations)

mut_list= ['_'.join(x) for x in kmer_idx.values()]
#mut_list= ['_'.join(x) for x in mutations]

mut_indices= {
    z:[x for x in range(len(mut_list)) if mut_list[x][1] in z] for z in ['AT','CG']
}

col_mut= {
    'AT': 'blue',
    'CG': 'red'
}

n_comp= 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
Comps = pca.fit_transform(ind_collapsed_muts.T)

var_comps= pca.explained_variance_ratio_

titles= ['PC1-2','PC1-3']
fig_subplots = tools.make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= Comps[mut_indices[i],0],
            y= Comps[mut_indices[i],col],
            mode= "markers",
            text= [mut_list[x] for x in mut_indices[i]],
            marker= {
            'color': col_mut[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
            name= i
        ) for i in list(mut_indices.keys())]
    
    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)
        
    fig_subplots.update_xaxes(title= 'PC1')

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)


layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comps[0],3)))
)


fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(title= 'kmers')
iplot(fig)


#### Across windows



In [397]:
Wlen= 1e4
ksize= 3
n_comp= 4

Nl= 2000
start_list= np.random.uniform(int(start),int(end) - Wlen,Nl)
start_list= np.array(start_list,dtype= int)
#start_list= np.arange(int(start),int(end) - Wlen,1e4)
comp_dict, comp_index= complement_dicts(bases= bases,ksize= ksize)

window_muts= []
window_list= []

for wstart in start_list:
    wstart= int(wstart)
    wend= wstart + Wlen 

    genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x]) - 1 >= wstart and int(summary.POS[x]) - 1 <= wend]
    Window= genotype[:,genotype_parse]
    subset_summary= summary.loc[genotype_parse,:].reset_index()
    
    if Window.shape[1] > 10:
        
        print(wstart, Window.shape[1])
        
        kmer_dict= fasta_get_freq(refseq,start= int(wstart),end= int(wend),step= 1,ksize=ksize,bases= bases)
        
        vcf_muts_vector= vcf_muts(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,bases=bases)
        
        ind_collapsed_muts= geno_muts_v1(Window, dict(kmer_dict), vcf_muts_vector,comp_index, mutations, Wl= Wlen,ksize= ksize,bases=bases)
        
        ind_collapsed_muts= scale(ind_collapsed_muts)
        ind_collapsed_muts= np.nan_to_num(ind_collapsed_muts)
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
        features = pca.fit_transform(ind_collapsed_muts)

        bandwidth = estimate_bandwidth(features, quantile=0.2, n_samples=features.shape[0])
        
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
        ms.fit(features)
        labels = ms.labels_
        
        zlib= {
            z: [x for x in range(len(labels)) if labels[x] == z] for z in list(set(labels))
        }
        
        average_muts= [np.mean(ind_collapsed_muts[zlib[i]],axis= 0) for i in zlib.keys()]
        
        window_list.extend([wstart] * len(zlib))
        window_muts.extend(average_muts)


window_muts= np.array(window_muts)
print(window_muts.shape)


4815350 283
4742159 328
4455301 279
3433950 192
3987519 195
4823040 335
3090977 222
4135689 170
4211227 256
3259907 237
3865635 190
4443040 238
4939687 280
4027791 277
3912915 228
4627936 269
3308265 182
3020048 220
3230663 237
3208078 219
3130478 130
4421162 252
3855804 202
4102186 271
3489687 204
4096222 357
3241454 290
3664825 146
4805886 282
4027967 273
3885813 257
4917153 228
4170353 211
4928720 339
3248020 298
3137974 121
4441925 249
4234506 218
3283199 155
3635920 211
4034434 263
4449791 285
4941410 286
3485122 220
3191290 195
3908201 240
4114614 253
4736157 193
4043696 244
4461631 222
4664533 246
4765460 301
4570405 230
4190634 172
3292105 163
3897465 247
3148520 192
4626046 236
3395932 205
4309367 265
3860666 166
4235909 231
4800271 220
4798548 256
3267422 203
4882778 297
4239049 256
4363089 283
4943273 308
4513678 240
4196436 193
3296961 148
4422368 235
3248668 275
3019506 208
3801258 256
3443222 183
3306107 215
3624814 221
4568821 236
3898211 249
4409743 256
4047043 278
4777


invalid value encountered in true_divide



3051536 250
4403573 231
3243924 298
3927950 235
3477764 260
4547917 281
4929689 368
3731269 230
4324971 242
3503273 229
4549874 285
3005044 237
4493586 253
4624990 256
4403958 228
3225640 241
4453648 289
3435581 202
4254054 232
4702971 255
4180781 207
3279046 165
4461262 232
4648767 195
4802973 246
4616348 211
4697260 226
3839854 130
4481979 195
4033649 262
4989356 273
3275337 192
3931994 256
4595483 267
3480361 227
3995012 164
4961700 284
4694508 222
3999472 150
3785757 140
4331755 299
3747762 221
3817102 204
4156038 208
3242091 292
3960651 213
4009928 230
4860704 235
3451919 231
4610731 199
3610760 174
3123838 205
3772633 228
4404625 220
4929490 360
4436158 207


KeyboardInterrupt: 

In [398]:

window_muts= np.array(window_muts)
window_muts.shape


(711, 96)

In [399]:

n_comp= 3

subsel= list(np.random.choice(list(range(window_muts.shape[0])),700,replace= False))

data= scale(window_muts[subsel])
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
features = pca.fit_transform(data)

var_comps= pca.explained_variance_ratio_

####

from sklearn.cluster import MeanShift, estimate_bandwidth
bandwidth = estimate_bandwidth(features, quantile=0.3, n_samples=features.shape[0])

ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
ms.fit(features)
labels = ms.labels_
'''

###

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=0).fit(features)
labels= kmeans.labels_
'''
###
zlib= {
    z: [x for x in range(len(labels)) if labels[x] == z] for z in list(set(labels))
}

zlib= {z:g for z,g in zlib.items() if len(g) > 1}

titles= ['PC1-2','PC1-3']
fig_subplots = tools.make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= features[zlib[i],0],
            y= features[zlib[i],col],
            mode= "markers",
            marker= {
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
          name= str(i)
        ) for i in list(zlib.keys())]
    
    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)
    
    fig_subplots.update_xaxes(title= 'PC1')

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)


layout = go.Layout(
    title= 'mutation profile PCA'
)


fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(title= 'individuals')
iplot(fig)

In [400]:
import plotly.figure_factory as ff

hist_data= [[window_list[x] for x in zlib[z]] for z in zlib.keys()]
group_labels= ['gp{}'.format(str(z)) for z in zlib.keys()]

fig = ff.create_distplot(hist_data, group_labels, bin_size=1e4)
iplot(fig)

In [401]:
from sklearn.decomposition import PCA

kmers, kmer_idx= kmer_comp_index(mutations)

mut_list= ['_'.join(x) for x in kmer_idx.values()]

mut_indices= {
    z:[x for x in range(len(mut_list)) if mut_list[x][1] in z] for z in ['AT','CG']
}

col_mut= {
    'AT': 'blue',
    'CG': 'red'
}

n_comp= 3

pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
Comps = pca.fit_transform(window_muts.T)

var_comps= pca.explained_variance_ratio_

titles= ['PC1-2','PC1-3']
fig_subplots = tools.make_subplots(rows= 1, cols=2,
                             subplot_titles=tuple(titles))

for col in range(1,3):
    fig_data= [go.Scatter(
            x= Comps[mut_indices[i],0],
            y= Comps[mut_indices[i],col],
            mode= "markers",
            text= [mut_list[x] for x in mut_indices[i]],
            marker= {
            'color': col_mut[i],
            'line': {'width': 1},
            'size': 8,
            'symbol': 'circle',
          "opacity": 1
          },
            name= i
        ) for i in list(mut_indices.keys())]
    
    for trace1 in fig_data:
        fig_subplots.append_trace(trace1, 1, col)
        
    fig_subplots.update_xaxes(title= 'PC1')

    fig_subplots.update_yaxes(title= 'PC' + str(col + 1),row=1, col=col)


layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    ),
    yaxis=dict(
        title='PC2: {}'.format(round(var_comps[1],3))),
    xaxis=dict(
    title= 'PC1: {}'.format(round(var_comps[0],3)))
)


fig = go.Figure(data=fig_subplots, layout=layout)
fig['layout'].update(title= 'kmers')
iplot(fig)
