In [2]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess
import time

import collections
def recursively_default_dict():
    return collections.defaultdict(recursively_default_dict)

import matplotlib
matplotlib.use('Agg')

import matplotlib.pyplot as plt

In [3]:
from tools.mcounter_tools import (
    read_vcf_allel, ind_assignment_scatter_v1, MC_sample_matrix_v1,
    heatmap_v2
)

In [10]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
import re
import pandas as pd


## directories
main_dir= os.getcwd() + '/'
count_dir= main_dir + 'mutation_counter/count/'
dir_launch= main_dir + 'mutation_counter'
muted_dir= main_dir + 'mutation_counter/data/mutation_count/'
sims_dir= main_dir + 'mutation_counter/data/MutVar_test/'
diffs= False

mutlog= 'toMut.log'
min_size= 5
sampling= [5,50,5]
sample_sim= 0
collapsed= False
row= 48
col= 4
bases= 'ACGT'

data, data_freqs = MC_sample_matrix_v1(min_size= min_size, samp= sampling, count_dir= count_dir, 
                        dir_launch= dir_launch,main_dir= main_dir,sim_dir= sims_dir,
                          muted_dir= muted_dir, diffs= diffs, row= row,bases= bases,
                       exclude= False,sample_sim= sample_sim,collapsed= collapsed)


missing: 0, no vcf: 6
available 12
sample 12
(2000, 6574) MatVar1MEquilM0C1.33219664
(2000, 6401) MatVar1MEquilM5C9.118995301
(2000, 6694) MatVar1MEquilM5C9.107684780
(2000, 6736) MatVar1MEquilM0C1.224063754
(2000, 6685) MatVar1MEquilM0C17.10995261
(2000, 6581) MatVar1MEquilM0C1.237002349
(2000, 6416) MatVar1MEquilM5C9.86444953
(2000, 6501) MatVar1MEquilM5C9.71536410
(2000, 6658) MatVar1MEquilM0C17.41991018
(2000, 6536) MatVar1MEquilM5C9.105466280
(2000, 6357) MatVar1MEquilM3C16.73540471
(2000, 6653) MatVar1MEquilM3C17.10995261
time elapsed: 5812.556041955948s


## Data analysis - Pairwise population comparison

In [11]:
from tools.fasta_utilities import (
    get_mutations, get_by_path, kmer_comp_index, kmer_mut_index,
    fasta_get_freq,kmer_dict_init
    )


def fasta_get_freq(seq,start= 0,end= 0,step= 1,ksize=3,bases= 'ATCG'):
    '''return count of kmer across fasta region'''
    kmer_dict= kmer_dict_init(ksize= ksize,bases=bases)
    if end == 0:
        end= len(seq) - ksize

    for ki in range(start,end,step):
        kmer= seq[ki:ki+ksize]
        if 'N' in kmer:
            continue
        get_by_path(kmer_dict, kmer[:-1])[kmer[-1]] += 1

    return kmer_dict


def kmer_freq_balance(kmer_dict, mutations, fasta_len= 10000, bases= 'ACGT',ksize= 3):
    '''return list of possible kmer mutations'''

    mutation_sum= []
    Nkmers= fasta_len - ksize

    for idx in range(len(mutations)):
        mut= mutations[idx]
        prop= get_by_path(kmer_dict,mut[0])
        prop= prop / Nkmers
        mutation_sum.append(prop)

    return np.array(mutation_sum).reshape(1,-1)


def get_fasta_prop(sim,sim_dir,mutations,ksize= 3,bases= 'ACGT'):
    
    chrom= sim.split('.')[0].split('C')[1]
    
    fasta_file= sim_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

    with gzip.open(fasta_file,'r') as f:
        lines= f.readlines()
        lines= [x.decode() for x in lines]

    refseq= lines[1].strip()

    kmer_dict= fasta_get_freq(refseq,start= 0,end= 0,step= 1,ksize=ksize,bases= bases)

    ref_kmer_prop =kmer_freq_balance(kmer_dict,mutations,fasta_len= len(refseq))
    
    return ref_kmer_prop


ksize= 3
#bases= 'ATCG'
bases= 'ACGT'

mutations= get_mutations(bases= bases,ksize= ksize)
kmers, kmer_idx= kmer_comp_index(mutations)

mut_lib= kmer_mut_index(mutations)

In [12]:
from tools.mcounter_tools import read_args

p_value= 1e-5
test_m= 'chi2'
individually= False
exclude= False
frequency_range= [0,1]
extract= 'pval'
Nbins= 100
tag_ref= '_ss'

avail= list(data.keys())
ref_idx= [int(tag_ref in avail[x]) for x in range(len(avail) )]
categ= {
    z: [x for x in range(len(avail)) if ref_idx[x] == z] for z in [0,1]
}

print([len(categ[x]) for x in [0,1]])

### possible combinations per simulation.
### ref_mdict: store count proportions by mutation matrix (list of lists). PCA,(deprecatted distances )
ref_mdict= recursively_default_dict()
### ref_mats: mutation matrix file (label) for reference simulation. 
ref_mats= {}

### ref_mat_dict: mutation count by population by simulation by mut_matrix file (used for distances)
ref_mat_dict= recursively_default_dict()

### fasta_ref_dict: store kmer frequency by reference simulation (use for standardize count proportions)
fasta_ref_dict= {}
### store fasta frequencies by population for each mutation matrix label - PCA. 
fasta_pop= {}


for idx in categ[0]:
    ref= avail[idx]
    ref_dir= sims_dir + ref + '/'
    ref_args= read_args(ref,sim_dir=ref_dir)
    mut_matrix= ref_args['mut_file']
    
    fasta_kmer_prop= get_fasta_prop(ref,ref_dir,mutations,ksize= 3,bases= 'ACGT')
    fasta_ref_dict[ref]= fasta_kmer_prop
    
    ref_mats[ref]= mut_matrix
    
    batch= ref.split('C')[0]
    
    sizes= data[ref]['sizes']
    #
    
    chromosomes= [ref.split('.')[0].split('C')[1]]

    pop_counts= data[ref]['counts']

    pop_counts= {
    z: pop_counts[z] / np.sum(pop_counts[z]) for z in pop_counts.keys()
    }
    
    if mut_matrix not in ref_mdict.keys():
        ref_mdict[mut_matrix]= []
        fasta_pop[mut_matrix]= []
        
    
    for pop in pop_counts.keys():
        mlist= pop_counts[pop].reshape(1,np.prod(pop_counts[pop].shape))[0]
        mlist= mlist.reshape(1,-1)
        
        ## balance for kmer frequencies in fasta.
        #mlist= mlist * (1/fasta_kmer_prop.shape[1] / fasta_kmer_prop)
        mlist= mlist - (fasta_kmer_prop/3)
        
        fasta_pop[mut_matrix].append(list(fasta_kmer_prop[0]))
        ref_mdict[mut_matrix].append(list(mlist[0]))
        ref_mat_dict[mut_matrix][ref][pop]= mlist
    


[12, 3000]


In [13]:
from sklearn import decomposition
from sklearn import preprocessing

mat_avail= list(ref_mdict.keys())

labels= np.repeat(mat_avail,[len(ref_mdict[x]) for x in mat_avail])
mat_dict= {
    z: [x for x in range(len(labels)) if labels[x] == z] for z in list(set(labels))
}
data_ref= list(it.chain(*[ref_mdict[x] for x in mat_avail]))
data_ref= np.array(data_ref)
data_freq= list(it.chain(*[fasta_pop[x] for x in mat_avail]))
data_freq=np.array(data_freq)
data_freq.shape

(12, 192)

In [14]:
freq_array= data_freq.reshape(1,np.prod(data_freq.shape))[0]
ref_array= data_ref.reshape(1,np.prod(data_ref.shape))[0]


fig= [go.Scatter(
    x= freq_array / 3,
    y= ref_array,
    mode= 'markers'
)]

line= go.Scatter(
    x= np.linspace(0,0.02,50),
    y= np.linspace(0,0.02,50),
    mode= 'lines',
    name= 'y=x'
)



layout= go.Layout(
    xaxis= dict(
        title= "kmer freq. in fasta"
    ),
    yaxis= dict(
        title= "kmer count"
    )
    
)

Figure= go.Figure(data= fig, layout= layout)
Figure['layout'].update(title= 'kmer frequency & count across data sets. - 1MB data sets',width= 800, height= 800)
iplot(Figure)


In [15]:

pca = decomposition.PCA(n_components=3)

pca.fit(preprocessing.normalize(data_freq,axis= 0))
pca_decomp = pca.transform(data_ref)

fig= [go.Scatter3d(
    x= [pca_decomp[x,0] for x in mat_dict[i]],
    y= [pca_decomp[x,1] for x in mat_dict[i]],
    z= [pca_decomp[x,2] for x in mat_dict[i]],
    mode= 'markers',
    name= i
) for i in list(mat_dict.keys())]

layout= go.Layout()

Figure= go.Figure(data= fig, layout= layout)
iplot(Figure)

In [13]:
pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

for av in categ[1]:
    dat= [x for x in data[avail[av]]['counts'].keys() if tag_ref in x]
    dat_size= [data[avail[av]]['sizes'][x] for x in dat]
    ref_sim= avail[av].split(tag_ref)[0]
    ref_pop= [x.split('.')[0].strip(tag_ref) for x in dat]
    dat_size= [dat_size[x] / data[ref_sim]['sizes'][ref_pop[x]] for x in range(len(dat))]
    dat_size= [round(x,3) for x in dat_size]
    for p in range(len(dat)):
        pop_asso[ref_sim][ref_pop[p]][dat_size[p]][avail[av]]= dat[p]

d= 0

In [14]:

### combine simulation combination and population size ranges.
from sklearn.metrics import pairwise_distances
labels_ref= list(mat_dict.keys())

sub_pop= []
sub_prop= []
sub_label= []
sub_values= []

dub_diffs= []
euc_predict= []


for ref_sim in pop_asso.keys():
    #print(ref_sim)
    batch= ref_sim.split('C')[0]
    mut_matrix= ref_mats[ref_sim]
    # kmer_frequencies
    fasta_kmer_prop= fasta_ref_dict[ref]
    
    for pop in pop_asso[ref_sim].keys():
        for prop in pop_asso[ref_sim][pop].keys():
            
            for sub in pop_asso[ref_sim][pop][prop].keys():
                
                poppy= pop_asso[ref_sim][pop][prop][sub]
                
                pop_counts= data[sub]['counts']
                pop_counts= {z: g / np.sum(g) for z,g in pop_counts.items()}
                if not poppy:
                    continue
                mlist= pop_counts[poppy].reshape(1,np.prod(pop_counts[poppy].shape))[0]

                sub_pop.append(poppy)
                sub_values.append(list(mlist))
                sub_label.append(mut_matrix)
                sub_prop.append(prop)
                mlist=mlist.reshape(1,-1)
                ## balance for kmer frequency in respective fasta.
                mlist= mlist - (fasta_kmer_prop / 3)
                #mlist= mlist / np.sum(mlist)
                
                #ref_dists= [pairwise_distances(mlist,np.array(ref_mdict[z]),metric= 'euclidean') for z in labels_ref]
                #ref_dists= [pairwise_distances(mlist,np.mean(np.array(ref_mdict[z]),axis=0).reshape(1,-1)) for z in labels_ref]
                ref_dists= [[pairwise_distances(mlist,ref_mat_dict[z][r][p],metric= 'euclidean') for r in ref_mat_dict[z].keys() for p in ref_mat_dict[z][r].keys()  if [z,r,p] != [mut_matrix,ref,pop]] for z in labels_ref]
                ref_dists= [np.mean(x) for x in ref_dists]
                
                predict= np.argmin(np.array(ref_dists))
                predict= labels_ref[predict]
                euc_predict.append(predict)
                


sub_values= np.array(sub_values)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(pca_decomp, labels)
predictions= neigh.predict(pca.transform(sub_values))
predictions= euc_predict

In [16]:
Nbins= 50
bins= np.linspace(0,1,Nbins)
bins= np.round(bins,4)
bins= [(bins[x-1],bins[x]) for x in range(1,len(bins))]


lab_dict= {
    z: [x for x in range(sub_values.shape[0]) if sub_label[x] == z] for z in list(set(sub_label))
}

## actual props
'''
lab_prop= {
    lab: {
        prop: [x for x in lab_dict[lab] if sub_prop[x] == prop] for prop in list(set(sub_prop))
    } for lab in lab_dict.keys()
}
'''

## bins
lab_prop= {
    lab: {
        sum(bi)/2: [x for x in lab_dict[lab] if sub_prop[x] >= bi[0] and sub_prop[x] < bi[1]] for bi in bins
    } for lab in lab_dict.keys()
}


comp_dict= {
    lab: {
        prop: [int(predictions[x] == sub_label[x]) for x in lab_prop[lab][prop]] for prop in lab_prop[lab].keys()
    } for lab in lab_prop.keys()
}


comp_std= {
    lab: {
        prop: np.std(comp_dict[lab][prop]) for prop in comp_dict[lab].keys()
    } for lab in comp_dict.keys()
}


comp_dict= {
    lab: {
        prop: np.mean(comp_dict[lab][prop]) for prop in comp_dict[lab].keys()
    } for lab in comp_dict.keys()
}



In [17]:
mat_dir= 'mut_matrices/'
mat_lines= {}

for lab in ref_mdict.keys():
    with open(mat_dir + lab,'r') as fp:
        lines= fp.readlines()
        mat_lines[lab]= len(lines)


In [20]:

fig= [go.Scatter(
    y= [comp_dict[lab][prop] for prop in sorted(comp_dict[lab].keys())],
    x= sorted(comp_dict[lab].keys()),
    error_y= dict(
        array= [comp_std[lab][prop] for prop in sorted(comp_dict[lab].keys())],
        type= 'data',
        #symmetric= True,
        visible=False
    ),
    mode= 'lines',
    name= lab.split('.')[0] + '_{}'.format(mat_lines[lab])
    
) for lab in comp_dict.keys()]


layout= go.Layout(
    title= 'rate of accuracy. kmer freq controlled - 1Mb',
    xaxis= dict(
        range= [-0.01,1.03],
        title= 'relative sample size'
    ),
    yaxis= dict(
        range= [0.5,1.02],
        title= 'accuracy'
    )
)

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)


In [21]:
### False positives

fp_dict= {
    lab: {
        prop: [[int(predictions[x] == lab and sub_label[x] != lab) for x in lab_prop[olab][prop]] for olab in lab_prop.keys() if olab != lab] for prop in lab_prop[lab].keys()
    } for lab in lab_prop.keys()
}


fp_dict= {
    lab: {
        prop: list(it.chain(*fp_dict[lab][prop])) for prop in lab_prop[lab].keys()
    } for lab in lab_prop.keys()
}


fp_std= {
    lab: {
        prop: np.std(fp_dict[lab][prop]) for prop in fp_dict[lab].keys()
    } for lab in fp_dict.keys()
}


fp_dict= {
    lab: {
        prop: np.mean(fp_dict[lab][prop]) for prop in fp_dict[lab].keys()
    } for lab in fp_dict.keys()
}


In [22]:

fig= [go.Scatter(
    y= [fp_dict[lab][prop] for prop in sorted(fp_dict[lab].keys())],
    x= sorted(fp_dict[lab].keys()),
    error_y= dict(
        array= [fp_dict[lab][prop] for prop in sorted(fp_dict[lab].keys())],
        type= 'data',
        #symmetric= True,
        visible=False
    ),
    mode= 'lines',
    name= lab.split('.')[0] + '_{}'.format(mat_lines[lab])
    
) for lab in fp_dict.keys()]

layout= go.Layout(
    title= 'FP & sampling. kmer freq controlled',
    xaxis= dict(
        range= [-0.01,1.05],
        title= 'relative sample size'
    ),
    yaxis= dict(
        title= 'false positive rate'
    )
)

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)

In [23]:
comp_dict.keys()

dict_keys(['MatVar1MEquilM5_grid.txt', 'MatVar1MEquilM4_grid.txt'])

In [18]:
mat_dir= 'mut_matrices/'
trial_dict= {z: {} for z in ref_mdict.keys()}

for trial in trial_dict.keys():
    with open(mat_dir + trial, 'r') as fp:
        lines= fp.readlines()
        lines= [x.strip().split('\t') for x in lines]
        print(lines)
        trial_dict[trial]= {
            x[0]: [float(x) for x in x[1].split(',')] for x in lines
        }

[]
[['AGG', '8.333333333333334e-09,8.333333333333334e-10,0.0,8.333333333333334e-10']]
[['ATA', '8.333333333333334e-09,8.333333333333334e-10,8.333333333333334e-10,0.0'], ['CTC', '8.333333333333334e-10,8.333333333333334e-10,8.333333333333334e-09,0.0']]


### Identifying patterns without labels. 

for now, assuming mutation counts are not collapsed. 

In [84]:
ref_mat= [x for x in ref_mdict.keys() if 'M0' in x][0]

ref_kmer_dists= {
    z: [y[z] for y in ref_mdict[ref_mat]] for z in range(len(ref_mdict[ref_mat][0]))
}

ref_kmer_stats= {
    z: [np.mean(ref_kmer_dists[z]),np.std(ref_kmer_dists[z])] for z in ref_kmer_dists.keys()
}


In [64]:
ref_kmer_stats

{0: [0.0010599338718093737, 0.0013691018693713047],
 1: [0.0001917246120932921, 0.0010331753986453334],
 2: [0.00040722580216532857, 0.0009679012028305394],
 3: [-0.00011036225202659782, 0.0008033069888199043],
 4: [-7.987987171193708e-05, 0.0004236616040484021],
 5: [-0.00016586865982143073, 0.0010689998438120514],
 6: [0.00016883968169390234, 0.000648641340828054],
 7: [0.0008278656393353168, 0.0005555107306078498],
 8: [-5.455281645009562e-05, 0.0004630669082732785],
 9: [0.00035479759252324413, 0.0010766663268665835],
 10: [-0.00015375141685695215, 0.00077937511593816],
 11: [9.664261158748351e-05, 0.001596424422801843],
 12: [0.00044012558371353885, 0.0007811743223856314],
 13: [1.1411303680429128e-06, 0.00022098732295523362],
 14: [0.000570795822865586, 0.0009942470619064268],
 15: [0.00017190922407498248, 0.0005940224754373381],
 16: [-0.000922412226726153, 0.0009847853852501518],
 17: [-0.0008077266792086198, 0.0005950113837769841],
 18: [0.00012096862103334531, 0.0003854683328

In [20]:

ref_kmer_means= {
    z: np.median(np.array(ref_mdict[z]),axis= 0).reshape(48,4) for z in ref_mdict.keys()
}


In [83]:
from plotly.subplots import make_subplots
from plotly import tools

grid= 'MatVar1MEquilM0_grid.txt'
kmer_indicies= [get_by_path(mut_lib, list())]
kmer_labels= ['_'.join([''.join(x[0]),x[1]]) for x in mutations]
kmer_labels= np.array(kmer_labels).reshape(48,4)

print(trial_dict[grid].keys())

fig = [go.Heatmap(
    z= ref_kmer_means[grid],
    text= kmer_labels,
    type = 'heatmap',
    colorscale= 'RdBu'
)]

layout= go.Layout()

fig= go.Figure(data=fig, layout=layout)

fig['layout'].update(title= grid,width= 800, height= 900)
iplot(fig)

dict_keys([])


In [70]:
pop_asso= {avail[x]:recursively_default_dict() for x in categ[0]}

for av in categ[1]:
    dat= [x for x in data[avail[av]]['counts'].keys() if tag_ref in x]
    dat_size= [data[avail[av]]['sizes'][x] for x in dat]
    ref_sim= avail[av].split(tag_ref)[0]
    ref_pop= [x.split('.')[0].strip(tag_ref) for x in dat]
    dat_size= [dat_size[x] / data[ref_sim]['sizes'][ref_pop[x]] for x in range(len(dat))]
    dat_size= [round(x,3) for x in dat_size]
    for p in range(len(dat)):
        pop_asso[ref_sim][ref_pop[p]][dat_size[p]][avail[av]]= dat[p]

d= 0

In [75]:

from scipy.stats import norm
sig_value= 0.5
labels_ref= list(mat_dict.keys())

sub_pop= []
sub_prop= []
sub_label= []
sub_values= []

dub_diffs= []
euc_predict= []

muts_each= []
sig_scores= []

for ref_sim in pop_asso.keys():
    #print(ref_sim)
    batch= ref_sim.split('C')[0]
    mut_matrix= ref_mats[ref_sim]
    
    for pop in pop_asso[ref_sim].keys():
        for prop in pop_asso[ref_sim][pop].keys():
            
            for sub in pop_asso[ref_sim][pop][prop].keys():
                
                poppy= pop_asso[ref_sim][pop][prop][sub]
                
                pop_counts= data[sub]['counts']
                pop_counts= {z: g / np.sum(g) for z,g in pop_counts.items()}
                if not poppy:
                    continue
                mlist= pop_counts[poppy].reshape(1,np.prod(pop_counts[poppy].shape))[0]
                
                sub_pop.append(poppy)
                sub_values.append(list(mlist))
                sub_label.append(mut_matrix)
                sub_prop.append(prop)
                #mlist=mlist.reshape(1,-1)
                #print('hi')
                #sigs= [norm.cdf(mlist[x],loc= ref_kmer_stats[x][0],scale= ref_kmer_stats[x][1]) for x in range(len(mlist))]
                sigs= [(mlist[x] - ref_kmer_stats[x][0]) / ref_kmer_stats[x][1] for x in range(len(mlist))]
                sigs= [x for x in range(len(sigs)) if sigs[x] > 2]
                
                muts_id= [mutations[x][0] for x in sigs]
                
                mut_score= len([x for x in muts_id if x in trial_dict[mut_matrix].keys()])
                
                if muts_id:
                    mut_score= mut_score / len(muts_id)
                
                sig_scores.append(mut_score)
                muts_each.append(muts_id)
                
                
                
sub_values= np.array(sub_values)

In [85]:
sigs= [(mlist[x] - ref_kmer_stats[x][0]) / ref_kmer_stats[x][1] for x in range(len(mlist))]
sigs

[11.340995721364518,
 12.758620186535083,
 13.467432419120366,
 4.961685628096976,
 4.961685628096976,
 3.8984672792190533,
 7.974137616584427,
 9.037355965462352,
 6.556513151413862,
 6.3793100932675415,
 6.556513151413862,
 10.632183488779235,
 6.0249039769749,
 9.037355965462352,
 6.3793100932675415,
 4.961685628096976,
 4.961685628096976,
 4.784482569950656,
 1.4176244651705647,
 1.5948275233168854,
 0.7088122325852824,
 6.202107035121221,
 6.3793100932675415,
 5.316091744389618,
 8.151340674730749,
 9.568965139901312,
 9.21455902360867,
 5.138888686243297,
 6.202107035121221,
 5.138888686243297,
 6.556513151413862,
 7.442528442145465,
 8.68294984916971,
 4.430076453658015,
 5.84770091882858,
 5.84770091882858,
 16.834290523900457,
 1.772030581463206,
 2.658045872194809,
 4.430076453658015,
 4.252873395511695,
 5.493294802535939,
 9.21455902360867,
 6.733716209560183,
 7.265325383999144,
 7.974137616584427,
 7.619731500291786,
 9.568965139901312,
 6.910919267706503,
 6.379310093267

In [87]:
ref_kmer_stats

{0: [2.168404344971009e-20, 0.0008927770077167252],
 1: [2.168404344971009e-20, 0.0008927770077167252],
 2: [2.168404344971009e-20, 0.0008927770077167252],
 3: [2.168404344971009e-20, 0.0008927770077167252],
 4: [2.168404344971009e-20, 0.0008927770077167252],
 5: [2.168404344971009e-20, 0.0008927770077167252],
 6: [2.168404344971009e-20, 0.0008927770077167252],
 7: [2.168404344971009e-20, 0.0008927770077167252],
 8: [2.168404344971009e-20, 0.0008927770077167252],
 9: [2.168404344971009e-20, 0.0008927770077167252],
 10: [2.168404344971009e-20, 0.0008927770077167252],
 11: [2.168404344971009e-20, 0.0008927770077167252],
 12: [2.168404344971009e-20, 0.0008927770077167252],
 13: [2.168404344971009e-20, 0.0008927770077167252],
 14: [2.168404344971009e-20, 0.0008927770077167252],
 15: [2.168404344971009e-20, 0.0008927770077167252],
 16: [2.168404344971009e-20, 0.0008927770077167252],
 17: [2.168404344971009e-20, 0.0008927770077167252],
 18: [2.168404344971009e-20, 0.0008927770077167252],
 19

In [54]:
sigs= [norm.pdf(mlist[x],loc= ref_kmer_stats[x][0],scale= ref_kmer_stats[x][1]) for x in range(len(mlist))]
sigs

[8.8060958384114e-08,
 1.1860566341039507e-23,
 2.1801128000751863e-29,
 5.7544155416006714e-05,
 2.3502292855630814e-22,
 1.110401476854239,
 7.19917342872145e-23,
 9.260252871711239e-35,
 3.872325931924208e-33,
 0.001683377763537677,
 6.433655881181558e-11,
 7.522151529896088e-06,
 1.0681585375380214e-06,
 7.560377977297909e-287,
 0.0006836384163948908,
 4.6880779814279395e-09,
 0.00015621709361497262,
 1.007314528567123e-13,
 12.592532776271613,
 35.25603828509611,
 66.49597432697945,
 0.00015189295997002314,
 7.316689438648902e-40,
 6.628341205694424e-13,
 2.144242518520704e-21,
 1.097470772377174e-72,
 0.0005151009308916131,
 2.5950029684751193e-11,
 1.6580057004791445e-05,
 2.1916248583302546e-15,
 2.3345033400980666e-12,
 4.290328928413829e-07,
 1.155178163244679e-06,
 0.11461050497380643,
 1.7222449438525662e-14,
 1.315664458616497e-24,
 8.518302925458253e-64,
 162.46340402413134,
 35.61259347532519,
 5.087652114879966,
 0.13921869848236731,
 5.803636411999808e-23,
 3.331723676

In [26]:
test= [x[0] for x in mutations]
test= np.array(test)
test= test.reshape(48,4)
test= test.reshape(1,np.prod(test.shape))[0]

test

array(['AAA', 'AAA', 'AAA', 'AAC', 'AAC', 'AAC', 'AAG', 'AAG', 'AAG',
       'AAT', 'AAT', 'AAT', 'ACA', 'ACA', 'ACA', 'ACC', 'ACC', 'ACC',
       'ACG', 'ACG', 'ACG', 'ACT', 'ACT', 'ACT', 'AGA', 'AGA', 'AGA',
       'AGC', 'AGC', 'AGC', 'AGG', 'AGG', 'AGG', 'AGT', 'AGT', 'AGT',
       'ATA', 'ATA', 'ATA', 'ATC', 'ATC', 'ATC', 'ATG', 'ATG', 'ATG',
       'ATT', 'ATT', 'ATT', 'CAA', 'CAA', 'CAA', 'CAC', 'CAC', 'CAC',
       'CAG', 'CAG', 'CAG', 'CAT', 'CAT', 'CAT', 'CCA', 'CCA', 'CCA',
       'CCC', 'CCC', 'CCC', 'CCG', 'CCG', 'CCG', 'CCT', 'CCT', 'CCT',
       'CGA', 'CGA', 'CGA', 'CGC', 'CGC', 'CGC', 'CGG', 'CGG', 'CGG',
       'CGT', 'CGT', 'CGT', 'CTA', 'CTA', 'CTA', 'CTC', 'CTC', 'CTC',
       'CTG', 'CTG', 'CTG', 'CTT', 'CTT', 'CTT', 'GAA', 'GAA', 'GAA',
       'GAC', 'GAC', 'GAC', 'GAG', 'GAG', 'GAG', 'GAT', 'GAT', 'GAT',
       'GCA', 'GCA', 'GCA', 'GCC', 'GCC', 'GCC', 'GCG', 'GCG', 'GCG',
       'GCT', 'GCT', 'GCT', 'GGA', 'GGA', 'GGA', 'GGC', 'GGC', 'GGC',
       'GGG', 'GGG',

In [190]:
[test[x] for x in sigs]

['AAC', 'AGT', 'AGC', 'CTA', 'CGT', 'CGT', 'CGC']

In [182]:
muts_id

['AAC', 'AGT', 'AGC', 'CTA', 'CGT', 'CGT', 'CGC']

In [76]:
Nbins= 50
bins= np.linspace(0,1,Nbins)
bins= np.round(bins,4)
bins= [(bins[x-1],bins[x]) for x in range(1,len(bins))]


lab_dict= {
    z: [x for x in range(sub_values.shape[0]) if sub_label[x] == z] for z in list(set(sub_label))
}

## bins
lab_prop= {
    lab: {
        sum(bi)/2: [x for x in lab_dict[lab] if sub_prop[x] >= bi[0] and sub_prop[x] < bi[1]] for bi in bins
    } for lab in lab_dict.keys()
}


sig_dict= {
    lab: {
        prop: [sig_scores[x] for x in lab_prop[lab][prop]] for prop in lab_prop[lab].keys()
    } for lab in lab_prop.keys()
}


sig_std= {
    lab: {
        prop: np.std(sig_dict[lab][prop]) for prop in sig_dict[lab].keys()
    } for lab in sig_dict.keys()
}


sig_dict= {
    lab: {
        prop: np.mean(sig_dict[lab][prop]) for prop in sig_dict[lab].keys()
    } for lab in sig_dict.keys()
}

In [77]:

fig= [go.Scatter(
    y= [sig_dict[lab][prop] for prop in sorted(sig_dict[lab].keys())],
    x= sorted(sig_dict[lab].keys()),
    error_y= dict(
        array= [sig_std[lab][prop] for prop in sorted(sig_dict[lab].keys())],
        type= 'data',
        #symmetric= True,
        visible=False
    ),
    mode= 'lines',
    name= lab.split('.')[0] + '_{}'.format(mat_lines[lab])
    
) for lab in sig_dict.keys()]

layout= go.Layout(
    xaxis= dict(
        range= [-0.01,1.03],
    ),
    yaxis= dict(
        range= [0.5,1.02]
    )
)

Figure= go.Figure(data= fig,layout= layout)
iplot(Figure)
