In [1]:
import sys
import argparse
import numpy as np
from scipy.stats import chi2_contingency
from itertools import product
import itertools as it

import allel
import pandas as pd

from _plotly_future_ import v4_subplots
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
    
from datetime import datetime
import tempfile
import os
import gzip
import subprocess
import time

import collections
def recursively_default_dict():
    '''Return a defaultdict whose missing keys create further defaultdicts of the same kind.

    Allows assignment along arbitrarily deep key paths (d[a][b][c]= value)
    without building the intermediate levels by hand.
    '''
    nested_factory= recursively_default_dict
    return collections.defaultdict(nested_factory)


In [2]:
# Load the mutation-matrix text file as a list of raw lines (newline characters kept).
# NOTE(review): absolute, machine-specific path - confirm it exists before re-running.
filetest= '/mnt/d/SLiM/Rmut_var/mut_matrices/mut_matrix_v0.txt'
with open(filetest,'r') as fp:
    lines= list(fp)
    

In [3]:
# Show the raw lines read from the mutation-matrix file.
lines

['AAA\t0,3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'AAT\t0,3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'AAC\t0,3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'AAG\t0,3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'ATA\t3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09,0\n',
 'ATT\t3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09,0\n',
 'ATC\t3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09,0\n',
 'ATG\t3.3333333333333334e-09,3.3333333333333334e-09,3.3333333333333334e-09,0\n',
 'ACA\t3.3333333333333334e-09,0,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'ACT\t3.3333333333333334e-09,0,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'ACC\t3.3333333333333334e-09,0,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'ACG\t3.3333333333333334e-09,0,3.3333333333333334e-09,3.3333333333333334e-09\n',
 'AGA\t3.3333333

In [6]:
from tools.mcounter_tools import (
    read_vcf_allel, ind_assignment_scatter_v1, MC_sample_matrix_v1,
    heatmap_v2, ind_assignment_SFS, read_windows_SFS
)

In [None]:






def mcounter_deploy(data,p_value= 1e-5, test_m= 'fisher', individually= False,
                            exclude= False, frequency_range= [0,1], data_freqs= {}, extract= 'pval',
                            muted_dir= '', tag_ref= '_ss'):
    '''
    Compare every reference population against the subset populations derived from it.

    Parse data dictionary.
        data: {sim: {counts:{pop:g}, Nvars:{pop:g}, sizes:{pop:g}}}
    i: use sim and pop IDs to create dictionary connecting original populations to
    subset populations created using ind_assignment_scatter_v1.
    ii: for each pair of reference/subset populations, launch heatmap_v2 and store the
    grid and significant cells it returns, together with the ratio and the difference
    of the two count grids after each is normalised to sum to 1.

    Parameters
        data: dictionary described above. Simulation IDs containing tag_ref are treated
            as subset simulations; the text before the tag is the reference simulation ID.
        p_value: forwarded to heatmap_v2.
        test_m: forwarded to heatmap_v2 as `test`.
        individually: accepted but not used by this function.
        exclude: forwarded to heatmap_v2.
        frequency_range: forwarded to heatmap_v2; never modified here.
        data_freqs: optional {sim: {pop: freqs}}. When non-empty, the reference and
            subset entries are stored under 'freqs' (keys 0 and 1) for every comparison.
        extract: accepted but not used; heatmap_v2 is always called with output= 'pval'.
            NOTE(review): possibly meant to be passed as `output` - confirm against heatmap_v2.
        muted_dir: forwarded to heatmap_v2.
        tag_ref: substring marking simulation and population names as subsets.

    Returns
        pop_asso: {ref_sim: {ref_pop: {sub_sim: sub_pop}}}
        count_data: {n: {'grids', 'sigs', 'sizes', 'batch', 'prop', 'pop', 'diffs'[, 'freqs']}}
            with n counting comparisons from 0.

    Raises
        KeyError if a subset simulation points at a reference simulation or population
        that is missing from data.
    '''

    def remove_tag(name):
        '''Drop tag_ref, as a whole substring, from the start and/or end of name.'''
        if tag_ref and name.startswith(tag_ref):
            name= name[len(tag_ref):]
        if tag_ref and name.endswith(tag_ref):
            name= name[:-len(tag_ref)]
        return name

    avail= list(data.keys())
    # simulations whose ID contains tag_ref hold the subsets; the others are references.
    ref_sims= [x for x in avail if tag_ref not in x]
    sub_sims= [x for x in avail if tag_ref in x]

    pop_asso= {x: recursively_default_dict() for x in ref_sims}

    for sub_sim in sub_sims:
        dat= [x for x in data[sub_sim]['counts'].keys() if tag_ref in x]
        ref_sim= sub_sim.split(tag_ref)[0]
        for sub_pop in dat:
            # str.strip(tag_ref) treats the tag as a set of characters and would also
            # remove leading/trailing '_' or 's' that belong to the population name.
            ref_pop= remove_tag(sub_pop.split('.')[0])
            pop_asso[ref_sim][ref_pop][sub_sim]= sub_pop

    d= 0
    count_data= recursively_default_dict()

    for ref in pop_asso.keys():
        # NOTE(review): presumably sim IDs look like <batch>C<chromosome>[...] - confirm.
        batch= ref.split('C')[0]

        for pop in pop_asso[ref].keys():
            for sub, sub_pop in pop_asso[ref][pop].items():

                # reference first, subset second; heatmap_v2 receives them in this order.
                pop_dict= {
                    ref: pop,
                    sub: sub_pop
                }

                sizes= [data[ref]['sizes'][pop], data[sub]['sizes'][sub_pop]]

                chromosomes= [x.split('.')[0].split('C')[1] for x in pop_dict.keys()]

                pop_counts= {
                    x: data[x]['counts'][z] for x,z in pop_dict.items()
                }

                num_variants= {
                    x: data[x]['Nvars'][z] for x,z in pop_dict.items()
                }

                ratio_grid, sig_cells= heatmap_v2(chromosomes,pop_counts,num_variants,
                                                  pop_dict,frequency_range, exclude,
                                                    p_value, muted_dir,tag= '',test= test_m,output= 'pval')

                # turn raw counts into proportions before comparing the two grids.
                pop_counts[sub]= pop_counts[sub] / np.sum(pop_counts[sub])
                pop_counts[ref]= pop_counts[ref] / np.sum(pop_counts[ref])

                dist_prop= pop_counts[sub] / pop_counts[ref]
                grid_diffs= pop_counts[sub] - pop_counts[ref]

                count_data[d]= {
                    'grids': ratio_grid,
                    'sigs': sig_cells,
                    'sizes': sizes,
                    'batch': batch,
                    'prop': dist_prop,
                    'pop': pop,
                    'diffs': grid_diffs
                }

                if data_freqs:
                    count_data[d]['freqs']= {
                        0: data_freqs[ref][pop],
                        1: data_freqs[sub][sub_pop]
                    }

                d += 1

    return pop_asso, count_data


In [None]:
#from tools.SLiM_pipe_tools import mutation_counter_launch
import re
import pandas as pd


## directories
main_dir= os.getcwd() + '/'
sims_dir= main_dir + 'mutation_counter/data/sims_dem/'

## options handed to read_windows_SFS
diffs= False
frequency_range= [0,1]
args_present= False

## load the per-simulation data; one keyword argument per line for readability
data_kmer, data= read_windows_SFS(
    diffs= diffs,
    frequency_range= frequency_range,
    indfile= 'ind_assignments.txt',
    outemp= 'ind_assignments{}.txt',
    sim_dir= sims_dir,
    muted_dir= 'mutation_counter/data/mutation_count/',
    outlog= 'indy.log',
    row= 24,
    col= 4,
    single= True,
    exclude= False,
    args= args_present
)



In [40]:

####
def get_mutations(bases= 'ACGT',ksize= 3):
    '''List every single-base substitution of the middle position of every kmer.

    bases: alphabet; kmers and replacement bases are enumerated in this order.
    ksize: kmer length; the mutated position is int(ksize / 2).

    Returns a list of (kmer string, new base) tuples in which the new base
    differs from the kmer's middle base.
    '''
    pools= [bases] * ksize
    middle= int(ksize / 2)

    return [
        (''.join(kmer), new_base)
        for kmer in product(*pools)
        for new_base in bases
        if kmer[middle] != new_base
    ]


def kmer_comp_index(mutations):
    '''Index mutations so that a mutation and its reverse complement share one index.

    mutations: sequence of (kmer, new base) pairs.

    Returns
        kmers: {mutation string (kmer + new base): index}
        kmer_idx: {index: [mutation strings assigned to that index]}
    Indices are handed out in order of first appearance, starting at 0.
    '''
    kmers= {}
    kmer_idx= {}

    for mutation in mutations:
        # reverse complement of the kmer followed by the complement of the new base
        partner= ''.join(get_complement(mutation[0]) + get_complement(mutation[1]))
        label= ''.join(mutation)

        if partner not in kmers:
            # partner not seen yet: open a new class
            new_idx= len(kmer_idx)
            kmers[label]= new_idx
            kmer_idx[new_idx]= [label]
        else:
            # partner already indexed: join its class
            shared_idx= kmers[partner]
            kmers[label]= shared_idx
            kmer_idx[shared_idx].append(label)

    return kmers, kmer_idx


def kmer_mut_index(mutations):
    '''Build a nested dictionary mapping each mutation to its position in `mutations`.

    Each mutation is joined into one string; its characters, taken in order,
    are the successive keys of the nested dictionary, and the value stored
    under the last character is the mutation's index in the input list.
    '''
    mut_lib= recursively_default_dict()

    for position, mutation in enumerate(mutations):
        path= ''.join(mutation)
        branch= get_by_path(mut_lib, path[:-1])
        branch[path[-1]]= position

    return mut_lib




def get_complement(kmer):
    '''Return the reverse complement of a kmer as a list of single bases.

    Each base is swapped for its pair (A<->T, C<->G) and the order is reversed.
    Raises KeyError for any character other than A, T, C or G.
    '''
    pairing= {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    comp= []
    for base in kmer:
        comp.append(pairing[base])
    comp.reverse()

    return comp

def get_by_path(root, items):
    """Follow the keys in items, one level at a time, starting from root.

    Returns the object reached after the last key; an empty items returns root itself.
    """
    node = root
    for key in items:
        node = node[key]
    return node

def set_by_path(root, items, value):
    """Store value under the last key of items, inside the object reached by the preceding keys."""
    parent_path, leaf_key = items[:-1], items[-1]
    container = get_by_path(root, parent_path)
    container[leaf_key] = value

####

def mutation_dict_full(bases= 'ATCG',ksize= 3):
    '''Enumerate every kmer paired with every base, unchanged bases included.

    Returns
        mut_lib: nested dictionary keyed by the successive characters of each kmer.
            The leaf is overwritten for every base, so it ends up holding the
            last character of `bases`.
            NOTE(review): the overwrite may not be intended - confirm what the leaf should hold.
        mutations: list of (kmer string, base) tuples, len(bases) entries per kmer.
        mut_org: list of kmers as tuples of characters, in enumeration order.
    '''
    mut_lib= recursively_default_dict()
    mut_org= list(product(*[bases] * ksize))
    mutations= []

    for kmer in mut_org:
        for new_base in bases:
            mutations.append((''.join(kmer), new_base))
            get_by_path(mut_lib, kmer[:-1])[kmer[-1]]= new_base

    return mut_lib,mutations,mut_org

In [41]:
from functools import reduce  # forward compatibility for Python 3
import operator

# Alphabet order and kmer size used to enumerate mutations.
bases= 'ATCG'
ksize= 3
# every (kmer, new middle base) substitution
mutations= get_mutations(bases= bases,ksize= ksize)
# mutation -> index, with a mutation and its reverse complement sharing one index
kmers, kmer_idx= kmer_comp_index(mutations)

# nested lookup from the characters of a mutation to its position in `mutations`
mut_lib= kmer_mut_index(mutations)

In [42]:
# Full table: every kmer paired with every base, the unchanged middle base included.
mutations_full_dict, mutations_full_list, mut_org= mutation_dict_full(bases= bases,ksize= ksize)
len(mutations_full_list)

256

In [45]:
# Write a tab-separated rate table, one line per kmer in mut_org:
#   <kmer>\t<rate to bases_slim[0]>,<rate to bases_slim[1]>,...
# The kmer's own middle base gets rate 0; the total rate mu is split evenly
# over the remaining bases.
text_diff= 'mut_matrix_v0.txt'

mu= 1e-8
bases_slim= 'ACGT'

# per-target rate, derived from the alphabet the columns are written for
rate= mu / (len(bases_slim) - 1)

with open(text_diff,'w') as fp:
    for mut in mut_org:
        # middle position taken from the kmer itself instead of a fixed index 1
        centre= mut[len(mut) // 2]
        # one column per base of bases_slim, so the vector is sized by that alphabet
        mut_vector= [0 if bass == centre else rate for bass in bases_slim]

        kmer_label= ''.join(mut)
        trans= ','.join([str(x) for x in mut_vector])

        fp.write('\t'.join([kmer_label,trans]) + '\n')



In [9]:
# First mutation of each complement-collapsed class, in index order,
# laid out as a 24 x 4 grid (requires exactly 96 classes).
t= [kmer_idx[x][0] for x in sorted(kmer_idx.keys())]
grid= np.array(t).reshape(24,4)
grid

array([['AAAT', 'AAAC', 'AAAG', 'AATT'],
       ['AATC', 'AATG', 'AACT', 'AACC'],
       ['AACG', 'AAGT', 'AAGC', 'AAGG'],
       ['ATAA', 'ATAC', 'ATAG', 'ATCA'],
       ['ATCC', 'ATCG', 'ATGA', 'ATGC'],
       ['ATGG', 'ACAA', 'ACAT', 'ACAG'],
       ['ACTA', 'ACTT', 'ACTG', 'ACCA'],
       ['ACCT', 'ACCG', 'ACGA', 'ACGT'],
       ['ACGG', 'AGAA', 'AGAT', 'AGAC'],
       ['AGCA', 'AGCT', 'AGCC', 'AGGA'],
       ['AGGT', 'AGGC', 'TAAT', 'TAAC'],
       ['TAAG', 'TACT', 'TACC', 'TACG'],
       ['TAGT', 'TAGC', 'TAGG', 'TTCA'],
       ['TTCC', 'TTCG', 'TTGA', 'TTGC'],
       ['TTGG', 'TCAA', 'TCAT', 'TCAG'],
       ['TCCA', 'TCCT', 'TCCG', 'TCGA'],
       ['TCGT', 'TCGG', 'TGCA', 'TGCT'],
       ['TGCC', 'TGGA', 'TGGT', 'TGGC'],
       ['CACT', 'CACC', 'CACG', 'CAGT'],
       ['CAGC', 'CAGG', 'CTCA', 'CTCC'],
       ['CTCG', 'CCCA', 'CCCT', 'CCCG'],
       ['CCGA', 'CCGT', 'CCGG', 'CGCA'],
       ['CGCT', 'CGCC', 'GACT', 'GACC'],
       ['GACG', 'GCCA', 'GCCT', 'GCCG']], dtype='<U4')

In [10]:
# Check the grid dimensions.
grid.shape

(24, 4)

In [34]:
# Flatten the grid row by row into a 1-D array of mutation labels.
grid_to_list= grid.reshape(-1)
grid_to_list

array(['AAAT', 'AAAC', 'AAAG', 'AATT', 'AATC', 'AATG', 'AACT', 'AACC',
       'AACG', 'AAGT', 'AAGC', 'AAGG', 'ATAA', 'ATAC', 'ATAG', 'ATCA',
       'ATCC', 'ATCG', 'ATGA', 'ATGC', 'ATGG', 'ACAA', 'ACAT', 'ACAG',
       'ACTA', 'ACTT', 'ACTG', 'ACCA', 'ACCT', 'ACCG', 'ACGA', 'ACGT',
       'ACGG', 'AGAA', 'AGAT', 'AGAC', 'AGCA', 'AGCT', 'AGCC', 'AGGA',
       'AGGT', 'AGGC', 'TAAT', 'TAAC', 'TAAG', 'TACT', 'TACC', 'TACG',
       'TAGT', 'TAGC', 'TAGG', 'TTCA', 'TTCC', 'TTCG', 'TTGA', 'TTGC',
       'TTGG', 'TCAA', 'TCAT', 'TCAG', 'TCCA', 'TCCT', 'TCCG', 'TCGA',
       'TCGT', 'TCGG', 'TGCA', 'TGCT', 'TGCC', 'TGGA', 'TGGT', 'TGGC',
       'CACT', 'CACC', 'CACG', 'CAGT', 'CAGC', 'CAGG', 'CTCA', 'CTCC',
       'CTCG', 'CCCA', 'CCCT', 'CCCG', 'CCGA', 'CCGT', 'CCGG', 'CGCA',
       'CGCT', 'CGCC', 'GACT', 'GACC', 'GACG', 'GCCA', 'GCCT', 'GCCG'],
      dtype='<U4')

In [36]:
# Spot-check: label stored at flat position 21.
grid_to_list[21]

'ACAA'

In [23]:
# Inspect the index -> [mutation, reverse-complement mutation] mapping.
kmer_idx

{0: ['AAAT', 'TTTA'],
 1: ['AAAC', 'TTTG'],
 2: ['AAAG', 'TTTC'],
 3: ['AATT', 'ATTA'],
 4: ['AATC', 'ATTG'],
 5: ['AATG', 'ATTC'],
 6: ['AACT', 'GTTA'],
 7: ['AACC', 'GTTG'],
 8: ['AACG', 'GTTC'],
 9: ['AAGT', 'CTTA'],
 10: ['AAGC', 'CTTG'],
 11: ['AAGG', 'CTTC'],
 12: ['ATAA', 'TATT'],
 13: ['ATAC', 'TATG'],
 14: ['ATAG', 'TATC'],
 15: ['ATCA', 'GATT'],
 16: ['ATCC', 'GATG'],
 17: ['ATCG', 'GATC'],
 18: ['ATGA', 'CATT'],
 19: ['ATGC', 'CATG'],
 20: ['ATGG', 'CATC'],
 21: ['ACAA', 'TGTT'],
 22: ['ACAT', 'TGTA'],
 23: ['ACAG', 'TGTC'],
 24: ['ACTA', 'AGTT'],
 25: ['ACTT', 'AGTA'],
 26: ['ACTG', 'AGTC'],
 27: ['ACCA', 'GGTT'],
 28: ['ACCT', 'GGTA'],
 29: ['ACCG', 'GGTC'],
 30: ['ACGA', 'CGTT'],
 31: ['ACGT', 'CGTA'],
 32: ['ACGG', 'CGTC'],
 33: ['AGAA', 'TCTT'],
 34: ['AGAT', 'TCTA'],
 35: ['AGAC', 'TCTG'],
 36: ['AGCA', 'GCTT'],
 37: ['AGCT', 'GCTA'],
 38: ['AGCC', 'GCTG'],
 39: ['AGGA', 'CCTT'],
 40: ['AGGT', 'CCTA'],
 41: ['AGGC', 'CCTG'],
 42: ['TAAT', 'TTAA'],
 43: ['TAAC', 'TTAG']

71