In [22]:
## Matrix multiplication.

import numpy as np
import itertools as it
import pandas as pd
import time
import gzip

from tools.fasta_utilities import (
    geno_muts_v2, get_mutations, get_by_path, vcf_muts_matrix,
    kmer_comp_index, kmer_mut_index
    )



def read_vcf_allel(file_vcf,haps_extract= False,calldata= 'calldata/GT'):
    '''
    Use scikit allel to read vcf file. Organise variant information into summary pandas df. 
    '''
    geno1= []

    vcf_ori= allel.read_vcf(file_vcf)

    if not vcf_ori:
        print('file:')
        print(file_vcf)
        print('is empty.')

        return {}, {}, {}

    ### get genotype array
    geno= vcf_ori[calldata]

    mult_alt= []
    indel= []
    single= []

    ## Filter SNPs. append to single list what to 
    for idx in range(geno.shape[0]):
        ## eliminate +1 segregating mutations.
        if vcf_ori['variants/ALT'][idx][1]:
            gen_t= geno[idx]
            gen_t[gen_t > 1] = 0
            geno[idx]= gen_t
            ## or just jump them
            indel.append(idx)

        elif len(vcf_ori['variants/REF'][idx]) != 1 or len(vcf_ori['variants/ALT'][idx][0]) != 1:
            indel.append(idx)
        else:
            single.append(idx)

    if haps_extract:
        geno1= geno[:,:,0].T
        geno= geno[:,:,1].T
        geno= np.concatenate((geno,geno1),axis= 0)
    else:
        geno= allel.GenotypeArray(geno)
        geno= geno.to_n_alt().T

    ## setup summary

    column_names= ['CHROM','POS','ID','REF','ALT','QUAL','FILTER']

    alts= [vcf_ori['variants/ALT'][x][0] for x in range(geno.shape[1])]
    PASS= [['.','PASS'][int(vcf_ori['variants/FILTER_PASS'][x])] for x in range(geno.shape[1])]

    summary= [
        vcf_ori['variants/CHROM'],
        vcf_ori['variants/POS'],
        vcf_ori['variants/ID'],
        vcf_ori['variants/REF'],
        alts,
        vcf_ori['variants/QUAL'],
        PASS,

    ]

    summary= np.array(summary).T

    if len(indel):
        #
        geno= geno[:,single]
        if len(geno1):
            geno1= geno1[:,single]
        summary= summary[single,:]

    summary= pd.DataFrame(summary,columns= column_names)
    
    return geno, summary, vcf_ori['samples']



def vcf_muts_matrix_v1(refseq,summary,start= 0,end= 0,ksize= 3,bases='ACGT', collapse= True):
    ''' 
    Return matrix of mutation contexts by SNP in genotype array
    Each mutation is mapped to list of possible mutations as a binary vector.
    - v1 determines if alternative allele = reference allele in fasta. 
        if so, allele is switched, position idx is flagged. 
    '''
    
    mutations= get_mutations(bases= bases,ksize= ksize)
    kmers, kmer_idx= kmer_comp_index(mutations)
    
    mut_lib= kmer_mut_index(mutations)
    
    if end == 0:
        end= max(summary.POS)
    
    k5= int(ksize/2)
    k3= ksize - k5
    pos_mut= []
    flag_reverse= []
    flag_remove= []
    
    for x in range(summary.shape[0]):
        pos= int(summary.POS[x]) - 1
        if pos >=  start and pos <= end:
            kmer= refseq[pos-k5: pos + k3]
            if 'N' in kmer:
                flag_remove.append(x)
                continue
            mut= kmer + summary.ALT[x]
            
            if kmer[1] == summary.ALT[x]:
                flag_reverse.append(x)
                mut= kmer+summary.REF[x]
            
            if len(mut) != 4: 
                print(kmer)
                print(summary.REF[x],summary.ALT[x])
                print(x,pos)
                print(len(refseq),summary.shape[0])
                if collapse:
                    mut_array=np.zeros(len(kmer_idx))
                    pos_mut.append(mut_array)
                    continue
                else:
                    mut_array=np.zeros(len(mutations))
                    pos_mut.append(mut_array)
                    continue
            if collapse:
                mut_index= kmers[mut]
                mut_array=np.zeros(len(kmer_idx))
            else:
                mut_index= get_by_path(mut_lib, list(mut))
                mut_array=np.zeros(len(mutations))
            
            mut_array[mut_index]= 1
            pos_mut.append(mut_array)
    
    pos_mut= np.array(pos_mut).T
    
    return pos_mut, flag_reverse, flag_remove


def ind_assignment_scatter_v1(reference,dir_sim= '',indfile= 'ind_assignments.txt', haps_extract= False,
                          min_size= 80, samp= [5,20,10], stepup= "increment",outemp= 'ind_assignments{}.txt',write_out= False):
    '''
    read ind assignments for a given window; 
    chose one population;
    subset that pop in some way.
    - v1: instead of writting new pop_assignment files, return them. 
    '''
    
    ind_assignments= dir_sim + reference + '/' + indfile
    
    with open(ind_assignments,'r') as f:
        inds= f.readlines()
    
    inds= [x.split() for x in inds]
    pops= np.array(inds)[:,1]
    pop_dict= {
        z: [x for x in range(len(pops)) if pops[x] == z] for z in list(set(pops))
    }
    total_N= sum([len(x) for x in pop_dict.values()])

    if haps_extract:
        pop_dict= {
            z: g + [x + total_N for x in g] for z,g in pop_dict.items()
        }
    
    tag_list= []
    tag_dict= {}
    
    ## criterium of choice. chose only one pop.
    pop_avail= [x for x in pop_dict.keys() if len(pop_dict[x]) >= min_size]
    for pop_chose in pop_avail:
        
        N= len(pop_dict[pop_chose])
        pop_list= pop_dict[pop_chose]

        if stepup== 'increment':
            timetable= np.linspace(2,samp[0],samp[1])
        else:
            timetable= np.linspace(samp[0],N,samp[1])

        for each in timetable:  
            each= int(each)
            for perm in range(samp[2]):
                tag= '_ss' + '.'.join([pop_chose,str(each),str(perm)])
                
                smaller= np.random.choice(pop_list,each,replace= False)
                smaller= [int(x in smaller) for x in pop_list]
                
                new_pop= {
                    tag + '.s' + str(z): [pop_list[x] for x in range(len(smaller)) if smaller[x] == z] for z in [1]
                }
                
                #new_dict= {v:g for v,g in pop_dict.items() if v != pop_chose}
                #new_dict.update(new_pop)
                new_dict= new_pop

                if write_out:
                    dict_write(new_dict,inds,outemp= outemp, dir_sim= dir_sim, tag= tag)
                else:
                    tag_dict[tag]= new_dict
                tag_list.append(tag)

    if write_out:
        return tag_list
    else: 
        return tag_list, tag_dict, pop_dict



In [135]:
import allel

chrom= "1"
ploidy= 2
row_info= 6
header_info= 9
ksize= 3 # odd.
bases = 'ACGT'
collapsed= True

scale_genSize= False
diffs= False
haps_extract= True

sim_dir= 'D:/GitHub/fine-scale-mutation-spectrum-master/slim_pipe/mutation_counter/data/gravel_1m_1Ksamp/'
#sim= "gravel_1mC1.106064802"
sim= "tests"
vcf_dir= sim_dir + sim + '/'
vcf_file= vcf_dir + sim + '_' + 'chr' + chrom + '.vcf.gz'


### read vcf file
t0= time.time()
genotype, summary, Names= read_vcf_allel(vcf_file,haps_extract= haps_extract)


In [136]:
genotype.shape

(10, 9)

In [137]:
chrom= '1'
min_size= 1
samp= [2,1,1]
row= [64,32][int(collapsed)]
col= 3
stepup= 'prop'
outemp= './'
indfile= 'ind_assignments.txt'
single= True
prop_gen_used= 1
scale= 1
frequency_range= [0,1]
segregating= True

In [138]:

t1= time.time()

read_time= t1- t0

print(genotype.shape)

## read fasta
fasta_file= vcf_dir + 'chr{}_{}.fa.gz'.format(chrom,sim)

with gzip.open(fasta_file,'r') as f:
    lines= f.readlines()
    lines= [x.decode() for x in lines]

refseq= lines[1].strip()

### subset genotype if need be (not used here)
positions= [int(x) for x in summary.POS]
wstart= int(min(positions))-1
wend= int(max(positions))

Wlen= wend - wstart

genotype_parse= [x for x in range(summary.shape[0]) if int(summary.POS[x])-1 >= wstart and int(summary.POS[x])-1 <= wend]
Window= genotype[:,genotype_parse]
subset_summary= summary.loc[genotype_parse,:].reset_index()
## Create mutation type by SNP matrix, also identify positions to ignore. 
t0= time.time()
mut_matrix, flag_reverse, flag_remove= vcf_muts_matrix_v1(refseq,subset_summary,start= wstart,end= wend,ksize= ksize,
                                                    bases=bases, collapse= collapsed)

## remove positions if mutation type is not possible to ascertain;
## usually Ns, but sometimes if the SNP is found in the very last position
## of the fasta, no 5'.
retain= [x for x in range(Window.shape[1]) if x not in flag_remove]
Window= Window[:,retain]
subset_summary= subset_summary.loc[retain,:].reset_index()

t1= time.time()
time_mut= t1 - t0

tag_list, tag_dict, pop_dict= ind_assignment_scatter_v1(sim,dir_sim= sim_dir, haps_extract= haps_extract,
                  min_size= min_size, samp= samp, stepup= stepup, outemp= outemp,indfile= indfile)
#
print(pop_dict.keys())
total_inds= sum([len(x) for x in pop_dict.values()])


(10, 9)
dict_keys(['pop1', 'pop2'])


In [139]:
mut_matrix.shape

(96, 9)

In [140]:
Window.shape

(10, 9)

In [141]:
Window

array([[0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 1, 1, 0, 1, 1]], dtype=int8)

In [142]:


def count_popKmers(Window, mut_matrix, pop_dict, single= True, frequency_range= [0,1],row=32,col=3,segregating= False,
    scale= 1, prop_gen_used= 1, return_private= False,PA= {},pop_tag= '_ss'):
    '''
    Extract population mutation counts from _ind x kmer_ mutation matrix. 
    '''
    pop_counts= {}
    num_variants= {}
    pop_seg= {}
    PA_dict= {}

    pop_list= list(pop_dict.keys())
    
    for pop in pop_list:
        pop_ori= pop
        if pop_tag in pop:
            pop_ori= pop[len(pop_tag):].split('.')[0]
        pop_gen= Window[pop_dict[pop],:]

        freqs= np.sum(pop_gen,axis= 0) / pop_gen.shape[0]
        ## discount alleles outside freq range.
        in_out= (freqs <= frequency_range[0]) | (freqs >= frequency_range[1])
        print('in_out')
        print(freqs)
        print(in_out)

        if PA: 
            shared= [x for x in range(pop_gen.shape[1]) if PA[pop_ori][x] == 0]
            pop_gen[:,shared] = 0
        
        if single: 
            pop_gen= np.sum(pop_gen,axis= 0) > 0
            pop_gen= np.array(pop_gen,dtype= int).reshape(1,len(pop_gen))
        
        pop_seg[pop]= pop_gen * scale * prop_gen_used
        pop_gen[:,in_out]= 0
        print(pop_gen)
        pop_collapsed_mat= geno_muts_v2(pop_gen, mut_matrix)
        pop_summed= np.sum(pop_collapsed_mat,axis= 0)
        
        pop_counts[pop]= pop_summed.reshape(row,col) * scale * prop_gen_used
        
        num_variants[pop]= np.sum(pop_collapsed_mat) * scale * prop_gen_used

    pop_summary= {
        'counts': pop_counts,
        'Nvars': num_variants,
        'sizes': {z:len(g) for z,g in pop_dict.items()}
    }

    if segregating:
        pop_summary['seg']= pop_seg

    if return_private:
        pop_array= [pop_seg[x] for x in pop_list]
        pop_array= np.array(pop_array)
        pop_sum= np.sum(pop_array,axis= 0)[0]

        PA_dict= {z: np.sum(pop_seg[z],axis= 0) for z in pop_list}

        PA_dict= {z: [int(g[x] == pop_sum[x]) for x in range(len(g))] for z,g in PA_dict.items()}


    return pop_summary, PA_dict


In [143]:
return_private= True
data_kmer= {}
pop_summary, PA_dict= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                          frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                          return_private= return_private)

data_kmer[sim]= pop_summary

if return_private: 
    pop_summary, dummy= count_popKmers(Window, mut_matrix, pop_dict, single= single, prop_gen_used= prop_gen_used,
                              frequency_range= frequency_range,row=row,col=col,segregating= segregating,scale= scale,
                              PA= PA_dict)
    data_kmer[sim]= pop_summary


[[0 0 0 0 0 0 1 1 1]]
[[0 0 1 0 0 1 0 0 1]]
[[0 0 0 0 0 0 1 0 0]]
[[0 0 1 0 0 0 0 0 0]]


In [122]:
Window[pop_dict['pop1']]

array([[0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0]], dtype=int8)

In [123]:
Window[pop_dict["pop2"]]

array([[0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 1, 1, 0, 1, 1, 0, 1, 1],
       [0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 0, 1, 1, 0, 1, 1]], dtype=int8)

In [146]:
mut_matrix.shape

(96, 9)

In [145]:
for idx in range(mut_matrix.shape[0]):
    print(kmer_idx[idx])

['AAAC', 'TTTG']
['AAAG', 'TTTC']
['AAAT', 'TTTA']
['AACC', 'GTTG']
['AACG', 'GTTC']
['AACT', 'GTTA']
['AAGC', 'CTTG']
['AAGG', 'CTTC']
['AAGT', 'CTTA']
['AATC', 'ATTG']
['AATG', 'ATTC']
['AATT', 'ATTA']
['ACAA', 'TGTT']
['ACAG', 'TGTC']
['ACAT', 'TGTA']
['ACCA', 'GGTT']
['ACCG', 'GGTC']
['ACCT', 'GGTA']
['ACGA', 'CGTT']
['ACGG', 'CGTC']
['ACGT', 'CGTA']
['ACTA', 'AGTT']
['ACTG', 'AGTC']
['ACTT', 'AGTA']
['AGAA', 'TCTT']
['AGAC', 'TCTG']
['AGAT', 'TCTA']
['AGCA', 'GCTT']
['AGCC', 'GCTG']
['AGCT', 'GCTA']
['AGGA', 'CCTT']
['AGGC', 'CCTG']
['AGGT', 'CCTA']
['ATAA', 'TATT']
['ATAC', 'TATG']
['ATAG', 'TATC']
['ATCA', 'GATT']
['ATCC', 'GATG']
['ATCG', 'GATC']
['ATGA', 'CATT']
['ATGC', 'CATG']
['ATGG', 'CATC']
['CAAC', 'TTGG']
['CAAG', 'TTGC']
['CAAT', 'TTGA']
['CACC', 'GTGG']
['CACG', 'GTGC']
['CACT', 'GTGA']
['CAGC', 'CTGG']
['CAGG', 'CTGC']
['CAGT', 'CTGA']
['CCAA', 'TGGT']
['CCAG', 'TGGC']
['CCAT', 'TGGA']
['CCCA', 'GGGT']
['CCCG', 'GGGC']
['CCCT', 'GGGA']
['CCGA', 'CGGT']
['CCGG', 'CGGC

In [130]:
mutations= get_mutations(bases= bases,ksize= ksize)
kmers, kmer_idx= kmer_comp_index(mutations)

mut_lib= kmer_mut_index(mutations)


In [134]:
kmer_idx.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95])

In [32]:
## Mutation-type by SNP matrix
## 192 types of mutation; 9118 SNPs

mut_matrix.shape

(192, 9118)

In [34]:
## genotype array:
Window.shape

(3000, 9118)

### Example

In [39]:
genex= []
mutex= mut_matrix[:,:Nsnps]

genex

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int8)

In [40]:
sum(Window)

array([ 19,   3, 119, ...,  11,   1,   2], dtype=int8)