In [1]:
from collections import defaultdict
from sequence import *
from scipy.stats import fisher_exact

In [2]:
def kmer_counter(filtered_seqs, k):
    '''
    Tally every length-k window found in the sequences of filtered_seqs.

    A window is taken at every start offset, so overlapping occurrences
    are counted, and the counts are pooled over all sequences.

    Returns a dict mapping each observed k-mer (the slice seq[i:i+k]) to
    its total number of occurrences. For positive k, a sequence shorter
    than k contributes nothing.
    '''
    tally = {}
    for record in filtered_seqs:
        last_start = len(record) - k
        for start in range(last_start + 1):
            window = record[start:start + k]
            if window in tally:
                tally[window] += 1
            else:
                tally[window] = 1

    return tally


def positve_kmer_counts(ec_num, k, base_dir="/home/seb-porras/expat_bench/workflows"):
    """
    Count k-mers in the filtered FASTA file of one EC number.

    Reads "{base_dir}/{ec_num}/files/{ec_num}_filt.fasta" with readFastaFile
    (presumably supplied by the star import from `sequence` -- confirm) and
    passes the loaded sequences to kmer_counter.

    Parameters
    ----------
    ec_num : str
        Identifier used for both the directory and the file name,
        e.g. '3_6_5_2'.
    k : int
        k-mer length forwarded to kmer_counter.
    base_dir : str, optional
        Root directory of the workflow folders. Defaults to the location
        that was previously hard-coded, so existing calls behave the same;
        pass another directory to run on a different machine.

    Returns
    -------
    tuple
        (counts, num_seqs): the k-mer count dict from kmer_counter and the
        number of sequences read from the file.
    """
    fasta_path = f"{base_dir}/{ec_num}/files/{ec_num}_filt.fasta"
    filtered_seqs = readFastaFile(fasta_path)

    num_seqs = len(filtered_seqs)

    counts = kmer_counter(filtered_seqs, k)

    return counts, num_seqs

def negative_kmer_counts(ec_num, k, base_dir="/home/seb-porras/expat_bench/workflows"):
    """
    Count k-mers in the unfiltered FASTA file of one EC number.

    Reads "{base_dir}/{ec_num}/files/{ec_num}.fasta" with readFastaFile
    (presumably supplied by the star import from `sequence` -- confirm) and
    passes the loaded sequences to kmer_counter.

    NOTE(review): if the unfiltered file also contains the sequences that
    end up in the filtered file, the "negative" set overlaps the positive
    one -- confirm this is the intended background.

    Parameters
    ----------
    ec_num : str
        Identifier used for both the directory and the file name,
        e.g. '3_6_5_2'.
    k : int
        k-mer length forwarded to kmer_counter.
    base_dir : str, optional
        Root directory of the workflow folders. Defaults to the location
        that was previously hard-coded, so existing calls behave the same;
        pass another directory to run on a different machine.

    Returns
    -------
    tuple
        (counts, num_seqs): the k-mer count dict from kmer_counter and the
        number of sequences read from the file.
    """
    fasta_path = f"{base_dir}/{ec_num}/files/{ec_num}.fasta"
    unfiltered_seqs = readFastaFile(fasta_path)

    num_seqs = len(unfiltered_seqs)

    counts = kmer_counter(unfiltered_seqs, k)

    return counts, num_seqs

In [28]:
# Build 5-mer count tables for EC 3_6_5_2: `positive` comes from the filtered
# FASTA file, `negative` from the unfiltered one. Each call also returns the
# number of sequences read. get_pvals reads `negative`, `pos_length` and
# `neg_length` as module-level globals, so this cell must run before it is called.
positive, pos_length = positve_kmer_counts('3_6_5_2', 5)
negative, neg_length = negative_kmer_counts('3_6_5_2', 5)


In [25]:
def get_pvals(positive_list, negative_counts=None, pos_total=None, neg_total=None):
    """
    Run Fisher's exact test for every k-mer in the positive count table.

    For each k-mer the 2x2 table passed to fisher_exact is

        [[pos_count, pos_total - pos_count],
         [neg_count, neg_total - neg_count]]

    where neg_count is 0 when the k-mer does not occur in the negative table.

    NOTE(review): the counts produced by kmer_counter are total k-mer
    occurrences, while the totals produced alongside them are numbers of
    sequences. If a count exceeds its total the table has a negative cell,
    which fisher_exact is expected to reject with ValueError. Confirm which
    unit the table should use (sequences containing the k-mer vs. total
    k-mer occurrences) and pass matching totals.

    Parameters
    ----------
    positive_list : dict
        Mapping of k-mer -> count in the positive set (a dict despite the name).
    negative_counts : dict, optional
        Mapping of k-mer -> count in the negative set. Defaults to the
        module-level `negative`.
    pos_total : int, optional
        Total for the positive row. Defaults to the module-level `pos_length`.
    neg_total : int, optional
        Total for the negative row. Defaults to the module-level `neg_length`.

    Returns
    -------
    dict
        Mapping of k-mer -> p-value, in the iteration order of positive_list.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if negative_counts is None:
        negative_counts = negative
    if pos_total is None:
        pos_total = pos_length
    if neg_total is None:
        neg_total = neg_length

    kmer_pvals = {}

    for kmer, pos_count in positive_list.items():
        # A k-mer missing from the negative table has simply never been seen there.
        neg_count = negative_counts.get(kmer, 0)

        table = [
            [pos_count, pos_total - pos_count],
            [neg_count, neg_total - neg_count],
        ]
        _, pvalue = fisher_exact(table)

        kmer_pvals[kmer] = pvalue

    return kmer_pvals

def get_signif_pvals(positive_list, alpha):
    """
    Return the k-mers whose p-value from get_pvals is strictly below alpha.

    The result is a dict of k-mer -> p-value whose entries are inserted in
    ascending p-value order. No multiple-testing correction is applied.
    """
    ranked = sorted(get_pvals(positive_list).items(), key=lambda item: item[1])

    return {kmer: pval for kmer, pval in ranked if pval < alpha}


In [31]:
# Keep only the k-mers from `positive` whose p-value is below 0.05.
# The output recorded for this cell is an empty dict, i.e. no k-mer
# passed the threshold in that run.
test = get_signif_pvals(positive, 0.05)

print(test)

{}
