# Toward a K-mer counting package

In [1]:
import numpy as np
class KmerCounter():
    NUCLEOTIDE_BITS={'A':0, 'C':1, 'G':2, 'T':3}
    BITS_PER_NUCLEOTIDE=2
    COUNT_TYPE=np.int8
    MAX_COUNT = 2**8-1
    def __init__(self,K=4):
        self.setK(K)
    def setK(self,K):
        self.K=4
        self.VOCABULARY_SIZE = 4**K
    def hash_value(self,token):
        # TO DO: CACHE BITS FROM PREVIOUS K-1 LETTERS 
        kmer_hash=0
        if len(token)!=self.K:
            raise Exception('Given token not of length K='+str(self.K))
        letters = list(token)
        for letter in letters:
            try:
                additional = KmerCounter.NUCLEOTIDE_BITS[letter]
                kmer_hash = kmer_hash << KmerCounter.BITS_PER_NUCLEOTIDE
                kmer_hash = kmer_hash + additional
            except KeyError:
                # Ignore tokens with N or any non-nucleotide
                return None
        return kmer_hash
    def seq_to_kmer_counts(self,seq):
        counts = np.zeros(
            self.VOCABULARY_SIZE, KmerCounter.COUNT_TYPE)
        for p in range(len(seq)-self.K+1):
            token=seq[p:p+self.K]
            hash_value = self.hash_value(token)
            if hash_value is not None:
                if counts[hash_value]<KmerCounter.MAX_COUNT:
                    counts[hash_value] += 1
        return counts

In [2]:
import unittest
class Test4mers(unittest.TestCase):
    def setUp(self):
        self.counter = KmerCounter(4)
    def test_4A(self):
        seq='AAAA'
        counts = self.counter.seq_to_kmer_counts(seq)
        self.assertEqual(counts[0],1)
        self.assertEqual(counts.sum(),1)
    def test_8A(self):
        seq='AAAAAAAA'
        counts = self.counter.seq_to_kmer_counts(seq)
        self.assertEqual(counts[0],5)
        self.assertEqual(counts.sum(),5)
    def test_middleN(self):
        seq='AAAANAAAA'
        counts = self.counter.seq_to_kmer_counts(seq)
        self.assertEqual(counts[0],2)
        self.assertEqual(counts.sum(),2)
    def test_middleN(self):
        seq='ACGTACGT'
        counts = self.counter.seq_to_kmer_counts(seq)
        self.assertEqual(counts[0],0)      # no AAAA
        self.assertEqual(counts.max(),2)   # ACTG twice     
        self.assertEqual(counts.sum(),5)   # all kmers counted
    def test_hash_homopolymers(self):
        c = self.counter
        self.assertEqual(c.hash_value('AAAA'),int("00000000",2))
        self.assertEqual(c.hash_value('CCCC'),int("01010101",2))
        self.assertEqual(c.hash_value('GGGG'),int("10101010",2))
        self.assertEqual(c.hash_value('TTTT'),int("11111111",2))  
    def test_hash_combos(self):
        c = self.counter
        self.assertEqual(c.hash_value('ACGT'),int("00"+"01"+"10"+"11",2))
        self.assertEqual(c.hash_value('TCGA'),int("11"+"01"+"10"+"00",2))
        self.assertEqual(c.hash_value('TATG'),int("11"+"00"+"11"+"10",2))
        self.assertEqual(c.hash_value('CTGG'),int("01"+"11"+"10"+"10",2))


In [3]:
unittest.main(argv=[''], verbosity=0, exit=False)

----------------------------------------------------------------------
Ran 5 tests in 0.001s

OK


<unittest.main.TestProgram at 0x109a538e0>