# K-mer 
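Basic K-mer counting: a naive algorithm and a faster "Harvester" algorithm, with a demo that checks both produce the same counts.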

In [1]:
import time
def show_time():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS TZ'."""
    now = time.localtime(time.time())
    stamp = time.strftime('%Y-%m-%d %H:%M:%S %Z', now)
    print(stamp)
show_time()

2021-07-06 15:57:08 EDT


In [2]:
# Run configuration for the sample-data cell below.
# Passed to get_partitioned_sequences as the number of sequences wanted per class.
# NOTE(review): "PC" presumably means protein-coding -- confirm.
PC_SEQUENCES=32000
# Not referenced by any cell visible in this notebook.
# NOTE(review): "NC" presumably means non-coding -- confirm.
NC_SEQUENCES=32000
# Passed to the Random_Base_Oracle constructor; presumably the length of each generated sequence -- confirm.
RNA_LEN=32
# Passed to get_partitioned_sequences; presumably the coding-sequence length used to split the classes -- confirm.
CDS_LEN=16

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
import sys
# Detect whether we are running on Google CoLab, then set the data path and
# import ORF_counter / Random_Base_Oracle from the matching location.
IN_COLAB = False
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # The google.colab package could not be imported, so take the local branch.
    # Only ImportError is caught so that real failures (and Ctrl-C) still surface.
    pass
if IN_COLAB:
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    #drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    import requests
    r = requests.get('https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/RNA_describe.py')
    # Stop here on an HTTP error instead of saving the error page as a module.
    r.raise_for_status()
    with open('RNA_describe.py', 'w') as f:
        f.write(r.text)
    from RNA_describe import ORF_counter
    from RNA_describe import Random_Base_Oracle
else:
    print("CoLab not working. On my PC, use relative paths.")
    DATAPATH='data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import ORF_counter,Random_Base_Oracle
MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

CoLab not working. On my PC, use relative paths.


## Sample data

In [5]:
# Create the sequence source and draw the two classes of sample sequences.
# NOTE(review): the meaning of the second constructor argument (True) is not
# visible in this notebook -- confirm in SimTools/RNA_describe.py.
rbo=Random_Base_Oracle(RNA_LEN,True)
# Small trial run; its result is overwritten by the next call.
pc_all,nc_all = rbo.get_partitioned_sequences(CDS_LEN,10) # just testing
# Working data set: PC_SEQUENCES requested per class.
pc_all,nc_all = rbo.get_partitioned_sequences(CDS_LEN,PC_SEQUENCES)
print("Use",len(pc_all),"PC seqs")
print("Use",len(nc_all),"NC seqs")

It took 50 trials to reach 10 per class.
It took 138262 trials to reach 32000 per class.
Use 32000 PC seqs
Use 32000 NC seqs


## K-mer counting

### Functions to create the dict of {kmer:count}

In [6]:
def make_kmer_keys(K):
    """Return every string of length K over the alphabet A, C, G, T.

    The list is built by extending each shorter string with each base in
    turn, so the result is ordered with A < C < G < T at every position.
    K=0 yields a list holding only the empty string.
    """
    # No support for N or any non-ACGT bases.
    bases = 'ACGT'
    kmers = ['']
    for _ in range(K):
        kmers = [prefix + base for prefix in kmers for base in bases]
    return kmers
def make_kmer_dict(keys,init=0):
    """Return a new dict mapping each entry of keys to the value init."""
    return {kmer: init for kmer in keys}
def make_dict_upto_K(max_K):
    """Return a zero-initialised counts dict keyed by all k-mers for k = 1..max_K.

    Keys are ordered by length: all 1-mers first, then 2-mers, and so on.
    The 1-mers are always present, even when max_K is less than 1.
    """
    all_kmers = make_kmer_keys(1)
    for size in range(2, max_K + 1):
        all_kmers += make_kmer_keys(size)
    return make_kmer_dict(all_kmers)


### Naive K-mer counting algorithm
Algorithm:  
1. for every string  
    1. for every K  
        1. for every position  
            1. kmer=substring
            2. count{kmer}++

In [7]:
def update_count_one_K(counts,K,rna,tail=False):
    """Add the K-mers of one sequence to a counts table, in place.

    Parameters:
        counts: mapping of kmer -> count. It must already contain every
            k-mer that will be seen; with a plain dict, an unknown k-mer
            (for example one containing N) raises KeyError.
        K: length of the k-mers to count.
        rna: the sequence string to scan.
        tail: when True and K>1, also count every substring that starts in
            the last K-1 positions of rna. No full K-mer starts at those
            positions, so the Harvester algorithm needs them counted
            separately.

    Returns:
        The same counts object that was passed in.
    """
    L = len(rna)
    # Slide a window of width K; the range is empty when rna is shorter than K.
    for i in range(0,L-K+1):
        counts[rna[i:i+K]] += 1
    if tail and K>1:
        # for Harvester algorithm, count last letters as special case
        # Clamp the first start at 0: for sequences shorter than K-1 the raw
        # value L-K+1 is negative and would be read as an index from the end.
        for start_pos in range(max(0,L-K+1),L):
            for end_pos in range(start_pos+1,L+1):
                counts[rna[start_pos:end_pos]] += 1
    return counts
def update_count_upto_K(counts,max_K,sample,tail=False):
    """Naive counting: update counts for every k from 1 to max_K, in place.

    Calls update_count_one_K once per k, forwarding tail unchanged, and
    returns the same counts object.
    """
    for kmer_len in range(1, max_K + 1):
        update_count_one_K(counts, kmer_len, sample, tail)
    return counts

### Harvester K-mer counting algorithm
Algorithm:  
1. Count K-mers for max K only  
2. For each K-mer in counts table:  
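    1. For every proper prefix of the K-mer (lengths 1 to K-1):  
        1. count{prefix} += count{kmer}  
3. Handle the last K-1 letters of each string as a special case: count every substring that starts there, since no full K-mer starts at those positions.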

In [8]:
def harvest_counts_from_K(counts,max_K):
    """Credit each max_K-mer count to its shorter prefixes, in place.

    For every key of length max_K with a positive count, that count is
    added to each of its prefixes of length 1 to max_K-1. Only values
    change, never the key set, so iterating while updating is safe.
    Returns the same counts object.
    """
    for kmer, kcnt in counts.items():
        if len(kmer) == max_K and kcnt > 0:
            for cut in range(1, max_K):
                counts[kmer[:cut]] += kcnt
    return counts

## Demo

### Demo: Naive algorithm

In [13]:
# Naive demo: count every k from 1 to MAX_K directly on one short sequence.
MAX_K = 3
counts1 = make_dict_upto_K(MAX_K)
print("Initial counts:\n",counts1)

sample = "ACCGGGTTTTACGTACGT"
# counts1 is updated in place; the return value is not needed here.
update_count_upto_K(counts1,MAX_K,sample)
print("Final counts:\n",counts1)

Initial counts:
 {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'AA': 0, 'AC': 0, 'AG': 0, 'AT': 0, 'CA': 0, 'CC': 0, 'CG': 0, 'CT': 0, 'GA': 0, 'GC': 0, 'GG': 0, 'GT': 0, 'TA': 0, 'TC': 0, 'TG': 0, 'TT': 0, 'AAA': 0, 'AAC': 0, 'AAG': 0, 'AAT': 0, 'ACA': 0, 'ACC': 0, 'ACG': 0, 'ACT': 0, 'AGA': 0, 'AGC': 0, 'AGG': 0, 'AGT': 0, 'ATA': 0, 'ATC': 0, 'ATG': 0, 'ATT': 0, 'CAA': 0, 'CAC': 0, 'CAG': 0, 'CAT': 0, 'CCA': 0, 'CCC': 0, 'CCG': 0, 'CCT': 0, 'CGA': 0, 'CGC': 0, 'CGG': 0, 'CGT': 0, 'CTA': 0, 'CTC': 0, 'CTG': 0, 'CTT': 0, 'GAA': 0, 'GAC': 0, 'GAG': 0, 'GAT': 0, 'GCA': 0, 'GCC': 0, 'GCG': 0, 'GCT': 0, 'GGA': 0, 'GGC': 0, 'GGG': 0, 'GGT': 0, 'GTA': 0, 'GTC': 0, 'GTG': 0, 'GTT': 0, 'TAA': 0, 'TAC': 0, 'TAG': 0, 'TAT': 0, 'TCA': 0, 'TCC': 0, 'TCG': 0, 'TCT': 0, 'TGA': 0, 'TGC': 0, 'TGG': 0, 'TGT': 0, 'TTA': 0, 'TTC': 0, 'TTG': 0, 'TTT': 0}
Final counts:
 {'A': 3, 'C': 4, 'G': 5, 'T': 6, 'AA': 0, 'AC': 3, 'AG': 0, 'AT': 0, 'CA': 0, 'CC': 1, 'CG': 3, 'CT': 0, 'GA': 0, 'GC': 0, 'GG': 2, 'GT': 3, 'TA': 2, '

### Demo: Harvester algorithm

In [16]:
# Harvester demo: count only the MAX_K-mers (plus the tail special case),
# then derive the counts for the shorter k-mers from them.
MAX_K = 3
counts2 = make_dict_upto_K(MAX_K)
print("Initial counts:\n",counts2)

sample = "ACCGGGTTTTACGTACGT"
# tail=True also counts the substrings that start in the last MAX_K-1 letters.
update_count_one_K(counts2,MAX_K,sample,True)
# The colon belongs before the newline, matching the other labels.
print("Partial counts (just max K and special case letters):\n",counts2)
# Add each MAX_K-mer count to its prefixes to fill in the smaller k values.
harvest_counts_from_K(counts2,MAX_K)
print("Final counts (includes smaller values of K):\n",counts2)


Initial counts:
 {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'AA': 0, 'AC': 0, 'AG': 0, 'AT': 0, 'CA': 0, 'CC': 0, 'CG': 0, 'CT': 0, 'GA': 0, 'GC': 0, 'GG': 0, 'GT': 0, 'TA': 0, 'TC': 0, 'TG': 0, 'TT': 0, 'AAA': 0, 'AAC': 0, 'AAG': 0, 'AAT': 0, 'ACA': 0, 'ACC': 0, 'ACG': 0, 'ACT': 0, 'AGA': 0, 'AGC': 0, 'AGG': 0, 'AGT': 0, 'ATA': 0, 'ATC': 0, 'ATG': 0, 'ATT': 0, 'CAA': 0, 'CAC': 0, 'CAG': 0, 'CAT': 0, 'CCA': 0, 'CCC': 0, 'CCG': 0, 'CCT': 0, 'CGA': 0, 'CGC': 0, 'CGG': 0, 'CGT': 0, 'CTA': 0, 'CTC': 0, 'CTG': 0, 'CTT': 0, 'GAA': 0, 'GAC': 0, 'GAG': 0, 'GAT': 0, 'GCA': 0, 'GCC': 0, 'GCG': 0, 'GCT': 0, 'GGA': 0, 'GGC': 0, 'GGG': 0, 'GGT': 0, 'GTA': 0, 'GTC': 0, 'GTG': 0, 'GTT': 0, 'TAA': 0, 'TAC': 0, 'TAG': 0, 'TAT': 0, 'TCA': 0, 'TCC': 0, 'TCG': 0, 'TCT': 0, 'TGA': 0, 'TGC': 0, 'TGG': 0, 'TGT': 0, 'TTA': 0, 'TTC': 0, 'TTG': 0, 'TTT': 0}
Partial counts (just max K and special case letters)
: {'A': 0, 'C': 0, 'G': 1, 'T': 1, 'AA': 0, 'AC': 0, 'AG': 0, 'AT': 0, 'CA': 0, 'CC': 0, 'CG': 0, 'CT': 0, 'GA': 

In [12]:
# The two algorithms must produce identical {kmer:count} tables.
verdict = (
    "Success. Harvester output matches naive results!"
    if counts1==counts2
    else "Fail. Harvester output differs from naive results!"
)
print(verdict)

Success. Harvester output matches naive results!
