# K-mer 
The K-mer counting code has been moved into the SimTools library; this notebook imports it from there and demonstrates its use.

In [1]:
import time
def show_time():
    """Print the current local time as 'YYYY-MM-DD HH:MM:SS TZ'."""
    stamp_format = '%Y-%m-%d %H:%M:%S %Z'
    now_local = time.localtime(time.time())
    print(time.strftime(stamp_format, now_local))
show_time()

2021-07-07 11:49:14 EDT


In [2]:
# Simulation sizes used by the cells below.
PC_SEQUENCES = 32000  # sequence count requested from get_partitioned_sequences
NC_SEQUENCES = 32000  # NOTE(review): not referenced in the visible cells -- presumably the NC counterpart; confirm
RNA_LEN = 32          # first argument given to Random_Base_Oracle
CDS_LEN = 16          # first argument given to get_partitioned_sequences

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
import sys

# Base URL of the raw SimTools sources on GitHub; must end in "/".
_SIMTOOLS_RAW_URL = 'https://raw.githubusercontent.com/ShepherdCode/Soars2021/master/SimTools/'

def _fetch_module(filename):
    """Download one SimTools source file from GitHub into the working directory.

    Args:
        filename: file name relative to _SIMTOOLS_RAW_URL; also used as the
            local file name, e.g. 'KmerTools.py'.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never saved to disk and later imported as code.
        requests.RequestException: on connection problems or timeout.
    """
    import requests  # imported lazily: only the CoLab path needs it
    response = requests.get(_SIMTOOLS_RAW_URL + filename, timeout=30)
    response.raise_for_status()
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

# Detect Google CoLab by trying to import its helper package.
IN_COLAB = False
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on CoLab"; any other error
    # (including KeyboardInterrupt) should surface instead of being swallowed.
    pass
if IN_COLAB:
    print("On Google CoLab, mount cloud-local file, get our code from GitHub.")
    PATH='/content/drive/'
    #drive.mount(PATH,force_remount=True)  # hardly ever need this
    #drive.mount(PATH)    # Google will require login credentials
    DATAPATH=PATH+'My Drive/data/'  # must end in "/"
    # Fetch each module before importing it from the working directory.
    _fetch_module('RNA_describe.py')
    from RNA_describe import Random_Base_Oracle
    _fetch_module('KmerTools.py')
    from KmerTools import KmerTools
else:
    print("CoLab not working. On my PC, use relative paths.")
    DATAPATH='data/'  # must end in "/"
    sys.path.append("..") # append parent dir in order to use sibling dirs
    from SimTools.RNA_describe import Random_Base_Oracle
    from SimTools.KmerTools import KmerTools
MODELPATH="BestModel"  # saved on cloud instance and lost after logout
#MODELPATH=DATAPATH+MODELPATH  # saved on Google Drive but requires login

CoLab not working. On my PC, use relative paths.


## Demo

### Use Harvester algorithm

In [5]:
# Build the sequence oracle for RNA_LEN-long sequences.
# NOTE(review): the meaning of the second argument (True) is defined in RNA_describe.py -- confirm there.
rbo = Random_Base_Oracle(RNA_LEN, True)

# Small request first (10 sequences); its result is not kept.
rbo.get_partitioned_sequences(CDS_LEN, 10)  # just testing

# Full-size request: the PC and NC collections used by the following cells.
pc_all, nc_all = rbo.get_partitioned_sequences(CDS_LEN, PC_SEQUENCES)

print("Use", len(pc_all), "PC seqs")
print("Use", len(nc_all), "NC seqs")

It took 48 trials to reach 10 per class.
It took 139143 trials to reach 32000 per class.
Use 32000 PC seqs
Use 32000 NC seqs


In [6]:
MAX_K = 3  # largest K-mer length handled in this demo
tool = KmerTools()

# Fresh count dict; the printed result shows keys for every K-mer of length 1..MAX_K.
pc_counts = tool.make_dict_upto_K(MAX_K)

# Feed every PC sequence into the same dict.
# NOTE(review): the trailing True flag is defined in KmerTools -- confirm its meaning there.
for sample in pc_all:
    tool.update_count_one_K(pc_counts, MAX_K, sample, True)

# Presumably derives the shorter K-mer counts from the MAX_K counts (the "Harvester" step) -- confirm.
tool.harvest_counts_from_K(pc_counts, MAX_K)
print("PC counts:\n", pc_counts)

pc_freqs = tool.count_to_frequency(pc_counts, MAX_K)
print("Frequency:\n", pc_freqs)

PC counts:
 {'A': 283790, 'C': 204475, 'G': 262326, 'T': 273409, 'AA': 69382, 'AC': 50076, 'AG': 61372, 'AT': 94286, 'CA': 53110, 'CC': 45251, 'CG': 44870, 'CT': 53688, 'GA': 75972, 'GC': 56771, 'GG': 57309, 'GT': 64209, 'TA': 75672, 'TC': 45029, 'TG': 91378, 'TT': 53625, 'AAA': 15144, 'AAC': 13515, 'AAG': 13751, 'AAT': 23905, 'ACA': 12480, 'ACC': 10840, 'ACG': 10960, 'ACT': 13247, 'AGA': 15153, 'AGC': 13527, 'AGG': 13695, 'AGT': 15932, 'ATA': 18378, 'ATC': 10897, 'ATG': 49028, 'ATT': 13390, 'CAA': 11407, 'CAC': 9966, 'CAG': 9800, 'CAT': 20441, 'CCA': 11689, 'CCC': 9998, 'CCG': 10019, 'CCT': 12051, 'CGA': 11591, 'CGC': 9883, 'CGG': 10005, 'CGT': 11861, 'CTA': 17500, 'CTC': 9898, 'CTG': 12835, 'CTT': 11909, 'GAA': 17230, 'GAC': 15232, 'GAG': 15233, 'GAT': 25724, 'GCA': 14382, 'GCC': 12906, 'GCG': 12582, 'GCT': 14902, 'GGA': 14642, 'GGC': 12657, 'GGG': 12910, 'GGT': 15112, 'GTA': 19494, 'GTC': 12667, 'GTG': 15068, 'GTT': 14983, 'TAA': 22785, 'TAC': 9857, 'TAG': 21077, 'TAT': 20395, 'TCA'

In [7]:
# Same counting steps as for the PC set, applied to the NC sequences.
# Reuses `tool` and `MAX_K` from the preceding cell.
nc_counts = tool.make_dict_upto_K(MAX_K)

# NOTE(review): the trailing True flag is defined in KmerTools -- confirm its meaning there.
for sample in nc_all:
    tool.update_count_one_K(nc_counts, MAX_K, sample, True)

# Presumably derives the shorter K-mer counts from the MAX_K counts (the "Harvester" step) -- confirm.
tool.harvest_counts_from_K(nc_counts, MAX_K)
print("NC counts:\n", nc_counts)

nc_freqs = tool.count_to_frequency(nc_counts, MAX_K)
print("Frequency:\n", nc_freqs)

NC counts:
 {'A': 282420, 'C': 209249, 'G': 261038, 'T': 271293, 'AA': 69430, 'AC': 51817, 'AG': 62146, 'AT': 90609, 'CA': 53936, 'CC': 46568, 'CG': 47001, 'CT': 53990, 'GA': 74388, 'GC': 56483, 'GG': 56627, 'GT': 65503, 'TA': 74875, 'TC': 47051, 'TG': 87823, 'TT': 53753, 'AAA': 15779, 'AAC': 14121, 'AAG': 14084, 'AAT': 22990, 'ACA': 13437, 'ACC': 11452, 'ACG': 11671, 'ACT': 13134, 'AGA': 15932, 'AGC': 14022, 'AGG': 13953, 'AGT': 15784, 'ATA': 18382, 'ATC': 11844, 'ATG': 45043, 'ATT': 13265, 'CAA': 12060, 'CAC': 10357, 'CAG': 10345, 'CAT': 19334, 'CCA': 11960, 'CCC': 10241, 'CCG': 10362, 'CCT': 12214, 'CGA': 12230, 'CGC': 10501, 'CGG': 10350, 'CGT': 12065, 'CTA': 16936, 'CTC': 10360, 'CTG': 12683, 'CTT': 12135, 'GAA': 16926, 'GAC': 15357, 'GAG': 15005, 'GAT': 24821, 'GCA': 14241, 'GCC': 12675, 'GCG': 12842, 'GCT': 14794, 'GGA': 14392, 'GGC': 12708, 'GGG': 12876, 'GGT': 14698, 'GTA': 20503, 'GTC': 12753, 'GTG': 15681, 'GTT': 14622, 'TAA': 22524, 'TAC': 10283, 'TAG': 20892, 'TAT': 19333,