# Write K-mer Counts 02 
Convert each sequence to its K-mer counts and save the counts in a binary file,
so that the counting does not have to be repeated every time we train a model.

In [1]:
# Print the start time and the Python version so the captured outputs
# of this run can be traced back to when and how they were produced.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
# csv.reader parses the input sequence file; numpy builds and saves the counts.
from csv import reader
import numpy as np
# NOTE(review): math and random are not referenced by the cells shown here.
import math
import random

2022-10-11 17:08:36.939222
Python 3.10.0


In [2]:
from KmerCounter import KmerCounter

In [3]:
# Directory prefix for the input CSV and for every output file.
# It is joined to file names by plain concatenation, so keep the trailing '/'.
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
# SEQUENCE_FILE is assigned twice and the last assignment wins.
# To run on the testing file, reorder these lines or comment out the second one.
SEQUENCE_FILE = 'cds_100.csv' # testing
SEQUENCE_FILE = 'Homo_sapiens.GRCh38.cds.csv'


In [4]:
# Passed as the second argument to counter.seq_to_kmer_counts() in make_kmers.
# What the value 3 selects is defined by KmerCounter -- TODO confirm.
METHOD = 3
def get_output_filenames(K, prefix='cds'):
    """Return the (text, binary) output file names for a given K.

    Args:
        K: K-mer length; it is embedded in both names as ``k{K}``.
        prefix: Leading part of both names. The default ``'cds'`` yields
            the same names as before; pass ``'cds_100'`` to get the names
            used for the testing file.

    Returns:
        Tuple ``(text, binary)`` of bare file names without a directory:
        the ``.csv`` name first, then the ``.npy`` name.
    """
    stem = f"{prefix}.kmer_counts.k{K}"
    return f"{stem}.csv", f"{stem}.npy"

## Load sequences

In [5]:
def load_sequences(gencode_file):
    """Read gene/transcript IDs and sequences from a CSV file.

    The first row is treated as a header and discarded. In every other
    row, column 0 is the transcript ID, column 1 is the gene ID and
    column 4 is the sequence. Completely empty rows (for example a
    trailing blank line) are skipped.

    Args:
        gencode_file: Path of the CSV file to read.

    Returns:
        Tuple ``(gid_tid, tseqs)`` of two parallel lists in file order:
        ``gid_tid`` holds ``(gene_id, transcript_id)`` tuples and
        ``tseqs`` holds the sequence strings.

    Raises:
        IndexError: If a non-empty data row has fewer than five columns.
    """
    gid_tid = []
    tseqs = []
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects given to csv.reader.
    with open(gencode_file, 'r', newline='') as gencode:
        rows = reader(gencode)
        next(rows, None)  # discard the header; an empty file yields no rows
        for row in rows:
            if not row:
                # csv.reader returns [] for a blank line; nothing to load.
                continue
            tran_id = row[0]
            gene_id = row[1]
            tseq = row[4]
            gid_tid.append((gene_id, tran_id))
            tseqs.append(tseq)
    return gid_tid, tseqs

In [6]:
# Load every transcript from the input CSV. The two timestamps bracket the
# load so its duration can be read from the output.
print(datetime.now())
# Plain concatenation: relies on GENCODE_DIR ending with a path separator.
gencode_path = GENCODE_DIR+SEQUENCE_FILE
# ordered_gid_tid and ordered_seqs are parallel lists (same order, same length);
# later cells index them by position.
ordered_gid_tid,ordered_seqs = load_sequences(gencode_path)
print('Number of transcript ID:  ',len(ordered_gid_tid))
print('Number of transcript seqs:',len(ordered_seqs))
print(datetime.now())

2022-10-11 17:08:37.337810
Number of transcript ID:   98078
Number of transcript seqs: 98078
2022-10-11 17:08:39.082409


## Make K-mers

In [7]:
def make_kmers(counter,ordered_seqs,verbose=True):
    """Compute the K-mer counts of every sequence, preserving order.

    Each sequence is passed to ``counter.seq_to_kmer_counts(seq, METHOD)``,
    where ``METHOD`` is the module-level setting, and the result is stored
    at the same position as the sequence.

    Args:
        counter: Object providing ``seq_to_kmer_counts(seq, method)``.
        ordered_seqs: Sized sequence of the sequences to count.
        verbose: If true, print one '.' per 1000 sequences processed and
            a final newline.

    Returns:
        One-dimensional numpy array of dtype object with one element per
        input sequence; element ``i`` is whatever the counter returned
        for ``ordered_seqs[i]``.
    """
    tock = 1000  # progress granularity: sequences per printed dot
    # Object dtype: each slot holds one per-sequence result as returned.
    ordered_kmer_counts = np.empty(len(ordered_seqs),dtype=object)
    for i,seq in enumerate(ordered_seqs):
        ordered_kmer_counts[i] = counter.seq_to_kmer_counts(seq,METHOD)
        # Count completed sequences so a dot appears exactly every `tock`.
        if verbose and (i+1) % tock == 0:
            print('.',end='')
    if verbose:
        print()
    return ordered_kmer_counts

In [8]:
# Count K-mers for every loaded sequence. The two timestamps bracket the
# counting so its duration can be read from the output.
# K and counter are globals: later cells use K to build the output file
# names, and make_headers() calls counter.next_kmer().
K=4
counter = KmerCounter()
counter.setK(K)
print(datetime.now())
ordered_kmer_counts=make_kmers(counter,ordered_seqs)
print(datetime.now())

2022-10-11 17:08:39.102929
..................................................................................................
2022-10-11 17:21:54.364798


## Save binary file

In [9]:
# The saved binary file contains...
#    numpy array of seqs, with one element per sequence, of...
#       numpy array of counts (original note: "one 8-bit count per letter";
#       the element type is decided by KmerCounter -- TODO confirm)
# make_kmers() returns an object-dtype array, so np.save has to pickle it
# (allow_pickle=True) and readers must pass allow_pickle=True to np.load.

print(datetime.now())
text_file,binary_file = get_output_filenames(K)
OUTPUT_FILE=GENCODE_DIR+binary_file
# np.asarray returns its argument unchanged when it is already an ndarray.
NUMPY_ARRAY = np.asarray(ordered_kmer_counts)
np.save(OUTPUT_FILE,NUMPY_ARRAY,allow_pickle=True)
print(datetime.now())

2022-10-11 17:21:54.373775
2022-10-11 17:21:55.187343


## Save text file

In [10]:
def make_headers(K):
    """Build the header line of the K-mer count text file.

    The line starts with ``gene_id`` and ``transcript_id``, followed by
    one column per K-mer. K-mers start at ``'A'*K`` and are advanced with
    the module-level ``counter.next_kmer()`` until it returns ``None``.

    Args:
        K: K-mer length.

    Returns:
        The comma-separated header as a single string, without a
        trailing newline.
    """
    columns = ['gene_id','transcript_id']
    kmer = 'A'*K
    while kmer is not None:
        columns.append(kmer)
        kmer = counter.next_kmer(kmer)
    return ','.join(columns)

In [11]:
# Reload the saved counts from the binary file and write them as CSV:
# a header line, then one row per sequence holding gene ID, transcript ID
# and the counts. Rows are matched to IDs by position in ordered_gid_tid.
text_file,binary_file = get_output_filenames(K)
INPUT_BINARY = f"{GENCODE_DIR}{binary_file}"
OUTPUT_TEXT = f"{GENCODE_DIR}{text_file}"
NUMPY_ARRAY = np.load(INPUT_BINARY,allow_pickle=True)
with open(OUTPUT_TEXT,'w') as f:
    f.write(make_headers(K)+'\n')
    for i,counts in enumerate(NUMPY_ARRAY):
        gene_id,tran_id = ordered_gid_tid[i]
        count_text = ','.join(str(count) for count in counts)
        f.write(gene_id+','+tran_id+','+count_text+'\n')


In [12]:
# Final timestamp: marks the end of the run, after the text file is written.
print(datetime.now())

2022-10-11 17:22:19.121772
