# Write K-mer Counts 
These are the first K-mer counts generated after we revised our data in Oct 2022. Previously, we used sequences from Ensembl files; now we use sequences from GENCODE files.

In [1]:
# Print the current timestamp so the notebook output records when it was run.
from datetime import datetime
print(datetime.now())
# Print the interpreter version used for this run.
from platform import python_version
print('Python',python_version())
import numpy as np
# NOTE(review): math and random are not referenced in the cells visible here.
import math
import random

2022-11-03 08:12:35.910255
Python 3.10.0


In [2]:
# KmerCounter comes from a module outside this notebook.
from KmerCounter import KmerCounter
# K-mer length; it is handed to KmerCounter.setK() by the functions below.
# NOTE: the names in OUTPUT_FILES hard-code 'K4' -- keep them in sync with K.
K=4

In [3]:
# Directory holding both the inputs and the outputs. File names are appended
# with plain string concatenation, so the trailing slash is required.
DATA_DIR = '/Users/jasonmiller/WVU/Localization/TrainTest/'

# Transcript csv files to read. INPUT_FILES[i] is paired with OUTPUT_FILES[i],
# so the two lists must stay the same length and in the same order.
INPUT_FILES=[
    'CNRCI_coding_test_transcripts.gc42.csv',
    'CNRCI_coding_train_transcripts.gc42.csv',
    'CNRCI_noncoding_test_transcripts.gc42.csv',
    'CNRCI_noncoding_train_transcripts.gc42.csv'
]

# K-mer count csv files to write, one per input file (same index).
# The 'K4' in each name is hard-coded, not derived from K.
OUTPUT_FILES=[
    'CNRCI_coding_test_counts.K4.gc42.csv',
    'CNRCI_coding_train_counts.K4.gc42.csv',
    'CNRCI_noncoding_test_counts.K4.gc42.csv',
    'CNRCI_noncoding_train_counts.K4.gc42.csv'
]

In [4]:
class Progress_Bar():
    '''
    Minimal text progress indicator.
    Prints one dot to stdout for every `tock` calls to advance().
    Call done() at the end to terminate the line of dots.
    '''
    def __init__(self,tock=1000):
        '''
        tock = number of advance() calls per printed dot.
        '''
        self.tock = tock
        self.tick = 0      # advance() calls since the last dot
    def advance(self):
        '''
        Count one unit of work; print a dot when `tock` units accumulate.
        '''
        self.tick += 1
        if self.tick >= self.tock:
            # Reset to zero (not one) so that every dot, not just the
            # first, represents exactly `tock` calls.
            self.tick = 0
            # Flush so the dot shows up immediately even when stdout
            # is buffered.
            print('.',end='',flush=True)
    def done(self):
        '''
        Print a newline to end the row of dots.
        '''
        print()
    def __repr__(self):
        return "Progress_Bar("+str(self.tick)+","+str(self.tock)+")"

In [5]:
def count_kmers_from_file(K,filename,verbose=True,check_length=True):
    '''
    Use KmerCounter to count K-mers in every sequence of a csv file.
    Assume input file is csv with these headers:
    transcript_id,gene_id,biotype,length,sequence.
    The first line is treated as the header and skipped.
    Blank lines (e.g. a trailing empty line) are ignored.

    Parameters:
    K = K-mer length, passed to KmerCounter.setK().
    filename = path of the csv file to read.
    verbose = if True, advance a Progress_Bar once per data row.
    check_length = if True, compare the length column to len(sequence).

    Returns a numpy array built from the per-sequence results of
    KmerCounter.seq_to_kmer_counts(), with row order matching the input file.

    Raises ValueError if check_length is set and the reported and
    measured lengths disagree.
    '''
    counter = KmerCounter()
    counter.setK(K)
    ordered_kmer_counts = []
    progress_bar = Progress_Bar()
    with open (filename,'r') as infile:
        next(infile, None)  # skip the header line; tolerate an empty file
        for row in infile:
            row = row.strip()
            if not row:
                # A blank line has no fields; skip it rather than
                # failing with IndexError on fields[4].
                continue
            if verbose:
                progress_bar.advance()
            fields = row.split(',')
            seq = fields[4]
            if check_length:           # slower but safer
                transcript_id = fields[0]  # with version number
                reported_length = int(fields[3])
                measured_length = len(seq)
                if reported_length != measured_length:
                    print('Transcript      = ',transcript_id)
                    print('Reported length =',reported_length)
                    print('Measured length =',measured_length)
                    # ValueError is a subclass of Exception, so callers
                    # that catch Exception keep working.
                    raise ValueError('Something is wrong with the inputs!')
            ordered_kmer_counts.append(counter.seq_to_kmer_counts(seq))
    if verbose:
        progress_bar.done()
    return np.asarray(ordered_kmer_counts)

In [6]:
def make_header(K):
    '''
    Build the csv header line for a table of K-mer counts.
    The line starts with the columns gene_id and transcript_id,
    followed by one column per K-mer. K-mers are enumerated starting
    from 'A'*K and stepping with KmerCounter.next_kmer() until it
    returns None. (Presumably for K=1 the K-mer columns are A,C,G,T.)
    Returns the comma-separated header with no trailing newline.
    '''
    kmer_source = KmerCounter()
    kmer_source.setK(K)
    columns = ['gene_id','transcript_id']
    kmer = 'A'*K
    # The first K-mer is always emitted; stop once next_kmer() yields None.
    while kmer is not None:
        columns.append(kmer)
        kmer = kmer_source.next_kmer(kmer)
    return ','.join(columns)

In [7]:
def save_as_text(header,kmer_counts,seq_file,out_file):
    '''
    Combine data (kmer_counts) from array with
    metadata (transcript_id,gene_id) from seq_file.
    Assume 1-to-1 row correspondence of array and file:
    kmer_counts[i] belongs to the i-th data row of seq_file.
    Save one row of count per sequence to a csv file.

    Parameters:
    header = first line to write to out_file (without newline).
    kmer_counts = indexable collection of per-sequence count rows.
    seq_file = path of the input csv; its first line is skipped as the
               header and its first two columns are transcript_id,gene_id.
    out_file = path of the csv file to (over)write.

    Each output row is gene_id,transcript_id,count,count,...
    Blank lines in seq_file are ignored and do not consume a row
    of kmer_counts.
    '''
    with open(out_file,'w') as ofile, open(seq_file,'r') as ifile:
        ofile.write(header)
        ofile.write('\n')
        next(ifile, None)  # skip the header line; tolerate an empty file
        i=0
        for line in ifile:
            if not line.strip():
                # A blank line (e.g. trailing newline at end of file) has
                # no fields; skip it rather than failing on fields[1].
                continue
            fields = line.split(',')
            transcript_id = fields[0]
            gene_id = fields[1]
            all_elements = ','.join(str(element) for element in kmer_counts[i])
            # Note the output puts gene_id first, the reverse of the input.
            ofile.write(gene_id+','+transcript_id+','+all_elements+'\n')
            i += 1

## Data Processing

In [8]:
# Driver cell: for each (input, output) file pair, count K-mers and write
# them to csv. Timestamps are printed before each file and at the end so
# the cell output shows how long each file took.
print(datetime.now())
print('Count K-mers, K=',K)
iterations = len(INPUT_FILES)
## iterations = 1  # just for testing
for i in range(iterations):
    print(datetime.now())
    # INPUT_FILES[i] and OUTPUT_FILES[i] are paired by index.
    inpath = DATA_DIR+INPUT_FILES[i]
    outpath = DATA_DIR+OUTPUT_FILES[i]
    print('Count K-mers in sequence from this file:\n',inpath)
    ordered_kmer_counts = count_kmers_from_file(K,inpath)
    print('Write counts to:\n',outpath)
    header = make_header(K)
    # The input file is read a second time here to recover the
    # transcript and gene ids that go alongside each row of counts.
    save_as_text(header,ordered_kmer_counts,inpath,outpath)
print(datetime.now())

2022-11-03 08:12:36.358224
Count K-mers, K= 4
2022-11-03 08:12:36.360674
Count K-mers in sequence from this file:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_coding_test_transcripts.gc42.csv
.................
Write counts to:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_coding_test_counts.K4.gc42.csv
2022-11-03 08:16:40.428669
Count K-mers in sequence from this file:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_coding_train_transcripts.gc42.csv
....................................................................
Write counts to:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_coding_train_counts.K4.gc42.csv
2022-11-03 08:33:32.682798
Count K-mers in sequence from this file:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_noncoding_test_transcripts.gc42.csv
.....
Write counts to:
 /Users/jasonmiller/WVU/Localization/TrainTest/CNRCI_noncoding_test_counts.K4.gc42.csv
2022-11-03 08:34:17.543324
Count K-mers in sequence from this file:
 /Users/jasonmiller/