# Write K-mer Counts 
Write K-mer counts to CSV files (run on Windows).
This time the counts are not capped (previously, for K=4, we capped each count at 255).

In [1]:
# Print the run timestamp and Python version so the saved output records
# when and with which interpreter this notebook was executed.
from datetime import datetime
print(datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import math
import random

2022-10-25 13:06:16.412061
Python 3.10.6


In [2]:
from KmerCounter import KmerCounter
# K-mer length; passed to kmers_from_file() and make_headers() in the driver cell.
K=2

In [3]:
# Input/output directories.
# NOTE(review): these are absolute Windows paths; adjust them for the local machine.
GENCODE_DIR = 'D:/Adjeroh/Localization/GenCode/'
ATLAS_DIR = 'D:/Adjeroh/Localization/LncAtlas/'

# The three lists below are parallel: entry i of each list describes the same subset.

# Gene-id files, read from ATLAS_DIR.
ID_SUBSETS = [
    'CNRCI_coding_test_genes.csv',
    'CNRCI_coding_train_genes.csv',
    'CNRCI_noncoding_test_genes.csv',
    'CNRCI_noncoding_train_genes.csv'
]
# Sequence files, read from GENCODE_DIR.
SEQ_SUBSETS = [
    'CNRCI_coding_test_sequences.csv',
    'CNRCI_coding_train_sequences.csv',
    'CNRCI_noncoding_test_sequences.csv',
    'CNRCI_noncoding_train_sequences.csv'
]
# K-mer count files, written to GENCODE_DIR.
# The file names embed the current K so that re-running with a different K
# cannot overwrite counts that were produced for another K.
KMER_COUNTS_TEXT = [
    f'CNRCI_coding_test_counts.K{K}.2byte.csv',
    f'CNRCI_coding_train_counts.K{K}.2byte.csv',
    f'CNRCI_noncoding_test_counts.K{K}.2byte.csv',
    f'CNRCI_noncoding_train_counts.K{K}.2byte.csv'
]

## Load sequences

In [4]:
def load_gene_ids(ids_file):
    """Return the set of gene IDs found in the first column of a CSV file.

    The first line of the file is treated as a header and skipped.

    Parameters
    ----------
    ids_file : str
        Path to a comma-separated text file whose first column holds gene IDs.

    Returns
    -------
    set of str
        The distinct gene IDs with surrounding whitespace removed.
        Blank lines and rows with an empty first field contribute nothing.
    """
    gene_ids = set()
    with open(ids_file,'r') as handle:
        next(handle, None)  # skip the header line (no-op on an empty file)
        for row in handle:
            # Strip the field: in a single-column file the first field would
            # otherwise keep its trailing newline and never match a real ID.
            gene_id = row.split(',', 1)[0].strip()
            if gene_id:
                gene_ids.add(gene_id)
    return gene_ids

In [5]:
def kmers_from_file(K,filename,verbose=True):
    """Count K-mers for every sequence row of a transcript CSV file.

    The first line of the file is skipped as a header. Every other row is
    split on commas and read as: transcript id (with version number),
    gene id (without version number), gene biotype (e.g. protein_coding),
    sequence length, sequence. Only the length and the sequence are used here.

    Parameters
    ----------
    K : int
        K-mer length handed to KmerCounter.setK().
    filename : str
        Path of the sequence CSV file.
    verbose : bool, optional
        When True, print one '.' per 1000 rows as a progress indicator,
        followed by a newline at the end.

    Returns
    -------
    numpy.ndarray
        np.asarray() of the per-row results of
        KmerCounter.seq_to_kmer_counts(), in file order.
        NOTE(review): presumably one row per sequence and one column per
        K-mer; confirm against KmerCounter.

    Raises
    ------
    Exception
        If the length field of a row differs from the length of its sequence.
    """
    counter = KmerCounter()
    counter.setK(K)
    ordered_kmer_counts = []
    tock = 1000  # rows per progress dot
    tick = 0
    with open (filename,'r') as infile:
        next(infile, None)  # skip the header line (no-op on an empty file)
        for row in infile:
            if verbose:
                tick += 1
                if tick >= tock:
                    # Restart from 0 so that every dot stands for exactly
                    # `tock` rows, not only the first one.
                    tick = 0
                    print('.',end='')
            row = row.strip()
            fields = row.split(',')
            reported_length = int(fields[3])
            seq = fields[4]
            measured_length = len(seq)
            # Sanity check: a mismatch suggests a truncated or mis-parsed row.
            if reported_length != measured_length:
                print(row)
                print('Reported length =',reported_length)
                print('Measured length =',measured_length)
                raise Exception('Something is wrong with the inputs!')
            ordered_kmer_counts.append(counter.seq_to_kmer_counts(seq))
    if verbose:
        print()
    numpy_kmer_counts = np.asarray(ordered_kmer_counts)
    return numpy_kmer_counts

In [6]:
def make_headers(K):
    """Build the comma-separated header line for a K-mer count table.

    The line starts with 'gene_id' and 'transcript_id', followed by one
    column name per K-mer. K-mers are enumerated starting from 'A'*K and
    advanced with KmerCounter.next_kmer() until it returns None.

    Parameters
    ----------
    K : int
        K-mer length handed to KmerCounter.setK().

    Returns
    -------
    str
        The header line, without a trailing newline.
    """
    kmer_source = KmerCounter()
    kmer_source.setK(K)
    columns = ['gene_id', 'transcript_id']
    kmer = 'A'*K
    while kmer is not None:
        columns.append(kmer)
        kmer = kmer_source.next_kmer(kmer)
    return ','.join(columns)

In [7]:
def save_as_text(header,kmer_counts,seq_file,out_file):
    """Write K-mer counts to a CSV file, one line per sequence row.

    The output starts with `header`. Then, for the n-th data row of
    `seq_file` (its first line is skipped as a header), one line is written
    holding the row's gene id (second field), its transcript id (first
    field) and the values of `kmer_counts[n]`, all comma-separated.

    Parameters
    ----------
    header : str
        Header line for the output file, without a trailing newline.
    kmer_counts : indexable of iterables
        Counts per sequence, in the same order as the rows of `seq_file`.
    seq_file : str
        Path of the sequence CSV file the counts were computed from.
    out_file : str
        Path of the CSV file to create or overwrite.
    """
    with open(out_file,'w') as ofile, open(seq_file,'r') as ifile:
        ofile.write(header + '\n')
        next(ifile, None)  # drop the header line of the sequence file
        for row_number, line in enumerate(ifile):
            parts = line.split(',')
            transcript_id = parts[0]
            gene_id = parts[1]
            # The counts are joined on their own first so that an empty
            # count row still leaves the trailing comma after the ids.
            count_text = ','.join(map(str, kmer_counts[row_number]))
            ofile.write(','.join([gene_id, transcript_id, count_text]) + '\n')

In [8]:
print(datetime.now())
# ID_SUBSETS, SEQ_SUBSETS and KMER_COUNTS_TEXT are parallel lists: entry i of
# each describes one subset. Process all of them; lower `iterations` for a
# quick test run.
iterations = len(SEQ_SUBSETS)
if not (len(ID_SUBSETS) == len(KMER_COUNTS_TEXT) == iterations):
    raise ValueError('ID_SUBSETS, SEQ_SUBSETS and KMER_COUNTS_TEXT must have the same length')
# The header depends only on K, so build it once instead of once per subset.
headers = make_headers(K)
for i in range(iterations):
    print(datetime.now())

    gene_subset = ATLAS_DIR+ID_SUBSETS[i]
    print('Read gene id subset from',gene_subset)
    # NOTE(review): `ids` is not used by the steps below; the call is kept
    # so that a missing or unreadable gene file still stops the run.
    ids = load_gene_ids(gene_subset)

    seqfile = GENCODE_DIR+SEQ_SUBSETS[i]
    print(datetime.now())
    print('Count K-mers, K=',K)
    ordered_kmer_counts=kmers_from_file(K,seqfile)
    print(datetime.now())

    text_file = GENCODE_DIR+KMER_COUNTS_TEXT[i]
    print('Save counts to text file',text_file)
    save_as_text(headers,ordered_kmer_counts,seqfile,text_file)
    print()

2022-10-25 13:06:16.700674
2022-10-25 13:06:16.700674
Read gene id subset from D:/Adjeroh/Localization/LncAtlas/CNRCI_coding_test_genes.csv
2022-10-25 13:06:16.716296
Count K-mers, K= 2
................
2022-10-25 13:07:32.692473
Save counts to text file D:/Adjeroh/Localization/GenCode/CNRCI_coding_test_counts.K2.2byte.csv

2022-10-25 13:07:32.976534
Read gene id subset from D:/Adjeroh/Localization/LncAtlas/CNRCI_coding_train_genes.csv
2022-10-25 13:07:32.988614
Count K-mers, K= 2
....................................................................
2022-10-25 13:12:46.437458
Save counts to text file D:/Adjeroh/Localization/GenCode/CNRCI_coding_train_counts.K2.2byte.csv

2022-10-25 13:12:47.549637
Read gene id subset from D:/Adjeroh/Localization/LncAtlas/CNRCI_noncoding_test_genes.csv
2022-10-25 13:12:47.557668
Count K-mers, K= 2
........
2022-10-25 13:13:32.256543
Save counts to text file D:/Adjeroh/Localization/GenCode/CNRCI_noncoding_test_counts.K2.2byte.csv

2022-10-25 13:13:32.4074

In [9]:
# Final timestamp: records when this last cell was executed.
print(datetime.now())

2022-10-25 13:14:59.128137
