# Write K-mer Counts 03 
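Compute K-mer counts separately for the train and test subsets of the coding and noncoding sequences, and save them as CSV text and NumPy binary files.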

In [1]:
# Print the current time so the captured output records when the cell ran.
from datetime import datetime
print(datetime.now())
# Print the interpreter version used for this run.
from platform import python_version
print('Python',python_version())
import numpy as np
import math
import random

2022-10-13 14:57:44.724065
Python 3.10.0


In [2]:
# KmerCounter comes from a module outside this notebook; it does the counting.
from KmerCounter import KmerCounter
# K-mer length. NOTE(review): the output filenames configured in this notebook
# contain a literal 'K4', so they do not follow this value if it is changed.
K=4

In [3]:
# Directories holding the sequence files and the gene-id subset files.
GENCODE_DIR = '/Users/jasonmiller/WVU/Localization/GenCode/'
ATLAS_DIR = '/Users/jasonmiller/WVU/Localization/LncAtlas/'

# The five lists below are parallel: index i in each one refers to the same
# subset. They are all derived from this single ordered tuple of subset tags.
_SUBSET_TAGS = (
    'coding_test',
    'coding_train',
    'noncoding_test',
    'noncoding_train',
)

# Full sequence file that each subset is filtered from.
GENCODE_FILES = [
    'Homo_sapiens.GRCh38.ncrna.csv' if tag.startswith('noncoding')
    else 'Homo_sapiens.GRCh38.cds.csv'
    for tag in _SUBSET_TAGS
]
# Gene-id lists (read from ATLAS_DIR) that define each subset.
ID_SUBSETS = [f'CNRCI_{tag}_genes.csv' for tag in _SUBSET_TAGS]
# Filtered sequence rows written for each subset.
SEQ_SUBSETS = [f'CNRCI_{tag}_sequences.csv' for tag in _SUBSET_TAGS]
# K-mer count outputs; the 'K4' in the names is a fixed literal.
KMER_COUNTS_TEXT = [f'CNRCI_{tag}_counts.K4.csv' for tag in _SUBSET_TAGS]
KMER_COUNTS_BINARY = [f'CNRCI_{tag}_counts.K4.npy' for tag in _SUBSET_TAGS]

## Load sequences

In [4]:
def load_gene_ids(ids_file):
    """Read the gene IDs found in the first column of a CSV file.

    The first line of the file is treated as a header and skipped. For every
    other line, the text before the first comma is taken as the gene ID.

    Args:
        ids_file: Path of the comma-separated file to read.

    Returns:
        A set of gene ID strings (empty if the file has no data lines).
    """
    gene_ids = set()
    with open(ids_file,'r') as handle:
        next(handle, None)  # skip the header line; tolerate an empty file
        for row in handle:
            # Drop the line terminator first: when the file has a single
            # column, the first field would otherwise end in '\n' and never
            # match an ID taken from the middle of another file's row.
            gene_id = row.rstrip('\r\n').split(',')[0]
            if gene_id:  # ignore blank lines
                gene_ids.add(gene_id)
    return gene_ids

In [5]:
def load_subset(gene_ids,seq_file):
    """Select the rows of a CSV file whose second field is a wanted gene ID.

    Args:
        gene_ids: Container of gene ID strings to keep.
        seq_file: Path of the comma-separated file to filter.

    Returns:
        A (header, rows) pair: the file's first line (None if the file is
        empty) and the list of matching lines, unmodified and in file order.
    """
    with open(seq_file,'r') as handle:
        header = next(handle, None)
        # The gene ID is the second comma-separated field of each data line.
        rows = [line for line in handle if line.split(',')[1] in gene_ids]
    return header,rows

In [6]:
def save_rows(header,rows,seq_file):
    """Write a header line followed by the given rows to a text file.

    Nothing is appended to the strings, so each one is expected to carry its
    own line terminator already.

    Args:
        header: Header line to write first.
        rows: Iterable of row strings to write after the header.
        seq_file: Path of the file to create or overwrite.
    """
    with open (seq_file,'w') as output:
        output.write(header)
        output.writelines(rows)

In [7]:
def kmers_from_file(K,filename,verbose=True):
    """Count K-mers for every data row of a CSV sequence file.

    The first line of the file is treated as a header and skipped. For each
    remaining line, the fifth comma-separated field is passed to
    KmerCounter.seq_to_kmer_counts and the results are collected in file
    order.

    Args:
        K: K-mer length, passed to KmerCounter.setK.
        filename: Path of the comma-separated file to read.
        verbose: When true, print one '.' for every 1000 rows processed and
            a final newline.

    Returns:
        np.asarray of the collected per-row results, one entry per data row.
    """
    counter = KmerCounter()
    counter.setK(K)
    progress_every = 1000
    ordered_kmer_counts = []
    with open (filename,'r') as infile:
        next(infile, None)  # skip the header line; tolerate an empty file
        for row_number, row in enumerate(infile, start=1):
            # Emit a dot on every 1000th row. The earlier hand-rolled counter
            # reset to 1 instead of 0, so later dots came every 999 rows.
            if verbose and row_number % progress_every == 0:
                print('.',end='')
            # NOTE(review): if the sequence is the last column, this field
            # still ends with the line terminator; confirm KmerCounter
            # tolerates that.
            seq = row.split(',')[4]
            ordered_kmer_counts.append(counter.seq_to_kmer_counts(seq))
    if verbose:
        print()
    return np.asarray(ordered_kmer_counts)

In [8]:
def make_headers(K):
    """Build the comma-separated header line for a K-mer counts text file.

    The line starts with 'gene_id' and 'transcript_id', followed by one
    column per K-mer: 'A'*K first, then each value returned by
    KmerCounter.next_kmer until it returns None.

    Args:
        K: K-mer length, passed to KmerCounter.setK.

    Returns:
        The header as a single string with no trailing newline.
    """
    counter = KmerCounter()
    counter.setK(K)
    columns = ['gene_id', 'transcript_id']
    kmer = 'A'*K
    while kmer is not None:
        columns.append(kmer)
        kmer = counter.next_kmer(kmer)
    return ','.join(columns)

In [9]:
def save_as_text(header,kmer_counts,seq_file,out_file):
    """Write K-mer counts as CSV, labelling each row with its gene and transcript.

    The IDs are re-read from seq_file: its first line is skipped, and the
    n-th data line supplies the IDs for kmer_counts[n]. In seq_file the
    transcript ID is the first field and the gene ID the second; the output
    puts the gene ID first.

    Args:
        header: Header line for the output file, without a trailing newline.
        kmer_counts: Indexable sequence of per-row count sequences, in the
            same order as the data lines of seq_file.
        seq_file: Path of the sequence CSV the counts were computed from.
        out_file: Path of the CSV file to create or overwrite.
    """
    with open(out_file,'w') as ofile, open(seq_file,'r') as ifile:
        ofile.write(header)
        ofile.write('\n')
        next(ifile, None)  # skip the header line of the sequence file
        for row_index, line in enumerate(ifile):
            parts = line.split(',')
            transcript_id, gene_id = parts[0], parts[1]
            counts_text = ','.join(str(count) for count in kmer_counts[row_index])
            ofile.write(gene_id+','+transcript_id+','+counts_text)
            ofile.write('\n')

In [10]:
def save_as_binary(kmer_counts,filename):
    """Save the counts to a NumPy binary file.

    Args:
        kmer_counts: Array-like of counts; converted with np.asarray.
        filename: Destination path, passed to np.save.
    """
    np.save(filename,np.asarray(kmer_counts),allow_pickle=True)

In [12]:
# Driver: for each subset, filter the full sequence file down to the subset's
# genes, count K-mers on the filtered rows, and save the counts as text and
# as a NumPy binary. Timestamps are printed between steps.
print(datetime.now())
iterations = 4  # number of subsets to process; each filename list has 4 entries
K=4
for i in range(iterations):
    print(datetime.now())

    # Step 1: read the gene IDs that define subset i.
    gene_subset = ATLAS_DIR+ID_SUBSETS[i]
    print('Read gene id subset from',gene_subset)
    ids = load_gene_ids(gene_subset)

    # Step 2: keep only the rows of the full sequence file for those genes.
    all_seqs = GENCODE_DIR+GENCODE_FILES[i]
    print('Read subset of rows from',all_seqs)
    header,rows = load_subset(ids,all_seqs)

    # Step 3: write the filtered rows out; the counting step reads this file.
    seqfile = GENCODE_DIR+SEQ_SUBSETS[i]
    print('Write subset of rows to',seqfile)
    save_rows(header,rows,seqfile)

    # Step 4: count K-mers for each row of the filtered file.
    print(datetime.now())
    print('Count K-mers, K=',K)
    ordered_kmer_counts=kmers_from_file(K,seqfile)
    print(datetime.now())

    # Step 5: save counts as CSV; seqfile is re-read to label rows with IDs.
    # NOTE(review): the output filenames contain 'K4' regardless of K.
    text_file = GENCODE_DIR+KMER_COUNTS_TEXT[i]
    print('Save counts to text file',text_file)
    headers = make_headers(K)
    save_as_text(headers,ordered_kmer_counts,seqfile,text_file)

    # Step 6: save the same counts as a NumPy binary file.
    binary_file = GENCODE_DIR+KMER_COUNTS_BINARY[i]
    print('Save counts to binary file',binary_file)
    save_as_binary(ordered_kmer_counts,binary_file)
    
    print()

2022-10-13 15:01:27.314812
2022-10-13 15:01:27.315894
Read gene id subset from /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_coding_test_genes.csv
Read subset of rows from /Users/jasonmiller/WVU/Localization/GenCode/Homo_sapiens.GRCh38.cds.csv
Write subset of rows to /Users/jasonmiller/WVU/Localization/GenCode/CNRCI_coding_test_sequences.csv
2022-10-13 15:01:27.776410
Count K-mers, K= 4
................
2022-10-13 15:03:51.137810
Save counts to text file /Users/jasonmiller/WVU/Localization/GenCode/CNRCI_coding_test_counts.K4.csv
Save counts to binary file /Users/jasonmiller/WVU/Localization/GenCode/CNRCI_coding_test_counts.K4.npy

2022-10-13 15:03:55.939109
Read gene id subset from /Users/jasonmiller/WVU/Localization/LncAtlas/CNRCI_coding_train_genes.csv
Read subset of rows from /Users/jasonmiller/WVU/Localization/GenCode/Homo_sapiens.GRCh38.cds.csv
Write subset of rows to /Users/jasonmiller/WVU/Localization/GenCode/CNRCI_coding_train_sequences.csv
2022-10-13 15:03:56.956858
Count

In [None]:
# Print the current time as a closing timestamp.
print(datetime.now())