In [1]:
import itertools

bases = ["A", "T", "G", "C"]


def create_features(k):
    """Enumerate every k-mer that can be built from the letters in ``bases``.

    Args:
        k: Length of each k-mer (number of letters).

    Returns:
        A list of ``len(bases) ** k`` strings, ordered as produced by
        ``itertools.product`` (the last position varies fastest).
    """
    features = []
    for letters in itertools.product(bases, repeat=k):
        features.append("".join(letters))
    return features

def create_dict(list):
    """Build a fresh counter dictionary with every given key set to 0.

    Args:
        list: Iterable of hashable keys (k-mer strings in this notebook).
            The name shadows the builtin but is kept because it is part of
            the function's signature.

    Returns:
        A new dict mapping each key to 0, in the order the keys were given.
    """
    return dict.fromkeys(list, 0)

In [2]:
import numpy as np
import operator

# Get the k-mer nucleotide composition
def getArray(k):
    """Build the k-mer frequency matrix for the lncRNA sequences.

    The four class files are read in the order nucleolus, cytoplasm,
    ribosome, exosome. Within each file every second line (2nd, 4th, ...)
    is taken as a sequence; the lines in between are skipped. Each sequence
    fills the next row of the matrix.

    Args:
        k: Length of the k-mers to count.

    Returns:
        A float array of shape ``(644, 4 ** k)``. Row ``r`` holds, for the
        r-th sequence read, the count of each k-mer divided by the total
        count of recognised k-mers in that sequence. Columns follow the
        order returned by ``create_features(k)``. Windows containing
        characters that do not form a known k-mer are ignored. Rows with no
        recognised k-mer (including rows never filled because fewer than
        644 sequences were read) are left as all zeros rather than NaN.

    Raises:
        FileNotFoundError: If one of the data files is missing.
        IndexError: If the files contain more than 644 sequences.
    """
    # 644 is the total number of lncRNA sequences
    n_sequences = 644
    data_files = [
        "Data/Nucleolus.txt",
        "Data/Cytoplasm.txt",
        "Data/Ribosome.txt",
        "Data/Exosome.txt",
    ]

    # Column index of each of the 4^k k-mers, in create_features order
    kmer_column = {kmer: col for col, kmer in enumerate(create_features(k))}

    # Allocated as float from the start so the normalisation below can run
    # in place instead of building a second full-size array.
    k_array = np.zeros((n_sequences, pow(4, k)), dtype=float)

    # Read the lncRNA sequences
    groups = []
    for path in data_files:
        with open(path, "r") as file_object:
            groups.append(file_object.readlines())

    # Get the occurrence number of each k-mer for a lncRNA sequence
    row = 0
    for lines in groups:
        # Lines alternate header / sequence, so sequences sit at odd indices
        for seq in lines[1::2]:
            if row >= n_sequences:
                raise IndexError(
                    "more than %d sequences found in the data files" % n_sequences
                )
            seq = seq.rstrip()
            # Count only the k-mers that actually occur in this sequence;
            # this avoids resetting and copying all 4^k entries every time.
            counts = {}
            for start in range(len(seq) - k + 1):
                kmer = seq[start:start + k]
                counts[kmer] = counts.get(kmer, 0) + 1
            for kmer, count in counts.items():
                col = kmer_column.get(kmer)
                if col is not None:
                    k_array[row, col] = count
            row += 1

    # Calculate the occurrence frequency of each k-mer for a lncRNA sequence.
    # Rows whose total is zero are skipped so they stay 0 instead of NaN.
    totals = k_array.sum(axis=1, keepdims=True)
    np.divide(k_array, totals, out=k_array, where=totals > 0)
    return k_array

In [3]:
# Set the appropriate label for each class of lncRNA sequences
def getTarget():
    """Return the class label of each of the 644 lncRNA sequences.

    Row layout (inclusive indices) and assigned labels:
        0-153   nucleolus  -> 0
        154-570 cytoplasm  -> 1
        571-613 ribosome   -> 2
        614-643 exosome    -> 3

    Returns:
        An integer array of shape ``(644,)`` holding the labels above.
    """
    target = np.zeros((644,), dtype=int)
    # (first row, one-past-last row, label); rows 0-153 keep the initial 0
    label_ranges = (
        (154, 571, 1),
        (571, 614, 2),
        (614, 644, 3),
    )
    for first, stop, label in label_ranges:
        target[first:stop] = label
    return target

# Persist the label vector to target.npy
# (getTarget already returns a NumPy array, so no conversion is needed)
target = getTarget()
np.save("target.npy", target)

In [4]:
# Build the 8-mer frequency matrix (one row per sequence) and show its shape
k_mer = getArray(k=8)
print(k_mer.shape)

(644, 65536)


In [5]:
# Persist the 8-mer frequency matrix to k_mer.npy
# (k_mer is already a NumPy array, so it is saved as-is)
np.save("k_mer.npy", k_mer)