In [1]:
import numpy as np

# Read physicochemical properties
def loadTXTfile(path="physichemical.csv"):
    """Load a comma-separated table of numbers as a NumPy float array.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to "physichemical.csv" in the current
        working directory.

    Returns
    -------
    numpy.ndarray
        The parsed values with dtype float64.
    """
    # The `np.float` alias was removed in NumPy 1.24; the built-in `float`
    # selects the same float64 dtype and works on every NumPy version.
    return np.loadtxt(path, dtype=float, delimiter=",")

# Load the property table and show its dimensions
physichemical = loadTXTfile()
print(np.shape(physichemical))

(10, 16)


In [2]:
# Perform a standard conversion: every row of the property table is shifted to
# zero mean and scaled by its population standard deviation (NumPy's default,
# ddof=0). The statistics are computed once per row and broadcast over the
# columns, so the result follows the shape of `physichemical` instead of
# relying on a hard-coded (10, 16).
row_mean = physichemical.mean(axis=1, keepdims=True)
row_std = physichemical.std(axis=1, keepdims=True)
physichemical_stand = (physichemical - row_mean) / row_std

In [3]:
# The 16 dinucleotides over the alphabet A, C, G, T, ordered "AA", "AC", "AG",
# "AT", "CA", ..., "TT".
# NOTE: this name shadows the built-in `list`. It is kept so that code which
# refers to the table under this name keeps working; as a consequence,
# `list(...)` cannot be called as a constructor once this cell has run.
list = [first + second for first in "ACGT" for second in "ACGT"]

In [4]:
# Column of each dinucleotide in the property table, following the order
# "AA", "AC", "AG", "AT", "CA", ..., "TT". A dict gives a constant-time lookup
# and keeps getH independent of any module-level name.
_DINUCLEOTIDE_COLUMN = {
    first + second: 4 * hi + lo
    for hi, first in enumerate("ACGT")
    for lo, second in enumerate("ACGT")
}


# Calculate the Eq.8 in our paper, where seq represents the lncRNA sequence
def getH(seq, j, physichemical_stand, i):
    """Mean squared property difference between two dinucleotides of `seq`.

    The first dinucleotide is seq[i:i+2] and the second is seq[i+j:i+j+2].
    For every row of `physichemical_stand` the values in the two
    dinucleotides' columns are subtracted and squared; the squares are
    averaged over all rows.

    Parameters
    ----------
    seq : str
        Sequence made of the upper-case letters A, C, G, T.
    j : int
        Offset between the two dinucleotides.
    physichemical_stand : numpy.ndarray
        2-D array indexed as [property, dinucleotide column].
    i : int
        Start position of the first dinucleotide.

    Returns
    -------
    numpy.float64
        The averaged squared difference.

    Raises
    ------
    ValueError
        If either slice is not one of the 16 known dinucleotides (for
        example it contains another character or runs past the end of `seq`).
    """
    first_dinu = seq[i:i + 2]
    second_dinu = seq[i + j:i + j + 2]
    try:
        first_col = _DINUCLEOTIDE_COLUMN[first_dinu]
        second_col = _DINUCLEOTIDE_COLUMN[second_dinu]
    except KeyError as err:
        # Keep the exception type that a failed list lookup would raise.
        raise ValueError(
            "unsupported dinucleotide %r at position %d or %d of the sequence"
            % (err.args[0], i, i + j)
        ) from None
    # One vectorized pass over all property rows instead of a Python loop.
    diff = physichemical_stand[:, first_col] - physichemical_stand[:, second_col]
    return np.mean(diff ** 2)

# Get the sequence order correlated factors
def getThetaLambda(seq, Lambda, physichemical_stand):
    """Average getH over every dinucleotide pair that is `Lambda` apart.

    Parameters
    ----------
    seq : str
        The sequence to analyse.
    Lambda : int
        Offset between the two dinucleotides of each pair.
    physichemical_stand : numpy.ndarray
        Standardized property table, passed through to getH.

    Returns
    -------
    The mean of getH(seq, Lambda, physichemical_stand, i) for
    i in range(len(seq) - 1 - Lambda).

    Raises
    ------
    ValueError
        If the sequence holds no dinucleotide pair at this offset, i.e.
        len(seq) - 1 - Lambda <= 0. Without this check a pair count of zero
        fails with ZeroDivisionError and a negative one silently yields -0.0.
    """
    pair_count = len(seq) - 1 - Lambda
    if pair_count <= 0:
        raise ValueError(
            "sequence of length %d is too short for Lambda=%d; "
            "at least %d characters are required"
            % (len(seq), Lambda, Lambda + 2)
        )
    total = 0
    for start in range(pair_count):
        total = total + getH(seq, Lambda, physichemical_stand, start)
    return total / pair_count

In [5]:
# Accumulates every line read from the four data files
all_lncRNA_sequences = []

# Each file is read in full. Its lines stay available under a per-file name
# and are appended to all_lncRNA_sequences in the order nucleolus, cytoplasm,
# ribosome, exosome.
with open("Data/Nucleolus.txt") as file_object:
    nucleolus = file_object.readlines()
all_lncRNA_sequences += nucleolus

with open("Data/Cytoplasm.txt") as file_object:
    cytoplasm = file_object.readlines()
all_lncRNA_sequences += cytoplasm

with open("Data/Ribosome.txt") as file_object:
    ribosome = file_object.readlines()
all_lncRNA_sequences += ribosome

with open("Data/Exosome.txt") as file_object:
    exosome = file_object.readlines()
all_lncRNA_sequences += exosome

In [6]:
# Calculate the Eq.7 in our paper, where lambda_scale represents the scale of the lambda
def getThetaList(all_lncRNA_sequences, lambda_scale, physichemical_stand):
    """Build the table of sequence-order correlated factors.

    Lines starting with ">" are treated as headers and skipped, and so are
    lines that are empty after stripping trailing whitespace. Every other
    line is taken as one complete sequence, so a record that is wrapped over
    several lines would be handled as several sequences.

    Parameters
    ----------
    all_lncRNA_sequences : iterable of str
        Header and sequence lines.
    lambda_scale : int
        Largest offset; factors are computed for Lambda = 1 .. lambda_scale.
    physichemical_stand : numpy.ndarray
        Standardized property table, passed through to getThetaLambda.

    Returns
    -------
    numpy.ndarray
        One row per sequence line and one column per Lambda value.
    """
    theta_list = []
    for line in all_lncRNA_sequences:
        if line.startswith(">"):
            continue
        seq = line.rstrip()
        if not seq:
            # A blank line carries no sequence; skipping it avoids emitting
            # a meaningless row for it.
            continue
        theta_list.append([
            getThetaLambda(seq, Lambda, physichemical_stand)
            for Lambda in range(1, lambda_scale + 1)
        ])
    return np.asarray(theta_list)

In [7]:
# Compute the sequence order correlated factors for Lambda = 1 .. 14
# (one column per Lambda value) and show the resulting dimensions
theta_list = getThetaList(
    all_lncRNA_sequences,
    lambda_scale=14,
    physichemical_stand=physichemical_stand,
)
print(np.shape(theta_list))

(644, 14)


In [8]:
# Show the computed factors (NumPy abbreviates large arrays with "...")
print(theta_list)

[[1.55998056 1.72068744 1.69772631 ... 1.630961   1.68333032 1.7408159 ]
 [1.71094442 1.87349621 1.75910365 ... 1.82335529 1.81034405 1.78741049]
 [1.66686377 1.8478084  1.76740672 ... 1.73820501 1.7512723  1.77451019]
 ...
 [2.01129764 2.01496932 1.90431592 ... 1.93906265 1.93308006 1.94249633]
 [1.70317293 1.85484811 1.61644431 ... 1.78508198 1.89115889 1.98460661]
 [1.86769445 2.01312376 1.87986256 ... 1.88144703 1.94246718 1.9155969 ]]


In [9]:
# Print the dimensions of theta_list again (rows, columns)
print(theta_list.shape)

(644, 14)


In [10]:
# Store the sequence order correlated factors into theta_list.npy
# (np.asarray is a safeguard only: getThetaList already returns an ndarray)
theta_list = np.asarray(theta_list)
np.save("theta_list.npy", theta_list)