In [1]:
import numpy as np

# Load the 8-mer nucleotide composition matrix
k_mer = np.load("k_mer.npy")
# Load the sequence-order correlated factors
theta_list = np.load("theta_list.npy")
# Place both feature groups side by side: k-mer columns first, correlated factors after them
features = np.column_stack([k_mer, theta_list])
print(np.shape(features))

(644, 65550)


In [2]:
# Load the class label array
target = np.load("target.npy")
print(np.shape(target))

(644,)


In [3]:
# Delete every feature whose value is zero for all lncRNA sequences.
# Only the first 65536 columns are inspected (presumably the 4**8 possible 8-mers;
# the columns after them are always kept).
# The test is "no non-zero entry in the column" rather than "column sum == 0", so a
# column whose positive and negative values cancel out is not removed by mistake.
list_delete = np.flatnonzero(~features[:, :65536].any(axis=0)).tolist()
features_new = np.delete(features, list_delete, axis=1)
print(features_new.shape)

(644, 64723)


In [4]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

# Score every feature with the ANOVA F-test and keep the 11220 best-scoring ones.
# NOTE(review): the selector is fitted on all samples (labels included) before the
# leave-one-out evaluation further down, so each held-out sample has already
# influenced which features are kept -- confirm this is intended.
selector = SelectKBest(score_func=f_classif, k=11220).fit(features_new, target)
features_selected = selector.transform(features_new)
print(features_selected.shape)

(644, 11220)


In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import GridSearchCV

# Min-max scale the features, then classify with an RBF-kernel SVM.
# C = 256 is 2**8 and gamma = 0.0001220703125 is 2**-13.
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svm", SVC(kernel="rbf", C=256, gamma=0.0001220703125,
                decision_function_shape="ovo")),
])

# Cross-validation splitter used for the evaluation
loo = LeaveOneOut()

In [6]:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

# Running 4x4 confusion matrix: row = true label, column = predicted label
confusion_matrix_myself = np.zeros(shape=(4, 4))

def coufusion_matrix_function(y_true, y_pred):
    """Tally (true, predicted) label pairs into ``confusion_matrix_myself``.

    Each position ``k`` adds one to the cell at row ``y_true[k]`` and column
    ``y_pred[k]`` of the module-level matrix, so the labels must be valid
    integer indices into it. The matrix is updated in place and keeps
    accumulating across calls.

    Returns:
        Always 1; the value carries no information and only satisfies the
        scorer interface this function is wrapped in.
    """
    for position, actual in enumerate(y_true):
        confusion_matrix_myself[actual, y_pred[position]] += 1
    return 1
# Wrap the tallying function as a scorer and evaluate the pipeline once per
# leave-one-out split. Every returned score is the constant 1; the call is made
# for its side effect of filling `confusion_matrix_myself`.
# NOTE(review): this relies on the scorer running in this process; it would
# presumably stop working if the cross-validation were parallelised -- verify.
confusion_matrix_score = make_scorer(coufusion_matrix_function, greater_is_better=True)

confusion_matrix_scores = cross_val_score(
    pipe,
    features_selected,
    target,
    cv=loo,
    scoring=confusion_matrix_score,
)

In [7]:
# Show the accumulated counts: rows are true labels, columns are predicted labels
print(confusion_matrix_myself)

[[127.  27.   0.   0.]
 [  0. 417.   0.   0.]
 [  8.   6.  18.  11.]
 [  6.   3.   1.  20.]]


In [8]:
# Metric: sensitivity of a single class
def sensitivity_myself(confusion_matrix, i):
    """Return the sensitivity Sn(i) of class ``i``.

    Sn(i) = 1 - (row_total - diagonal) / row_total, where ``row_total`` is the
    sum of row ``i`` and ``diagonal`` is ``confusion_matrix[i, i]``. With rows
    indexing the true class, this is the share of class-``i`` samples that
    were assigned to class ``i``.

    Args:
        confusion_matrix: 2-D NumPy array of counts.
        i: index of the class to evaluate.
    """
    row_total = confusion_matrix[i].sum()
    missed = row_total - confusion_matrix[i, i]
    return 1 - float(missed) / row_total

In [9]:
# Calculate the Sn(i) (sensitivity) of each of the four classes
for class_index in range(4):
    print(sensitivity_myself(confusion_matrix_myself, class_index))

0.8246753246753247
1.0
0.41860465116279066
0.6666666666666667


In [10]:
# Metric: specificity of a single class
def specificity_myself(confusion_matrix, i):
    """Return the specificity Sp(i) of class ``i``.

    Sp(i) = 1 - (column_total - diagonal) / (grand_total - row_total), where
    the column and row totals are taken over column ``i`` and row ``i``. With
    rows indexing the true class, the numerator counts samples of other
    classes assigned to class ``i`` and the denominator counts all samples of
    other classes.

    Args:
        confusion_matrix: 2-D NumPy array of counts.
        i: index of the class to evaluate.
    """
    grand_total = confusion_matrix.sum()
    spurious = confusion_matrix[:, i].sum() - confusion_matrix[i, i]
    outside_row = grand_total - confusion_matrix[i].sum()
    return 1 - float(spurious) / outside_row

In [11]:
# Calculate the Sp(i) (specificity) of each of the four classes
for class_index in range(4):
    print(specificity_myself(confusion_matrix_myself, class_index))

0.9714285714285714
0.8414096916299559
0.9983361064891847
0.9820846905537459


In [12]:
# Calculate the metric: Matthew's correlation coefficient
import math
def mcc_myself(confusion_matrix, i):
    """Return the Matthews correlation coefficient MCC(i) of class ``i``.

    Using P = sum of row ``i``, N = grand total minus P, FN = P minus the
    diagonal entry and FP = sum of column ``i`` minus the diagonal entry:

        MCC(i) = (1 - (FN/P + FP/N))
                 / sqrt((1 + (FP - FN)/P) * (1 + (FN - FP)/N))

    No guard is applied for zero denominators (for example an empty row).

    Args:
        confusion_matrix: 2-D NumPy array of counts.
        i: index of the class to evaluate.
    """
    grand_total = confusion_matrix.sum()
    row_total = confusion_matrix[i].sum()
    diagonal = confusion_matrix[i, i]
    missed = row_total - diagonal
    spurious = confusion_matrix[:, i].sum() - diagonal
    outside_row = grand_total - row_total
    error_rate = float(missed)/row_total + float(spurious)/outside_row
    row_term = 1 + float(spurious - missed)/row_total
    rest_term = 1 + float(missed - spurious)/outside_row
    return float(1 - error_rate)/math.sqrt(row_term*rest_term)

In [13]:
# Calculate the MCC(i) of each of the four classes
for class_index in range(4):
    print(mcc_myself(confusion_matrix_myself, class_index))

0.8211728856111272
0.8800810736907339
0.6150763179827043
0.638722195631508


In [14]:
# Calculate the metric: overall accuracy
def accuracy_myself(confusion_matrix):
    """Return the overall accuracy: diagonal sum divided by the matrix total.

    The number of classes and the number of samples are both read from the
    matrix itself, so the function works for any square confusion matrix and
    not only for a 4-class problem with 644 samples.

    Args:
        confusion_matrix: 2-D array-like of counts.

    Returns:
        The accuracy as a Python float, or 0.0 when the matrix holds no
        samples at all.
    """
    matrix = np.asarray(confusion_matrix, dtype=float)
    total = matrix.sum()
    if total == 0:
        # An all-zero matrix has no correct predictions to report.
        return 0.0
    return float(np.trace(matrix)) / float(total)

In [15]:
# Calculate the OA (overall accuracy) from the accumulated confusion matrix
print(accuracy_myself(confusion_matrix_myself))

0.9037267080745341
