In [1]:
import numpy as np

# Read 8-mer nucleotide composition
k_mer = np.load("k_mer.npy")
# Read sequence order correlated factors
theta_list = np.load("theta_list.npy")
# Concatenate the 8-mer nucleotide composition and sequence order correlated factors
features = np.column_stack((k_mer,theta_list)) 
print(features.shape)

(644, 65550)


In [2]:
# Read labels
target = np.load("target.npy")
print(target.shape)

(644,)


In [3]:
import pandas as pd

# When the value of i-th feature for all lncRNA sequences are zero, we will delete this feature.
list_delete = []
for i in range(65550):
    if sum(features[:,i])==0:
        list_delete.append(i)
features_new = np.delete(features, list_delete, axis=1)
print(features_new.shape)

(644, 64723)


In [4]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

# Apply the ANOVA to obtain the optimal feature subset
selector = SelectKBest(f_classif, k=11220)
selector.fit(features_new, target)
features_selected = selector.transform(features_new)
print(features_selected.shape)

(644, 11220)


In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", MinMaxScaler()), 
                 ("svm",SVC(decision_function_shape="ovo",kernel="rbf"))])

param_grid = {"svm__C":[pow(2, k) for k in range(-5, 16)],
             "svm__gamma":[pow(2, k) for k in range(-15, 4)]}

# When we select 11220 features, the maximum overall accuracy can reach 88.82%
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(features_selected, target)
print(features_selected.shape)
# The best values for parameters C and gamma.
print(grid_search.best_params_)
print(grid_search.best_score_)

(644, 11220)
{'svm__C': 256, 'svm__gamma': 0.0001220703125}
0.8881987577639752
