In [2]:
import numpy as np
from sklearn.svm import SVC
import time 

### Data Loading ###

In [3]:
AA_dict = {'A':0,'R':1,'N':2,'D':3,'C':4,'Q':5,'E':6,'G':7,'H':8,'I':9,'L':10,
           'K':11,'M':12,'F':13,'P':14,'S':15,'T':16,'W':17,'Y':18,'V':19,'!':20}
SS_dict = {'H':0,'E':1,'C':2}

def one_hot_encoding(seq,ref_dict,width=None):
    if width == None:
        width = len(ref_dict.keys())
    L = len(seq)
    result = np.zeros([L,width])
    for i in range(L):
        result[i][ref_dict[seq[i]]] = 1
    return result.reshape(-1)

In [4]:
def load_segments(seg_file):
    with open(seg_file,'r') as f:
        lines = f.readlines()
    seg_len = len(lines[0].split('\t')[0])
    data_size = len(lines)
    X = []
    Y = []
    for line in lines:
        line = line.strip('\n').split('\t')
        X.append(one_hot_encoding(line[0],AA_dict))
        Y.append(SS_dict[line[1]])
    return np.array(X),np.array(Y)

In [5]:
Data_path = '../Data/Datasets/For_Seq_SS/'

X_train_3, Y_train_3 = load_segments(Data_path + 'Segments_train_3')
X_train_5, Y_train_5 = load_segments(Data_path + 'Segments_train_5')
X_train_7, Y_train_7 = load_segments(Data_path + 'Segments_train_7')

X_vali_3, Y_vali_3 = load_segments(Data_path + 'Segments_vali_3')
X_vali_5, Y_vali_5 = load_segments(Data_path + 'Segments_vali_5')
X_vali_7, Y_vali_7 = load_segments(Data_path + 'Segments_vali_7')

In [6]:
def validation(model,X_vali,Y_vali):
    pred = model.predict(X_vali)
    return list(pred == Y_vali).count(True)/X_vali.shape[0]

### Linear Kernel ###

In [8]:
start = time.time()

clf_linear_3 = SVC(C=1,kernel='linear',gamma='auto')
clf_linear_3.fit(X_train_3, Y_train_3)

end = time.time()

vali_linear_3 = validation(clf_linear_3,X_vali_3,Y_vali_3)

print('Validation Accuraccy: %.3f'%vali_linear_3)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.539
Training Time: 1246.312 s


In [9]:
start = time.time()

clf_linear_5 = SVC(C=1,kernel='linear',gamma='auto')
clf_linear_5.fit(X_train_5, Y_train_5)

end = time.time()

vali_linear_5 = validation(clf_linear_5,X_vali_5,Y_vali_5)

print('Validation Accuraccy: %.3f'%vali_linear_5)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.585
Training Time: 2949.894 s


In [10]:
start = time.time()

clf_linear_7 = SVC(C=1,kernel='linear',gamma='auto')
clf_linear_7.fit(X_train_7, Y_train_7)

end = time.time()

vali_linear_7 = validation(clf_linear_7,X_vali_7,Y_vali_7)

print('Validation Accuraccy: %.3f'%vali_linear_7)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.616
Training Time: 5811.959 s


### RBF Kernel ###

In [11]:
start = time.time()

clf_rbf_3 = SVC(C=1,kernel='rbf',gamma='auto')
clf_rbf_3.fit(X_train_3, Y_train_3)

end = time.time()

vali_rbf_3 = validation(clf_rbf_3,X_vali_3,Y_vali_3)

print('Validation Accuraccy: %.3f'%vali_rbf_3)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.539
Training Time: 1511.938 s


In [12]:
start = time.time()

clf_rbf_5 = SVC(C=1,kernel='rbf',gamma='auto')
clf_rbf_5.fit(X_train_5, Y_train_5)

end = time.time()

vali_rbf_5 = validation(clf_rbf_5,X_vali_5,Y_vali_5)

print('Validation Accuraccy: %.3f'%vali_rbf_5)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.587
Training Time: 1806.145 s


In [13]:
start = time.time()

clf_rbf_7 = SVC(C=1,kernel='rbf',gamma='auto')
clf_rbf_7.fit(X_train_7, Y_train_7)

end = time.time()

vali_rbf_7 = validation(clf_rbf_7,X_vali_7,Y_vali_7)

print('Validation Accuraccy: %.3f'%vali_rbf_7)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.616
Training Time: 2248.263 s


In [14]:
Model_Path = '../Model/SVM/'

from joblib import dump, load

dump(clf_linear_3,Model_Path + 'SVM_clf_linear_3.joblib')
dump(clf_linear_5,Model_Path + 'SVM_clf_linear_5.joblib')
dump(clf_linear_7,Model_Path + 'SVM_clf_linear_7.joblib')

dump(clf_rbf_3,Model_Path + 'SVM_rbf_3.joblib')
dump(clf_rbf_5,Model_Path + 'SVM_rbf_5.joblib')
dump(clf_rbf_7,Model_Path + 'SVM_rbf_7.joblib')

['../Model/SVM/SVM_rbf_7.joblib']

In [15]:
SS_dict_reverse = {0:'H',1:'E',2:'C'}

def ss_predict(model,seq,padding_num,AA_dict,SS_dict_reverse):
    seq_pad = '!'*padding_num + seq + '!'*padding_num
    ss = ''
    for i in range(len(seq)):
        seg = seq_pad[i:padding_num*2+i+1]
        s_vec = one_hot_encoding(seg,AA_dict)
        ss += SS_dict_reverse[model.predict([s_vec])[0]]
    return ss

def Evaluate(model,seq_list,ss_list,padding_num,AA_dict,SS_dict_reverse):
    l = len(seq_list)
    if l != len(ss_list):
        print('Error!')
        return None
    else:
        resi_num = 0
        corr_num = 0
        for i in range(l):
            seq = seq_list[i]
            ss = ss_list[i]
            resi_num += len(seq)
            ss_pre = ss_predict(model,seq,padding_num,AA_dict,SS_dict_reverse)
            corr_num += list(np.array(list(ss_pre)) == np.array(list(ss))).count(True)
            return corr_num / resi_num

In [19]:
with open(Data_path + 'SeqSS_seq_test','r') as seq_f, open(Data_path + 'SeqSS_ss_test','r') as ss_f:
    seq_list = [s.strip('\n') for s in seq_f.readlines()]
    ss_list = [s.strip('\n') for s in ss_f.readlines()]
    Test_Accu_linear_7 = Evaluate(clf_linear_7,seq_list,ss_list,3,AA_dict,SS_dict_reverse)
    Test_Accu_rbf_7 = Evaluate(clf_rbf_7,seq_list,ss_list,3,AA_dict,SS_dict_reverse)
    
print("Test Accuracy (kernel = 'linear', seg_length = 7): %.3f"% Test_Accu_linear_7)
print("Test Accuracy (kernel = 'rbf', seg_length = 7): %.3f"% Test_Accu_rbf_7)

Test Accuracy (kernel = 'linear', seg_length = 7): 0.794
Test Accuracy (kernel = 'rbf', seg_length = 7): 0.802


In [23]:
print('Sequence Amount: %d'%len(seq_list))
print()

model_dict = {'linear':{3:clf_linear_3,5:clf_linear_5,7:clf_linear_7},
            'rbf':{3:clf_rbf_3,5:clf_rbf_5,7:clf_rbf_7}}
    
for ker in model_dict.keys():
    print("kernel = '%s':"%ker)
    for sl in model_dict[ker].keys():
        start = time.time()
        accu = Evaluate(model_dict[ker][sl],seq_list,ss_list,int((sl - 1)/2),AA_dict,SS_dict_reverse)
        end = time.time()
        print("\tTest Accuracy (seglength = %d): %.3f"%(sl,accu))
        print("Inference Time: %.3f s"%(end - start))
        print()

Sequence Amount: 1769

kernel = 'linear':
	Test Accuracy (seglength = 3): 0.722
Inference Time: 0.772 s

	Test Accuracy (seglength = 5): 0.762
Inference Time: 1.326 s

	Test Accuracy (seglength = 7): 0.794
Inference Time: 1.639 s

kernel = 'rbf':
	Test Accuracy (seglength = 3): 0.722
Inference Time: 1.043 s

	Test Accuracy (seglength = 5): 0.770
Inference Time: 1.984 s

	Test Accuracy (seglength = 7): 0.802
Inference Time: 2.010 s

