In [8]:
from sklearn.neural_network import MLPClassifier
import numpy as np
import time

### Data Loading ###

In [2]:
AA_dict = {'A':0,'R':1,'N':2,'D':3,'C':4,'Q':5,'E':6,'G':7,'H':8,'I':9,'L':10,
           'K':11,'M':12,'F':13,'P':14,'S':15,'T':16,'W':17,'Y':18,'V':19,'!':20}
SS_dict = {'H':0,'E':1,'C':2}

def one_hot_encoding(seq,ref_dict,width=None):
    if width == None:
        width = len(ref_dict.keys())
    L = len(seq)
    result = np.zeros([L,width])
    for i in range(L):
        result[i][ref_dict[seq[i]]] = 1
    return result.reshape(-1)

def load_segments(seg_file):
    with open(seg_file,'r') as f:
        lines = f.readlines()
    seg_len = len(lines[0].split('\t')[0])
    data_size = len(lines)
    X = []
    Y = []
    for line in lines:
        line = line.strip('\n').split('\t')
        X.append(one_hot_encoding(line[0],AA_dict))
        Y.append(SS_dict[line[1]])
    return np.array(X),np.array(Y)

In [3]:
Data_path = '../Data/Datasets/For_Seq_SS/'

X_train_3, Y_train_3 = load_segments(Data_path + 'Segments_train_3')
X_train_5, Y_train_5 = load_segments(Data_path + 'Segments_train_5')
X_train_7, Y_train_7 = load_segments(Data_path + 'Segments_train_7')

X_vali_3, Y_vali_3 = load_segments(Data_path + 'Segments_vali_3')
X_vali_5, Y_vali_5 = load_segments(Data_path + 'Segments_vali_5')
X_vali_7, Y_vali_7 = load_segments(Data_path + 'Segments_vali_7')

In [4]:
def validation(model,X_vali,Y_vali):
    pred = model.predict(X_vali)
    return list(pred == Y_vali).count(True)/X_vali.shape[0]

In [10]:
start = time.time()

clf_3 = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 100, 100), random_state=1)
clf_3.fit(X_train_3,Y_train_3)

end = time.time()

vali_3 = validation(clf_3,X_vali_3,Y_vali_3)

print('Validation Accuraccy: %.3f'%vali_3)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.553
Training Time: 221.544 s


In [11]:
start = time.time()

clf_5 = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 100, 100), random_state=1)
clf_5.fit(X_train_5,Y_train_5)

end = time.time()

vali_5 = validation(clf_5,X_vali_5,Y_vali_5)

print('Validation Accuraccy: %.3f'%vali_5)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.591
Training Time: 226.025 s


In [12]:
start = time.time()

clf_7 = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100, 100, 100), random_state=1)
clf_7.fit(X_train_7,Y_train_7)

end = time.time()

vali_7 = validation(clf_7,X_vali_7,Y_vali_7)

print('Validation Accuraccy: %.3f'%vali_7)
print('Training Time: %.3f s'%(end - start))

Validation Accuraccy: 0.611
Training Time: 230.125 s


In [13]:
Model_Path = '../Model/FCNN/'

from joblib import dump, load

dump(clf_3,Model_Path + 'FCNN_3.joblib')
dump(clf_5,Model_Path + 'FCNN_5.joblib')
dump(clf_7,Model_Path + 'FCNN_7.joblib')

['../Model/FCNN/FCNN_7.joblib']

In [33]:
SS_dict_reverse = {0:'H',1:'E',2:'C'}

def ss_predict(model,seq,padding_num,AA_dict,SS_dict_reverse):
    seq_pad = '!'*padding_num + seq + '!'*padding_num
    ss = ''
    for i in range(len(seq)):
        seg = seq_pad[i:padding_num*2+i+1]
        s_vec = one_hot_encoding(seg,AA_dict)
        ss += SS_dict_reverse[model.predict([s_vec])[0]]
    return ss

In [48]:
def Evaluate(model,seq_list,ss_list,padding_num,AA_dict,SS_dict_reverse):
    l = len(seq_list)
    if l != len(ss_list):
        print('Error!')
        return None
    else:
        resi_num = 0
        corr_num = 0
        for i in range(l):
            seq = seq_list[i]
            ss = ss_list[i]
            resi_num += len(seq)
            ss_pre = ss_predict(model,seq,padding_num,AA_dict,SS_dict_reverse)
            corr_num += list(np.array(list(ss_pre)) == np.array(list(ss))).count(True)
            return corr_num / resi_num

In [51]:
with open(Data_path + 'SeqSS_seq_test','r') as seq_f, open(Data_path + 'SeqSS_ss_test','r') as ss_f:
    seq_list = [s.strip('\n') for s in seq_f.readlines()]
    ss_list = [s.strip('\n') for s in ss_f.readlines()]
    Test_Accu = Evaluate(clf_7,seq_list,ss_list,3,AA_dict,SS_dict_reverse)
    
print('Test Accuracy: %.3f'% Test_Accu)

Test Accuracy: 0.833


In [55]:
print('Sequence Amount: %d'%len(seq_list))
print()

model_dict = {3:clf_3,5:clf_5,7:clf_7}
    

for sl in model_dict.keys():
    start = time.time()
    accu = Evaluate(model_dict[sl],seq_list,ss_list,int((sl - 1)/2),AA_dict,SS_dict_reverse)
    end = time.time()
    print("\tTest Accuracy (seglength = %d): %.3f"%(sl,accu))
    print("Inference Time: %.3f s"%(end - start))
    print()

Sequence Amount: 1769

	Test Accuracy (seglength = 3): 0.706
Inference Time: 0.045 s

	Test Accuracy (seglength = 5): 0.738
Inference Time: 0.032 s

	Test Accuracy (seglength = 7): 0.833
Inference Time: 0.057 s

