In [10]:
import pickle
import numpy as np

### Data Cleaning ###

In [11]:
Data_Path = '../Datasets/'

# Read the three listings; they are indexed in parallel below (line i of each
# file is treated as one record).
with open(Data_Path + 'protein_list','r') as file_1:
    pro_lines = list(file_1)
with open(Data_Path + 'seq_list','r') as file_2:
    seq_lines = list(file_2)
with open(Data_Path + 'ss_list','r') as file_3:
    ss_lines = list(file_3)

seq_num = len(seq_lines)

# Running totals over the records that survive the filter.
seq_clear_num = 0
resi_clear_num = 0
SS_num_dict = {'H':0,'E':0,'C':0}

with open(Data_Path + 'protein_list_clear','w') as file_1, \
     open(Data_Path + 'seq_list_clear','w') as file_2, \
     open(Data_Path + 'ss_list_clear','w') as file_3:
    for i in range(seq_num):
        ss = ss_lines[i].strip('\n')

        # Drop every record whose secondary-structure string contains an 'M'.
        if 'M' in ss:
            continue

        seq_clear_num += 1
        resi_clear_num += len(seq_lines[i].strip('\n'))
        for ch in SS_num_dict:
            SS_num_dict[ch] += ss.count(ch)

        # Lines are written back verbatim (trailing newline included).
        file_1.write(pro_lines[i])
        file_2.write(seq_lines[i])
        file_3.write(ss_lines[i])
            

In [12]:
# Summary of the filtered dataset produced by the cleaning cell.
print('Sequence amount after data clearance: %d'%seq_clear_num)
print('Residue amount after data clearance: %d'%resi_clear_num)
print()
print('Residue amount for different secondary structure:')
for ss_label, ss_total in SS_num_dict.items():
    print('%s: %d'%(ss_label,ss_total))

Sequence amount after data clearance: 8845
Residue amount after data clearance: 1384022

Residue amount for different secondary structure:
H: 507813
E: 313871
C: 562338


In [15]:
# Residue symbol -> one-hot column index. All residue codes are upper-case
# one-letter symbols; valine must be 'V' (it was previously keyed as lower-case
# 'v', which made any sequence containing 'V' fail with KeyError).
# NOTE(review): '!' (index 20) is presumably a padding/unknown marker -- confirm.
AA_dict = {'A':0,'R':1,'N':2,'D':3,'C':4,'Q':5,'E':6,'G':7,'H':8,'I':9,'L':10,
           'K':11,'M':12,'F':13,'P':14,'S':15,'T':16,'W':17,'Y':18,'V':19,'!':20}
# Secondary-structure label -> one-hot column index.
SS_dict = {'H':0,'E':1,'C':2}

def one_hot_encoding(seq,ref_dict,width=None):
    """One-hot encode a sequence of symbols.

    Args:
        seq: Sized, iterable sequence of symbols (e.g. a string).
        ref_dict: Mapping from each symbol to its column index.
        width: Number of columns in the result; defaults to ``len(ref_dict)``.

    Returns:
        A ``numpy`` array of shape ``(len(seq), width)`` filled with zeros,
        with a 1 in row ``i`` at column ``ref_dict[seq[i]]``.

    Raises:
        KeyError: if a symbol of ``seq`` is not a key of ``ref_dict``.
        IndexError: if a mapped index does not fit into ``width`` columns.
    """
    # Identity test: `== None` can be overridden by an object's __eq__.
    if width is None:
        width = len(ref_dict)
    result = np.zeros([len(seq),width])
    for row, symbol in enumerate(seq):
        result[row, ref_dict[symbol]] = 1
    return result

### Datasets for Seq-SS models: ###

In [32]:
# Re-load the filtered listings written by the cleaning step.
with open(Data_Path + 'protein_list_clear','r') as file_pro:
    pro_clear_lines = list(file_pro)
with open(Data_Path + 'seq_list_clear','r') as file_seq:
    seq_clear_lines = list(file_seq)
with open(Data_Path + 'ss_list_clear','r') as file_ss:
    ss_clear_lines = list(file_ss)

# protein id -> [fold, sequence, secondary structure].
# Each protein line is tab-separated: first field is the id, second the fold.
pro_seq_ss_dict = {}
for i, pro_line in enumerate(pro_clear_lines):
    pro_fields = pro_line.strip('\n').split('\t')
    pro = pro_fields[0]
    fold = pro_fields[1]
    seq = seq_clear_lines[i].strip('\n')
    ss = ss_clear_lines[i].strip('\n')
    pro_seq_ss_dict[pro] = [fold,seq,ss]

In [53]:
import random

SeqSS_pro_list = list(pro_seq_ss_dict.keys())
SeqSS_seq_num = len(SeqSS_pro_list)

# Split sizes: 'train_all' is 80% of the proteins and is itself divided into
# 'train' (70% of all proteins) and 'vali' (the rest of 'train_all'); the
# remaining proteins form 'test'.
SeqSS_train_num_all = int(SeqSS_seq_num*0.8)
SeqSS_train_num = int(SeqSS_seq_num*0.7)
SeqSS_vali_num = SeqSS_train_num_all - SeqSS_train_num
SeqSS_test_num = SeqSS_seq_num - SeqSS_train_num_all

# Per-split residue counts for each secondary-structure label.
Resi_SS_dict = {'train_all':{'H':0,'E':0,'C':0},
                'train':{'H':0,'E':0,'C':0},
                'vali':{'H':0,'E':0,'C':0},
                'test':{'H':0,'E':0,'C':0}}

SeqSS_index_list = list(range(SeqSS_seq_num))
shuffle_SeqSS_index_list = SeqSS_index_list[:]
# NOTE(review): no seed is set, so every run yields a different split; call
# np.random.seed(...) beforehand if the split must be reproducible.
np.random.shuffle(shuffle_SeqSS_index_list)

SeqSS_index_sample_all = shuffle_SeqSS_index_list[:SeqSS_train_num_all]
SeqSS_index_sample_train = SeqSS_index_sample_all[:SeqSS_train_num]
SeqSS_index_sample_vali = SeqSS_index_sample_all[SeqSS_train_num:]

SeqSS_train_list_all = [SeqSS_pro_list[i] for i in SeqSS_index_sample_all]
SeqSS_train_list = [SeqSS_pro_list[i] for i in SeqSS_index_sample_train]
SeqSS_vali_list = [SeqSS_pro_list[i] for i in SeqSS_index_sample_vali]
# Set membership keeps this linear; the test proteins stay in their original
# (unshuffled) order, exactly as before.
SeqSS_train_index_set = set(SeqSS_index_sample_all)
SeqSS_test_list = [SeqSS_pro_list[i] for i in SeqSS_index_list if i not in SeqSS_train_index_set]

SeqSS_Data_Path = '../Datasets/For_Seq_SS/'

# Every split is written the same way: one sequence file and one
# secondary-structure file, one protein per line, while tallying residues.
SeqSS_split_lists = [('train_all', SeqSS_train_list_all),
                     ('train', SeqSS_train_list),
                     ('vali', SeqSS_vali_list),
                     ('test', SeqSS_test_list)]

for split_name, split_pro_list in SeqSS_split_lists:
    split_seq_path = SeqSS_Data_Path + 'SeqSS_seq_' + split_name
    split_ss_path = SeqSS_Data_Path + 'SeqSS_ss_' + split_name
    # `with` guarantees both files are closed even if a write fails.
    with open(split_seq_path,'w') as split_seq_file, open(split_ss_path,'w') as split_ss_file:
        for p in split_pro_list:
            p_seq = pro_seq_ss_dict[p][1]
            p_ss = pro_seq_ss_dict[p][2]
            split_seq_file.write(p_seq + '\n')
            split_ss_file.write(p_ss + '\n')

            for ss_type in ['H','E','C']:
                Resi_SS_dict[split_name][ss_type] += p_ss.count(ss_type)

print('Sequences amount in all: %d'%SeqSS_seq_num)
print('Training sequences amount: %d'%SeqSS_train_num)
print('Validation sequences amount: %d'%SeqSS_vali_num)
print('Final training sequences amount: %d'%SeqSS_train_num_all)
print('Test sequences amount: %d'%SeqSS_test_num)

Sequences amount in all: 8845
Training sequences amount: 6191
Validation sequences amount: 885
Final training sequences amount: 7076
Test sequences amount: 1769


In [57]:
# Report, for every split, the total residue count and the per-label counts.
for split_key, label_counts in Resi_SS_dict.items():
    print(split_key)
    print('Residue amount: %d'%sum(label_counts.values()))
    print(' '.join('%s: %d'%(label,label_counts[label]) for label in ('H','E','C')))
    print()

train_all
Residue amount: 1114086
H: 410590 E: 251995 C: 451501

train
Residue amount: 975134
H: 359465 E: 221108 C: 394561

vali
Residue amount: 138952
H: 51125 E: 30887 C: 56940

test
Residue amount: 269936
H: 97223 E: 61876 C: 110837



In [None]:
def SS_segment(seq,ss,rate=0.1):
    """Placeholder for an unfinished function.

    The original cell contained only the signature (no body), which is a
    syntax error and prevents the notebook from running top to bottom.

    NOTE(review): the intended behavior is not shown in this notebook;
    presumably it segments `seq`/`ss` in some way controlled by `rate` --
    confirm with the author before implementing.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    raise NotImplementedError('SS_segment is not implemented yet')

### Datasets for Seq-Fold models: ###

In [47]:
import random

# NOTE(security): pickle.load can execute arbitrary code; only load this file
# if it comes from a trusted source.
with open(Data_Path + 'fold_protein_dict_reduced.pickle', 'rb') as handle:
    fold_protein_dict_reduced = pickle.load(handle)

SF_seq_num = 0
SF_train_num = 0
SF_test_num = 0

# fold -> list of protein ids assigned to the train / test split.
SF_train_dict = {}
SF_test_dict = {}

SF_Data_Path = '../Datasets/For_Seq_Stru/'

# `with` guarantees all six output files are closed even if a write fails.
with open(SF_Data_Path + 'SF_fold_train','w') as SF_train_fold, \
     open(SF_Data_Path + 'SF_seq_train','w') as SF_train_seq, \
     open(SF_Data_Path + 'SF_ss_train','w') as SF_train_ss, \
     open(SF_Data_Path + 'SF_fold_test','w') as SF_test_fold, \
     open(SF_Data_Path + 'SF_seq_test','w') as SF_test_seq, \
     open(SF_Data_Path + 'SF_ss_test','w') as SF_test_ss:

    for f in fold_protein_dict_reduced.keys():
        # Keep only proteins that survived the data-cleaning step.
        f_pro_list = [p for p in fold_protein_dict_reduced[f] if p in pro_seq_ss_dict]
        f_pro_num = len(f_pro_list)

        # Folds with fewer than 3 usable proteins are skipped entirely.
        if f_pro_num < 3:
            continue

        # 80% of each fold's proteins (rounded down) go to training.
        select_train_num = int(f_pro_num*0.8)

        SF_seq_num += f_pro_num
        SF_train_num += select_train_num
        SF_test_num += f_pro_num - select_train_num

        SF_index_list = list(range(f_pro_num))
        shuffle_SF_index_list = SF_index_list[:]
        # NOTE(review): no seed is set, so the per-fold split differs between
        # runs; seed numpy beforehand if reproducibility is required.
        np.random.shuffle(shuffle_SF_index_list)

        index_sample = shuffle_SF_index_list[:select_train_num]
        index_sample_set = set(index_sample)
        train_pro = [f_pro_list[i] for i in index_sample]
        # Test proteins keep their original (unshuffled) order.
        test_pro = [f_pro_list[i] for i in SF_index_list if i not in index_sample_set]

        SF_train_dict[f] = train_pro
        SF_test_dict[f] = test_pro

        for p in train_pro:
            SF_train_fold.write(pro_seq_ss_dict[p][0] + '\n')
            SF_train_seq.write(pro_seq_ss_dict[p][1] + '\n')
            SF_train_ss.write(pro_seq_ss_dict[p][2] + '\n')

        for p in test_pro:
            SF_test_fold.write(pro_seq_ss_dict[p][0] + '\n')
            SF_test_seq.write(pro_seq_ss_dict[p][1] + '\n')
            SF_test_ss.write(pro_seq_ss_dict[p][2] + '\n')

print('Folds amount in all: %d'%len(SF_train_dict.keys()))
print('Sequences amount in all: %d'%SF_seq_num)
print('Training sequences amount: %d'%SF_train_num)
print('Test sequences amount: %d'%SF_test_num)

Folds amount in all: 420
Sequences amount in all: 8132
Training sequences amount: 6354
Test sequences amount: 1778
