In [1]:
import pandas as pd
import yaml
import numpy as np

# Hyper-parameter file: supplies, among other settings, the root data directory.
with open('../../hyperparams.yml') as hyperparams_file:
    configs = yaml.safe_load(hyperparams_file)

# Dataset description file: read later for the amino-acid vocabulary.
with open('../../data/dataset_config.yaml') as dataset_config_file:
    dataset_configs = yaml.safe_load(dataset_config_file)

# File name of the secondary-structure table and the directory it lives under.
# data_dir is concatenated directly with relative paths below, so it is
# expected to end with a path separator.
dataset_name = 'secondary_structures.csv'
data_dir = configs['data_dir']

In [2]:
# Secondary-structure annotations (one row per chain, see the preview below).
secondary_path = data_dir+'raw/'+dataset_name
dataset_secondary = pd.read_csv(secondary_path)

# The three splits, each read from its own CSV under raw/csv/.
train_dataset = pd.read_csv(data_dir+'raw/csv/training_30.csv')
validation_dataset = pd.read_csv(data_dir+'raw/csv/validation.csv')
test_dataset = pd.read_csv(data_dir+'raw/csv/testing.csv')

# Keep this as the last expression so the notebook renders the preview.
dataset_secondary.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
0,1A30,C,EDL,CBC,CEC,3,False
1,1B05,B,KCK,CBC,CEC,3,False
2,1B0H,B,KAK,CBC,CEC,3,False
3,1B1H,B,KFK,CBC,CEC,3,False
4,1B2H,B,KAK,CBC,CEC,3,False


In [3]:
# Width of every encoded row. prepareInput keeps only sequences of at most
# max_length-2 residues, which leaves room for the <BOS> and <EOS> tokens that
# encodeInputs adds before padding each row up to max_length.
max_length=512


In [4]:
def prepareInput(dataset, dataset_secondary, max_len=None):
    """Attach secondary-structure labels to a split and keep the encodable rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Split table; must provide the ``pdb_id`` and ``seqs`` columns.
    dataset_secondary : pandas.DataFrame
        Annotation table; must provide ``pdb_id``, ``seq``, ``sst8``, ``sst3``
        and ``len``.
    max_len : int, optional
        Total encoded row width. Sequences longer than ``max_len - 2`` are
        dropped (two positions are reserved for the <BOS>/<EOS> tokens).
        Defaults to the notebook-level ``max_length``.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdb_id``, ``seqs``, ``sst8``, ``sst3``; one row per distinct
        sequence, ordered by ``seqs`` (a side effect of the group-by).
    """
    if max_len is None:
        # Resolved at call time so a later change to the notebook constant is honoured.
        max_len = max_length

    # Left merge followed by dropna(): rows of `dataset` with no annotation get
    # NaN labels and are removed. Note that dropna() looks at every column, so a
    # row with a missing value in any other column is removed as well.
    merged = dataset.merge(
        dataset_secondary,
        left_on=['pdb_id', 'seqs'],
        right_on=['pdb_id', 'seq'],
        how='left',
    ).dropna()

    # Several chains can share the same sequence; keep the first one of each.
    new_dataset = merged.groupby('seqs', as_index=False).first()

    new_dataset = new_dataset[new_dataset['len'] <= (max_len - 2)]
    return new_dataset[['pdb_id', 'seqs', 'sst8', 'sst3']]

def encodeInputs(dataset, aa_voc, max_len=None):
    """Encode amino-acid strings as fixed-width rows of integer ids.

    Each row is ``<BOS>``, one id per residue, ``<EOS>``, then ``<PAD>`` up to
    ``max_len`` entries.

    Parameters
    ----------
    dataset : iterable of str
        Amino-acid sequences.
    aa_voc : dict
        Maps every residue letter and the ``<BOS>``, ``<EOS>`` and ``<PAD>``
        tokens to an integer id.
    max_len : int, optional
        Row width. Defaults to the notebook-level ``max_length``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of sequences, max_len)``; its shape is also
        printed.

    Raises
    ------
    ValueError
        If a sequence plus its two boundary tokens does not fit in ``max_len``.
        (The previous padding loop never terminated in that case.)
    KeyError
        If a residue is missing from ``aa_voc``.
    """
    if max_len is None:
        max_len = max_length

    bos_id = aa_voc['<BOS>']
    eos_id = aa_voc['<EOS>']
    pad_id = aa_voc['<PAD>']

    encoded_seqs = []
    for seq in dataset:
        encoded_seq = [bos_id]
        encoded_seq.extend(aa_voc[aa] for aa in seq)
        encoded_seq.append(eos_id)

        if len(encoded_seq) > max_len:
            raise ValueError(
                'sequence of length %d does not fit in max_len=%d '
                '(2 positions are reserved for <BOS>/<EOS>)' % (len(seq), max_len)
            )

        # Pad in one step instead of appending one token per loop iteration.
        encoded_seq.extend([pad_id] * (max_len - len(encoded_seq)))
        encoded_seqs.append(encoded_seq)

    if encoded_seqs:
        encoded_seqs = np.stack(encoded_seqs)
    else:
        # np.stack rejects an empty list; return an empty array of the right width.
        encoded_seqs = np.empty((0, max_len), dtype=int)

    print(encoded_seqs.shape)
    return encoded_seqs

def encodeLabels(dataset, labels_voc, max_len=None):
    """Encode secondary-structure strings as fixed-width rows of integer ids.

    Each row starts with ``<PAD>`` (position 0 is the slot where encodeInputs
    places ``<BOS>``), followed by one id per label character, then ``<PAD>``
    up to ``max_len`` entries.

    Parameters
    ----------
    dataset : iterable of str
        Label strings, one character per residue.
    labels_voc : dict
        Maps every label character and ``<PAD>`` to an integer id.
    max_len : int, optional
        Row width. Defaults to the notebook-level ``max_length``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of label strings, max_len)``; its shape is
        also printed.

    Raises
    ------
    ValueError
        If a label string plus the leading pad does not fit in ``max_len``.
        (The previous padding loop never terminated in that case.)
    KeyError
        If a label character is missing from ``labels_voc``.
    """
    if max_len is None:
        max_len = max_length

    pad_id = labels_voc['<PAD>']

    encoded_labels = []
    for seq in dataset:
        encoded_lab = [pad_id]
        encoded_lab.extend(labels_voc[label] for label in seq)

        if len(encoded_lab) > max_len:
            raise ValueError(
                'label string of length %d does not fit in max_len=%d '
                '(1 position is reserved for the leading <PAD>)' % (len(seq), max_len)
            )

        # Pad in one step instead of appending one token per loop iteration.
        encoded_lab.extend([pad_id] * (max_len - len(encoded_lab)))
        encoded_labels.append(encoded_lab)

    if encoded_labels:
        encoded_labels = np.stack(encoded_labels)
    else:
        # np.stack rejects an empty list; return an empty array of the right width.
        encoded_labels = np.empty((0, max_len), dtype=int)

    print(encoded_labels.shape)
    return encoded_labels

In [5]:
# Label the training split, keep one row per distinct sequence that fits the
# length limit, and report how many rows remain.
dataset = prepareInput(train_dataset, dataset_secondary)
print(dataset.shape[0])

19162


# Generate dictionaries

In [6]:
# Amino-acid vocabulary taken from the dataset config. As the printed value
# shows, it maps each residue letter and the <BOS>/<EOS>/<PAD> tokens to an
# integer id.
aa_voc=dataset_configs['aa_vocabulary']
print(aa_voc)

{'<BOS>': 21, '<EOS>': 22, '<PAD>': 0, 'A': 2, 'C': 11, 'D': 19, 'E': 16, 'F': 8, 'G': 9, 'H': 20, 'I': 7, 'K': 13, 'L': 12, 'M': 1, 'N': 15, 'P': 10, 'Q': 5, 'R': 14, 'S': 6, 'T': 3, 'V': 4, 'W': 17, 'Y': 18}


In [7]:
def buildLabelVocabulary(label_seqs):
    """Map each distinct label character to an integer id.

    ``<PAD>`` is always id 0; the remaining ids are assigned in order of first
    appearance while scanning ``label_seqs``, so the mapping depends on the
    order of the input.

    Parameters
    ----------
    label_seqs : iterable of str
        Label strings, one character per residue.

    Returns
    -------
    dict
        Label character (or ``'<PAD>'``) -> integer id.
    """
    vocabulary = {'<PAD>': 0}
    for seq in label_seqs:
        for label in seq:
            if label not in vocabulary:
                vocabulary[label] = len(vocabulary)
    return vocabulary


# Both vocabularies are derived from whatever `dataset` currently holds
# (the training table at this point in the notebook).
labels = dataset['sst3'].tolist()
labels_sst3_voc = buildLabelVocabulary(labels)
print(labels_sst3_voc)

labels = dataset['sst8'].tolist()
labels_sst8_voc = buildLabelVocabulary(labels)
print(labels_sst8_voc)

{'<PAD>': 0, 'C': 1, 'H': 2, 'E': 3}
{'<PAD>': 0, 'C': 1, 'H': 2, 'T': 3, 'S': 4, 'B': 5, 'E': 6, 'G': 7, 'I': 8}


# Prepare training dataset

In [8]:
# Amino-acid sequences -> integer ids (<BOS> ... <EOS>, padded)
sequence_strings = dataset['seqs'].tolist()
encoded_inputs = encodeInputs(sequence_strings, aa_voc)

# 3-state secondary-structure labels -> integer ids
sst3_strings = dataset['sst3'].tolist()
encoded_sst3_labels = encodeLabels(sst3_strings, labels_sst3_voc)

# 8-state secondary-structure labels -> integer ids
sst8_strings = dataset['sst8'].tolist()
encoded_sst8_labels = encodeLabels(sst8_strings, labels_sst8_voc)

(19162, 512)
(19162, 512)
(19162, 512)


In [9]:
# Each saved array pairs the encoded sequences (index 0) with one label
# encoding (index 1) along a new leading axis.
training_sst3_path = data_dir+'dataset/secondary_structure/training_sst3.npy'
training_sst8_path = data_dir+'dataset/secondary_structure/training_sst8.npy'

dataset_1 = np.asarray([encoded_inputs, encoded_sst3_labels])
np.save(training_sst3_path, dataset_1)

dataset_2 = np.asarray([encoded_inputs, encoded_sst8_labels])
np.save(training_sst8_path, dataset_2)

# Prepare validation dataset

In [10]:
# Rebinds `dataset` to the labelled validation split; the training table built
# earlier is no longer reachable under this name from here on.
dataset=prepareInput(validation_dataset, dataset_secondary)

In [11]:
# Amino-acid sequences -> integer ids (<BOS> ... <EOS>, padded)
sequence_strings = dataset['seqs'].tolist()
encoded_inputs = encodeInputs(sequence_strings, aa_voc)

# 3-state labels, encoded with the vocabulary built earlier in the notebook
sst3_strings = dataset['sst3'].tolist()
encoded_sst3_labels = encodeLabels(sst3_strings, labels_sst3_voc)

# 8-state labels, encoded with the vocabulary built earlier in the notebook
sst8_strings = dataset['sst8'].tolist()
encoded_sst8_labels = encodeLabels(sst8_strings, labels_sst8_voc)

(204, 512)
(204, 512)
(204, 512)


In [12]:
# Same layout as the training files: encoded sequences at index 0, the
# matching label encoding at index 1.
validating_sst3_path = data_dir+'dataset/secondary_structure/validating_sst3.npy'
validating_sst8_path = data_dir+'dataset/secondary_structure/validating_sst8.npy'

dataset_1 = np.asarray([encoded_inputs, encoded_sst3_labels])
np.save(validating_sst3_path, dataset_1)

dataset_2 = np.asarray([encoded_inputs, encoded_sst8_labels])
np.save(validating_sst8_path, dataset_2)