In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import pandas as pd
# Apply seaborn's default styling to any figures drawn later.
sns.set()

# Read the shared hyper-parameter file; only the data root is needed here.
with open("../../hyperparams.yml", 'r') as config_file:
    configs = yaml.safe_load(config_file)

# Root directory that every input/output path below is built from
# (paths are formed by plain string concatenation).
data_dir = configs['data_dir']

In [2]:
# Load the raw training split and preview the first rows.
training_csv_path = data_dir + 'raw/csv/training_90.csv'
dataset = pd.read_csv(training_csv_path)
dataset.head()

Unnamed: 0,pdb_id,seqs
0,4Y28,MATVTTQASAAIFGPCGLKSRFLGGSSGKLNRGVAFRPVGCSPSAS...
1,2JXZ,CGNLSTCMLGTLTQDFHKFHTFPQTNTGVGTPA
2,4P15,GRFSERAQKVLALSQEEAIRLSHHNIGTEHILLGLIREGEGIAAKA...
3,4CYJ,GSMPLSSIGLPYYREPLFSAWPADIISDVGAPPLQLEPSFVATLKQ...
4,2Z51,MVPLTEENVESVLDEIRPYLMSDGGNVALHEIDGNVVRVKLQGACG...


In [3]:
# Number of sequences in the raw training split.
print(dataset.shape[0])

49600


# Sequence lengths

In [4]:
# Record the character length of every sequence in a 'len' column so the
# length distribution can be inspected and used for filtering.
dataset = dataset.assign(len=dataset['seqs'].str.len())

In [5]:
# Summary statistics (count, mean, quartiles, extremes) of the sequence lengths.
sequence_lengths = dataset['len']
sequence_lengths.describe()

count    49600.000000
mean       230.238770
std        178.504938
min         20.000000
25%        112.000000
50%        187.000000
75%        304.000000
max       4914.000000
Name: len, dtype: float64

In [6]:
# Fixed width of every encoded sequence.
max_length = 512

# Keep only sequences that still fit once two extra positions are taken by
# the <BOS> and <EOS> tokens added during encoding.
fits_in_window = dataset['len'].le(max_length - 2)
dataset = dataset.loc[fits_in_window]

In [7]:
# Number of sequences remaining after the length filter.
print(dataset.shape[0])

46698


# Amino acid analysis

In [8]:
# Build the symbol -> integer id mapping. Id 0 is reserved for padding;
# every other symbol receives the next free id the first time it is seen.
aminos_vocabulary = {'<PAD>': 0}
for sequence in dataset['seqs']:
    for residue in sequence:
        # setdefault only inserts when the residue is new, so existing ids are stable.
        aminos_vocabulary.setdefault(residue, len(aminos_vocabulary))

# The begin/end markers take the two ids after the last residue id.
for marker in ('<BOS>', '<EOS>'):
    aminos_vocabulary[marker] = len(aminos_vocabulary)
print(aminos_vocabulary)

{'<PAD>': 0, 'M': 1, 'A': 2, 'T': 3, 'V': 4, 'Q': 5, 'S': 6, 'I': 7, 'F': 8, 'G': 9, 'P': 10, 'C': 11, 'L': 12, 'K': 13, 'R': 14, 'N': 15, 'E': 16, 'W': 17, 'Y': 18, 'D': 19, 'H': 20, '<BOS>': 21, '<EOS>': 22}


In [13]:
# Encode every training sequence as <BOS> + residue ids + <EOS>, then
# right-pad with <PAD> so each row has exactly max_length entries.
bos_id = aminos_vocabulary['<BOS>']
eos_id = aminos_vocabulary['<EOS>']
pad_id = aminos_vocabulary['<PAD>']

dataset_encoded = []
for sequence in dataset['seqs']:
    token_ids = [bos_id] + [aminos_vocabulary[residue] for residue in sequence] + [eos_id]
    # A non-positive repeat count yields an empty list, so rows that are
    # already max_length long (or longer) are left as they are.
    token_ids += [pad_id] * (max_length - len(token_ids))
    dataset_encoded.append(token_ids)

In [14]:
# Collapse the list of equal-length id lists into one 2-D array
# (one row per sequence) and show its shape.
dataset_encoded = np.stack(dataset_encoded, axis=0)
dataset_encoded.shape

(46698, 512)

In [17]:
# Store the two values a consumer needs to interpret the encoded arrays:
# the fixed row width and the symbol -> id mapping.
data = dict(
    sequence_length=max_length,
    aa_vocabulary=aminos_vocabulary,
)

config_path = data_dir + 'dataset_config.yaml'
with open(config_path, 'w') as config_out:
    yaml.dump(data, config_out)

In [18]:
# Persist the encoded training array as a .npy file under the data directory.
training_npy_path = data_dir + 'dataset/training_90.npy'
np.save(training_npy_path, dataset_encoded)

# Encode Test and Validation datasets

In [26]:
def encode_sequence(seq, vocabulary, length):
    """Encode one sequence string as a fixed-width list of vocabulary ids.

    The result is ``<BOS>`` + one id per character of ``seq`` + ``<EOS>``,
    right-padded with ``<PAD>`` up to ``length`` entries. If the encoded
    sequence already has ``length`` or more entries it is returned as is
    (no padding, no truncation).

    Args:
        seq: the sequence string to encode.
        vocabulary: mapping from symbol to integer id; must contain
            '<BOS>', '<EOS>' and '<PAD>'.
        length: target number of entries per encoded sequence.

    Returns:
        A list of integer ids.

    Raises:
        KeyError: if ``seq`` contains a symbol missing from ``vocabulary``.
    """
    try:
        residue_ids = [vocabulary[aa] for aa in seq]
    except KeyError as err:
        # The vocabulary is built from the training split only, so another
        # split may contain a symbol it has never seen; say so explicitly
        # instead of surfacing a bare one-character KeyError.
        raise KeyError(
            "symbol {!r} is not in the vocabulary".format(err.args[0])
        ) from None

    encoded = [vocabulary['<BOS>']] + residue_ids + [vocabulary['<EOS>']]
    # A non-positive repeat count produces an empty list, i.e. no padding.
    encoded.extend([vocabulary['<PAD>']] * (length - len(encoded)))
    return encoded


# Encode the testing and validation splits into one combined evaluation array.
# Distinct names are used on purpose: the training objects `dataset` and
# `dataset_encoded` stay intact, so re-running the training save cell after
# this one cannot write evaluation data into training_90.npy.
evaluation_encoded = []
for dataset_name in ['testing', 'validation']:
    evaluation_frame = pd.read_csv(data_dir+'raw/csv/'+dataset_name+'.csv')
    # Same length rule as for training: leave room for <BOS> and <EOS>.
    fits_in_window = evaluation_frame['seqs'].str.len() <= (max_length-2)

    for seq in evaluation_frame.loc[fits_in_window, 'seqs']:
        evaluation_encoded.append(encode_sequence(seq, aminos_vocabulary, max_length))

evaluation_encoded = np.stack(evaluation_encoded)
print(evaluation_encoded.shape)
np.save(data_dir+'dataset/evaluation.npy', evaluation_encoded)
        

(252, 512)
