In [1]:
import pandas as pd
import yaml
import numpy as np
import os
import time

with open('../../hyperparams.yml') as f:
    configs=yaml.safe_load(f)
    
with open('../../data/dataset_config.yaml') as f:
    dataset_configs=yaml.safe_load(f)

data_dir=configs['data_dir']
raw_files_dir='raw/clusters/'
max_length=dataset_configs['sequence_length']
aa_vocabulary=dataset_configs['aa_vocabulary']

In [2]:
import h5py
hdf5_file = h5py.File(data_dir+"test.hdf5", "w")
sequences_encoded = hdf5_file.create_dataset("sequences", (10000000, 512), compression="gzip", dtype='i8')

In [3]:
sequences_refs=pd.DataFrame()

destination_dir='dataset/unsupervised_large/'
pointer=0
for filename in os.listdir(data_dir+raw_files_dir):
    print('\nProcessing', filename)
    
    data=pd.read_csv(data_dir+raw_files_dir+filename)
    
    #compute length and remove too long seqs
    data=data[data['sequence'].str.len()<(max_length-2)]
    
    batch_size=len(data)
    print(batch_size)
    
    sequences_refs=pd.concat([sequences_refs, data['cluster_ref']], ignore_index=True)
    
    batch_encoded=np.zeros((len(data), max_length), dtype=np.int8)
    
    seq_idx=0
    for row in data.itertuples():
        aa_seq=row.sequence
        
        seq_encoded=np.zeros((max_length), dtype=np.int8)
        
        seq_encoded[0]=aa_vocabulary['<BOS>']

        for aa_idx, aa in enumerate(aa_seq):
            if aa not in aa_vocabulary:
                aa_token=aa_vocabulary['X']
            else:
                aa_token=aa_vocabulary[aa]
                
            seq_encoded[aa_idx+1]=aa_token
        
        seq_encoded[aa_idx+2]=aa_vocabulary['<EOS>']
        
        batch_encoded[seq_idx]=seq_encoded
        seq_idx += 1
        
    sequences_encoded[pointer:pointer+batch_size]=batch_encoded

    pointer+=batch_size

print('Total number of records:', pointer)
print('Total number of references:', len(sequences_refs))


Processing dataset_uniref_50_3.csv
317208

Processing dataset_uniref_50_7.csv
504354

Processing dataset_uniref_50_1.csv
581907

Processing dataset_uniref_50_4.csv
469936

Processing dataset_uniref_50_2.csv
362271

Processing dataset_uniref_50_5.csv
668075

Processing dataset_uniref_50_6.csv
435248
Total number of records: 3338999
Total number of references: 3338999


In [4]:
sequences_encoded.resize((pointer, max_length))
hdf5_file.close()

In [5]:
sequences_refs.columns=['cluster_ref']
sequences_refs.to_csv(data_dir+'cluster_refs.csv', index=False)