In [1]:
import pandas as pd
import yaml
import numpy as np
import os
import time

with open('../../hyperparams.yml') as f:
    configs=yaml.safe_load(f)
    
with open('../../data/dataset_config.yaml') as f:
    dataset_configs=yaml.safe_load(f)

data_dir=configs['data_dir']
raw_files_dir='raw/old_clusters/'
max_length=dataset_configs['sequence_length']
aa_vocabulary=dataset_configs['aa_vocabulary']


destination_dir='dataset/unsupervised_large_clusters/'

In [2]:
import h5py
hdf5_file = h5py.File(data_dir+destination_dir+"dataset.hdf5", "w")
sequences_encoded = hdf5_file.create_dataset("sequences", (10000000, 512), compression="gzip", dtype='i8')

# Soft Dataset

In [3]:
clusters_refs=pd.DataFrame()
sequences_ids=pd.DataFrame()

pointer=0
for filename in os.listdir(data_dir+raw_files_dir):
    print('\nProcessing', filename)
    
    data=pd.read_csv(data_dir+raw_files_dir+filename)
    
    #compute length and remove too long seqs
    data=data[data['sequence'].str.len()<(max_length-2)]
    
    data=data.groupby('cluster_ref').first().reset_index()

    batch_size=len(data)
    print(batch_size)
    
    clusters_refs=pd.concat([clusters_refs, data['cluster_ref']], ignore_index=True)
    sequences_ids=pd.concat([sequences_ids, data['entry_id']], ignore_index=True)
    
    
    batch_encoded=np.zeros((len(data), max_length), dtype=np.int8)
    
    seq_idx=0
    for row in data.itertuples():
        aa_seq=row.sequence
        
        seq_encoded=np.zeros((max_length), dtype=np.int8)
        
        seq_encoded[0]=aa_vocabulary['<BOS>']

        for aa_idx, aa in enumerate(aa_seq):
            if aa not in aa_vocabulary:
                aa_token=aa_vocabulary['X']
            else:
                aa_token=aa_vocabulary[aa]
                
            seq_encoded[aa_idx+1]=aa_token
        
        seq_encoded[aa_idx+2]=aa_vocabulary['<EOS>']
        
        batch_encoded[seq_idx]=seq_encoded
        seq_idx += 1
        
    sequences_encoded[pointer:pointer+batch_size]=batch_encoded

    pointer+=batch_size

print('Total number of records:', pointer)
print('Total number of cluster references:', len(clusters_refs))
print('Total number of sequence references:', len(sequences_ids))


Processing dataset_uniref_50_3.csv
85952

Processing dataset_uniref_50_7.csv
88568

Processing dataset_uniref_50_1.csv
87792

Processing dataset_uniref_50_4.csv
87427

Processing dataset_uniref_50_2.csv
88784

Processing dataset_uniref_50_5.csv
88256

Processing dataset_uniref_50_6.csv
82554
Total number of records: 609333
Total number of cluster references: 609333
Total number of sequence references: 609333


# Complete dataset

In [None]:
clusters_refs=pd.DataFrame()
sequences_ids=pd.DataFrame()

pointer=0
for filename in os.listdir(data_dir+raw_files_dir):
    print('\nProcessing', filename)
    
    data=pd.read_csv(data_dir+raw_files_dir+filename)
    
    #compute length and remove too long seqs
    data=data[data['sequence'].str.len()<(max_length-2)]
    
    batch_size=len(data)
    print(batch_size)
    
    clusters_refs=pd.concat([clusters_refs, data['cluster_ref']], ignore_index=True)
    sequences_ids=pd.concat([sequences_ids, data['entry_id']], ignore_index=True)
    
    
    batch_encoded=np.zeros((len(data), max_length), dtype=np.int8)
    
    seq_idx=0
    for row in data.itertuples():
        aa_seq=row.sequence
        
        seq_encoded=np.zeros((max_length), dtype=np.int8)
        
        seq_encoded[0]=aa_vocabulary['<BOS>']

        for aa_idx, aa in enumerate(aa_seq):
            if aa not in aa_vocabulary:
                aa_token=aa_vocabulary['X']
            else:
                aa_token=aa_vocabulary[aa]
                
            seq_encoded[aa_idx+1]=aa_token
        
        seq_encoded[aa_idx+2]=aa_vocabulary['<EOS>']
        
        batch_encoded[seq_idx]=seq_encoded
        seq_idx += 1
        
    sequences_encoded[pointer:pointer+batch_size]=batch_encoded

    pointer+=batch_size

print('Total number of records:', pointer)
print('Total number of cluster references:', len(clusters_refs))
print('Total number of sequence references:', len(sequences_ids))

# Finalize

In [6]:
sequences_encoded.resize((pointer, max_length))
print(sequences_encoded.shape)
hdf5_file.close()

(609333, 512)


In [8]:
clusters_refs.columns=['cluster_ref']
clusters_refs.to_csv(data_dir+destination_dir+'clusters_refs.csv', index=False)

In [9]:
sequences_ids.columns=['sequence_id']
sequences_ids.to_csv(data_dir+destination_dir+'sequences_ids.csv', index=False)