In [1]:
import h5py
import numpy as np
import scipy.io

## Embedding 

In [2]:
def hot2label(hot_array):
    '''Convert a one-hot encoded array into integer tokens.

    The last axis of ``hot_array`` must have length 4; it is collapsed by
    taking the dot product with the weights ``[0, 1, 2, 3]``, so a one in
    channel ``k`` yields the token ``k``.

    Parameters
    ----------
    hot_array : array-like, shape (..., 4)
        One-hot data with the channel axis last. Any number of leading
        axes is accepted (e.g. ``(batch, seq_len, 4)`` or ``(seq_len, 4)``).

    Returns
    -------
    numpy.ndarray, shape ``hot_array.shape[:-1]``
        Token array; ``uint8`` when the input is ``uint8``.

    Notes
    -----
    A position whose 4 channels are all zero maps to token 0, i.e. it is
    indistinguishable from a one in channel 0.
    '''
    weights = np.array([0, 1, 2, 3], dtype=np.uint8)
    # Equivalent to (hot_array * weights).sum(-1, dtype=np.uint8), but the
    # original author measured einsum as roughly two times faster.
    # The ellipsis lets the contraction work for any number of leading axes.
    return np.einsum('...k,k->...', hot_array, weights)

1. Load data from Matlab file into numpy
2. Transpose
3. Convert one hot to label
4. Save

For the train set, we split the work into chunks so that the data fits in memory.

In [None]:
# Convert the DeepSEA training set from one-hot to integer tokens and store
# it as HDF5, processing the samples chunk by chunk so they fit in memory.
# HDF5 usage adapted from:
# https://www.pythonforthelab.com/blog/how-to-use-hdf5-files-in-python/
chunk_sz = 200_000  # number of samples read, converted and written per iteration

# Open the source explicitly read-only and inside a context manager so the
# handle is always released (without a mode, older h5py versions open the
# file in append mode).
with h5py.File('../data/DeepSEA/train.mat', 'r') as trainmat:
    # Original author's note on the stored layout: (1000, 4, 4400000).
    seq_len = trainmat['trainxdata'].shape[0]   # becomes axis 1 after the transpose below
    N = trainmat['trainxdata'].shape[-1]        # number of samples (last axis)
    c = trainmat['traindata'].shape[0]          # number of target columns per sample

    with h5py.File('../data/Processed/train.hdf5', 'w') as train_h5:
        X_h5 = train_h5.create_dataset("X_train", (N,seq_len), dtype='u1')#, compression="gzip")
        y_h5 = train_h5.create_dataset("y_train", (N,c), dtype='u1')#, compression="gzip")

        n_chunks = (N + chunk_sz - 1) // chunk_sz  # ceiling division

        for i in range(n_chunks):
            fi = i * chunk_sz
            ti = min(fi + chunk_sz, N)  # the last chunk may be shorter

            # Slice along the sample axis, then move that axis to the front.
            X_train = np.transpose(trainmat['trainxdata'][:,:,fi:ti],axes=(2,0,1))
            y_train = (trainmat['traindata'][:,fi:ti]).T
            X_train = hot2label(X_train)

            # Write the converted chunk to its slot in the output datasets.
            X_h5[fi:ti] = X_train
            y_h5[fi:ti] = y_train

In [4]:
# Validation split: load the MATLAB file, swap the last two axes so the
# one-hot axis comes last, collapse it to integer tokens, and write both
# arrays to a compressed archive (stored positionally).
validmat = scipy.io.loadmat('../data/DeepSEA/valid.mat')
y_valid = validmat['validdata']
X_valid = hot2label(validmat['validxdata'].transpose(0, 2, 1))
np.savez_compressed('../data/Processed/valid', X_valid, y_valid)

In [5]:
# Test split: load the MATLAB file, convert one-hot to integer tokens, and
# save both arrays to a compressed archive (stored positionally).
# The input path is relative to the notebook ('../data/...'), like every
# other path in this notebook, including the output written below.
testmat = scipy.io.loadmat('../data/DeepSEA/test.mat')
X_test = np.transpose(testmat['testxdata'],axes=(0,2,1))  # put the length-4 axis last for hot2label
y_test = testmat['testdata']
X_test  = hot2label(X_test)
np.savez_compressed('../data/Processed/test', X_test, y_test)

## Hot encoded

In [6]:
# Store the DeepSEA training set as gzip-compressed HDF5, keeping the one-hot
# encoding, processing the samples chunk by chunk so they fit in memory.
# HDF5 usage adapted from:
# https://www.pythonforthelab.com/blog/how-to-use-hdf5-files-in-python/
chunk_sz = 200_000  # number of samples read and written per iteration

# Open the source explicitly read-only and inside a context manager so the
# handle is always released (without a mode, older h5py versions open the
# file in append mode).
with h5py.File('../data/DeepSEA/train.mat', 'r') as trainmat:
    # Original author's note on the stored layout: (1000, 4, 4400000).
    # Axes 0 and 1 become axes 1 and 2 after the transpose below.
    seq_len, n_bases, N = trainmat['trainxdata'].shape
    c = trainmat['traindata'].shape[0]  # number of target columns per sample

    with h5py.File('../data/Processed/hot_train.hdf5', 'w') as train_h5:
        X_h5 = train_h5.create_dataset("X_train", (N,seq_len,n_bases), dtype='u1', compression="gzip")
        y_h5 = train_h5.create_dataset("y_train", (N,c), dtype='u1', compression="gzip")

        n_chunks = (N + chunk_sz - 1) // chunk_sz  # ceiling division

        for i in range(n_chunks):
            fi = i * chunk_sz
            ti = min(fi + chunk_sz, N)  # the last chunk may be shorter

            # Slice along the sample axis, then move that axis to the front.
            X_train = np.transpose(trainmat['trainxdata'][:,:,fi:ti],axes=(2,0,1))
            y_train = (trainmat['traindata'][:,fi:ti]).T

            # Write the chunk to its slot in the output datasets.
            X_h5[fi:ti] = X_train
            y_h5[fi:ti] = y_train

In [9]:
# Validation split, kept one-hot: only the last two axes are swapped before
# both arrays are written to a compressed archive (stored positionally).
validmat = scipy.io.loadmat('../data/DeepSEA/valid.mat')
y_valid = validmat['validdata']
X_valid = validmat['validxdata'].transpose(0, 2, 1)
# NOTE(review): this output name ends in "2", unlike 'hot_test' and
# 'hot_train.hdf5' — confirm the suffix is intentional.
np.savez_compressed('../data/Processed/hot_valid2', X_valid, y_valid)

In [6]:
# Test split, kept one-hot: only the last two axes are swapped before
# both arrays are written to a compressed archive (stored positionally).
testmat = scipy.io.loadmat('../data/DeepSEA/test.mat')
y_test = testmat['testdata']
X_test = testmat['testxdata'].transpose(0, 2, 1)
np.savez_compressed('../data/Processed/hot_test', X_test, y_test)