In [1]:
import kaldi_io
import lmdb
import numpy as np
import scipy.io as sio
import os
import time
import random
import matplotlib.pyplot as plt
import pickle
import h5py
import kaldiio
from kaldiio import WriteHelper
from functools import wraps
import shutil

%matplotlib inline

In [2]:
def getdirsize(dir):
    """Return the on-disk size of a file or directory tree in megabytes.

    Args:
        dir: Path to a regular file or to a directory. (The name shadows the
            builtin ``dir`` but is kept for backward compatibility.)

    Returns:
        float: Size in MB (bytes / 1024 / 1024). For a directory this is the
        sum of the sizes of all files found by ``os.walk``. ``0.0`` is
        returned when the path is neither a file nor a directory, so callers
        that format the value as a float never receive ``None``.
    """
    if os.path.isfile(dir):
        return os.path.getsize(dir) / 1024 / 1024.

    if os.path.isdir(dir):
        the_size = 0
        for root, _dirs, files in os.walk(dir):
            the_size += sum(os.path.getsize(os.path.join(root, name)) for name in files)
        return the_size / 1024 / 1024.

    # Missing path (or something that is neither file nor directory).
    return 0.0

def fn_timer(function):
    """Decorator that times a save/load benchmark and reports its footprint.

    The wrapped ``function`` must return a ``(path, deviation)`` pair. The
    wrapper measures the wall-clock time of the call, asks ``getdirsize`` for
    the size of ``path``, prints a one-line report and returns
    ``(seconds, size_mb, deviation)`` instead of the original return value.
    """
    @wraps(function)
    def function_timer(*args, **kwargs):
        started = time.time()
        result, deviation = function(*args, **kwargs)
        finished = time.time()

        elapsed = float(finished - started)
        size = getdirsize(result)

        report = " Running {:>16}: {:>9.4f} seconds, {:>9.4f} MB, deviation: {:>10.4f}."
        print(report.format(function.__name__, float(elapsed), size, deviation))
        return elapsed, size, deviation

    return function_timer

@fn_timer
def npz_save(npz_file, kwds, utts=None):
    """Benchmark a compressed ``.npz`` round trip.

    Args:
        npz_file: Destination path of the archive.
        kwds: Mapping of utterance id -> NumPy array; every entry is stored.
        utts: Iterable of utterance ids to read back and compare. Defaults to
            every key of ``kwds`` so the function no longer depends on a
            notebook-global ``utts`` variable.

    Returns:
        tuple: ``(npz_file, deviation)`` where ``deviation`` is the summed
        absolute difference between the reloaded and the original arrays.
    """
    np.savez_compressed(npz_file, **kwds)

    if utts is None:
        utts = list(kwds.keys())

    deviation = 0
    # NpzFile keeps the archive open; the context manager closes it again.
    with np.load(npz_file) as test_np:
        for u in utts:
            deviation += np.sum(np.abs(test_np[u] - kwds[u]))

    return npz_file, deviation

@fn_timer
def npy_save(np_dir, kwds, utts=None):
    """Benchmark a one-``.npy``-file-per-utterance round trip.

    Args:
        np_dir: Existing directory that receives ``<utt>.npy`` files.
        kwds: Mapping of utterance id -> NumPy array; every entry is stored.
        utts: Iterable of utterance ids to read back and compare. Defaults to
            every key of ``kwds`` so the function no longer depends on a
            notebook-global ``utts`` variable.

    Returns:
        tuple: ``(np_dir, deviation)`` where ``deviation`` is the summed
        absolute difference between the reloaded and the original arrays.
    """
    uid_npath = {}
    for u in kwds.keys():
        npy_path = os.path.join(np_dir, u + '.npy')
        np.save(npy_path, kwds[u])
        uid_npath[u] = npy_path

    if utts is None:
        utts = list(kwds.keys())

    deviation = 0
    for u in utts:
        deviation += np.sum(np.abs(np.load(uid_npath[u]) - kwds[u]))

    return np_dir, deviation

@fn_timer
def mat_save(mat_file, kwds, utts):
    """Benchmark a compressed MATLAB ``.mat`` round trip.

    Writes all of ``kwds`` to ``mat_file`` with ``scipy.io.savemat``, loads
    the file back and accumulates the absolute difference to the original
    arrays for every id in ``utts``.

    Returns:
        tuple: ``(mat_file, deviation)``.
    """
    sio.savemat(mat_file, kwds, do_compression=True)
    reloaded = sio.loadmat(mat_file)

    # Built-in sum starts from 0, matching an explicit accumulator loop.
    deviation = sum(
        np.sum(np.abs(reloaded[utt] - kwds[utt]))
        for utt in utts
    )

    return mat_file, deviation

@fn_timer
def kalid_io_save(ark_dir, kwds, utts):
    """Benchmark a ``kaldi_io`` float-vector ark/scp round trip.

    Every array in ``kwds`` is written as a float32 vector to
    ``<ark_dir>/test.ark``; a hand-built ``feat.scp`` maps each utterance id
    to ``<ark_file>:<byte offset>``. The scp is then parsed again and every id
    in ``utts`` is read back and compared with the original.

    Args:
        ark_dir: Existing directory that receives ``test.ark`` and ``feat.scp``.
        kwds: Mapping of utterance id -> NumPy array.
        utts: Iterable of utterance ids to read back (may contain repeats).

    Returns:
        tuple: ``(ark_dir, deviation)`` where ``deviation`` is the summed
        absolute difference between reloaded and original data.
    """
    ark_file = os.path.join(ark_dir, 'test.ark')
    feat_scp = os.path.join(ark_dir, 'feat.scp')

    with open(ark_file, 'wb') as ark_f, open(feat_scp, 'w') as feat_f:
        for k in kwds.keys():
            # Convert once; the same float32 array provides both the payload
            # and its byte length (no second astype()/tobytes() copy).
            vec = kwds[k].astype(np.float32)
            kaldi_io.write_vec_flt(ark_f, vec, key='')

            # Rewind from the current position past the payload and a 10-byte
            # prefix to find where this record starts.
            # NOTE(review): presumably those 10 bytes are the binary marker,
            # the 'FV ' token, the size byte and the int32 length written by
            # kaldi_io.write_vec_flt - confirm against the kaldi_io source.
            record_start = ark_f.tell() - vec.nbytes - 10
            offsets = str(ark_file) + ':' + str(record_start)
            feat_f.write(str(k) + ' ' + offsets + '\n')

    # Re-read the scp from disk so the lookup exercises the written index.
    d = {}
    with open(feat_scp, 'r') as f:
        for l in f:
            uid, path = l.split()
            d[uid] = path

    deviation = 0
    for u in utts:
        deviation += np.sum(np.abs(kaldi_io.read_vec_flt(d[u]) - kwds[u]))

    return ark_dir, deviation

@fn_timer
def kaldiio_save(ark_dir, kwds, utts):
    """Benchmark an uncompressed ``kaldiio`` ark/scp round trip.

    Every array in ``kwds`` is written as float32 through ``WriteHelper`` to
    ``<ark_dir>/test.ark`` with an accompanying ``feat.scp``. The scp is then
    parsed and every id in ``utts`` is loaded back and compared with the
    float32 version of the original.

    Args:
        ark_dir: Existing directory that receives ``test.ark`` and ``feat.scp``.
        kwds: Mapping of utterance id -> NumPy array.
        utts: Iterable of utterance ids to read back (may contain repeats).
            The read-back follows ``utts`` - like the other backends in this
            notebook - so that the timings are comparable.

    Returns:
        tuple: ``(ark_dir, deviation)``.
    """
    ark_file = os.path.join(ark_dir, 'test.ark')
    feat_scp = os.path.join(ark_dir, 'feat.scp')

    with WriteHelper('ark,scp:%s,%s' % (ark_file, feat_scp)) as writer:  # , compression_method=1
        for u in kwds.keys():
            writer(str(u), kwds[u].astype(np.float32))

    # Parse "<uid> <ark path:offset>" lines written by WriteHelper.
    # d = kaldiio.load_scp(feat_scp)
    d = {}
    with open(feat_scp, 'r') as f:
        for l in f:
            uid, path = l.split()
            d[uid] = path

    deviation = 0
    for u in utts:
        deviation += np.sum(np.abs(kaldiio.load_mat(d[u]) - kwds[u].astype(np.float32)))

    return ark_dir, deviation

@fn_timer
def kaldiio_mat_save(ark_dir, kwds, utts):
    """Benchmark a ``kaldiio`` ark/scp round trip with ``compression_method=1``.

    Each array in ``kwds`` is reshaped to a single-row float32 matrix and
    written through ``WriteHelper`` with ``compression_method=1``. The scp is
    then parsed and every id in ``utts`` is loaded back, flattened and
    compared with the float32 version of the original. The runs recorded in
    this notebook show a non-zero deviation for this variant.

    Args:
        ark_dir: Existing directory that receives ``test.ark`` and ``feat.scp``.
        kwds: Mapping of utterance id -> NumPy array.
        utts: Iterable of utterance ids to read back (may contain repeats).
            The read-back follows ``utts`` - like the other backends in this
            notebook - so that the timings are comparable.

    Returns:
        tuple: ``(ark_dir, deviation)``.
    """
    ark_file = os.path.join(ark_dir, 'test.ark')
    feat_scp = os.path.join(ark_dir, 'feat.scp')

    with WriteHelper('ark,scp:%s,%s' % (ark_file, feat_scp), compression_method=1) as writer:
        for u in kwds.keys():
            # Stored as a 1 x N matrix; flattened again on read-back below.
            writer(str(u), kwds[u].reshape(1, -1).astype(np.float32))

    # Parse "<uid> <ark path:offset>" lines written by WriteHelper.
    # d = kaldiio.load_scp(feat_scp)
    d = {}
    with open(feat_scp, 'r') as f:
        for l in f:
            uid, path = l.split()
            d[uid] = path

    deviation = 0
    for u in utts:
        deviation += np.sum(np.abs(kaldiio.load_mat(d[u]).reshape(-1) - kwds[u].astype(np.float32)))

    return ark_dir, deviation

@fn_timer
def pickle_save(pick_dir, kwds, utts):
    """Benchmark a single-file pickle round trip.

    Dumps the whole ``kwds`` mapping to ``<pick_dir>/test.pickle`` with the
    highest pickle protocol, loads it back and accumulates the absolute
    difference to the original arrays for every id in ``utts``.

    Returns:
        tuple: ``(pick_dir, deviation)``.
    """
    pick_file = os.path.join(pick_dir, 'test.pickle')

    with open(pick_file, 'wb') as sink:
        pickle.dump(kwds, sink, protocol=pickle.HIGHEST_PROTOCOL)

    # Only ever reads back the file written just above.
    with open(pick_file, 'rb') as source:
        restored = pickle.load(source)

    deviation = sum(
        np.sum(np.abs(restored[utt] - kwds[utt]))
        for utt in utts
    )

    return pick_dir, deviation

@fn_timer
def lmdb_save(lmdir_dir, kwds, utts):
    """Benchmark an LMDB round trip.

    For every id in ``utts`` the corresponding array from ``kwds`` is stored
    under its ASCII-encoded id in ``<lmdir_dir>/test.lmdb``. The environment
    is then reopened and every id in ``utts`` is read back, reinterpreted with
    the original array's dtype and shape, and compared with the original.

    Args:
        lmdir_dir: Existing directory that receives the ``test.lmdb`` database.
        kwds: Mapping of utterance id -> NumPy array.
        utts: Iterable of ASCII utterance ids (may contain repeats; a repeated
            id simply overwrites the same record).

    Returns:
        tuple: ``(lmdir_dir, deviation)``.
    """
    lmdb_file = os.path.join(lmdir_dir, 'test.lmdb')

    # Nominal budget: one 400 x 64 float32 matrix per requested utterance.
    data_size_per_exa = 400 * 64 * np.dtype(np.float32).itemsize
    nominal_size = data_size_per_exa * len(utts)
    # Also account for the real payload so larger arrays still fit.
    actual_size = sum(kwds[key].nbytes for key in set(utts))
    data_size = max(nominal_size, actual_size)

    # map_size:
    # Maximum size database may grow to; used to size the memory mapping. If database grows larger
    # than map_size, an exception will be raised and the user must close and reopen Environment.
    map_size = data_size * 10

    # Write data to lmdb. The transaction context manager commits on success
    # and aborts on error; the environment is closed either way.
    env = lmdb.open(lmdb_file, map_size=map_size)
    try:
        with env.begin(write=True) as txn:
            for key in utts:
                txn.put(key.encode('ascii'), kwds[key])
    finally:
        env.close()

    # Reopen and read everything back.
    deviation = 0
    env = lmdb.open(lmdb_file, map_size=map_size)
    try:
        with env.begin(write=False) as txn:
            for key in utts:
                original = kwds[key]
                buf = txn.get(key.encode('ascii'))
                # Decode with the dtype that was actually stored instead of a
                # hard-coded int16, so non-int16 input round-trips correctly.
                restored = np.frombuffer(buf, dtype=original.dtype).reshape(original.shape)
                deviation += np.sum(np.abs(restored - original))
    finally:
        env.close()

    return lmdir_dir, deviation

@fn_timer
def h5py_save(h5py_dir, kwds, utts, compression_opts=4):
    """Benchmark an HDF5 round trip, optionally gzip-compressed.

    Every array in ``kwds`` is cast to int16 and stored as its own dataset in
    ``<h5py_dir>/test.h5py``. The file is then reopened read-only and every id
    in ``utts`` is loaded and compared with the int16 version of the original.

    Args:
        h5py_dir: Existing directory that receives ``test.h5py``.
        kwds: Mapping of utterance id -> NumPy array.
        utts: Iterable of utterance ids to read back (may contain repeats).
        compression_opts: gzip level passed to ``create_dataset``. ``0`` or
            ``None`` stores the datasets uncompressed.

    Returns:
        tuple: ``(h5py_dir, deviation)``.
    """
    h5py_file = os.path.join(h5py_dir, 'test.h5py')

    # Only request the gzip filter when a non-zero level was asked for;
    # previously the argument was accepted but silently ignored.
    dataset_kwargs = {}
    if compression_opts:
        dataset_kwargs = {'compression': 'gzip', 'compression_opts': compression_opts}

    with h5py.File(h5py_file, 'w') as f:  # open with 'w' for writing
        for u in kwds.keys():
            f.create_dataset(u, data=kwds[u].astype(np.int16), **dataset_kwargs)

    deviation = 0
    with h5py.File(h5py_file, 'r') as f:  # open with 'r' for reading
        for u in utts:
            deviation += np.sum(np.abs(f.get(u)[:] - kwds[u].astype(np.int16)))

    return h5py_dir, deviation


In [4]:
# Per-backend result rows: [seconds, size_mb, deviation], one row per sample count.
npz_point = []
npy_point = []
mat_point = []
kaldi_io_point = []
kaldiio_mat_point = []
kaldi_io_mat_point = []
kaldiio_point = []
pick_point = []
lmdb_point = []
h5py_point = []
num_sam = np.array([100, 1000, 10000])
sets = ['npz', 'npy', 'mat', 'kaldi_io', 'kaldiio', 'pickle', 'lmdb', 'h5py']

# Output locations. Every backend gets its own directory so that wiping one
# benchmark's directory can never delete another benchmark's files.
save_dir = 'data/io_test'
npz_file = save_dir + '/test.npz'
np_dir = save_dir + '/test_npy'
mat_file = save_dir + '/test.mat'
ark_dir = save_dir + '/kaldi_io'             # kaldi_io backend (currently disabled)
kaldiio_dir = save_dir + '/kaldiio'          # kaldiio backend
kaldiio_mat_dir = save_dir + '/kaldiio_mat'  # kaldiio compressed-matrix backend (currently disabled)
pick_dir = save_dir + '/pick'
lmdir_dir = save_dir + '/lmdir'
h5py_dir = save_dir + '/h5py'

# Seed both generators used below so that a re-run produces the same data
# and the same shuffled read order.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

os.makedirs(save_dir, exist_ok=True)

for num in num_sam:
    kwds = {}
    utts = []
    print('\nRandomly generate array with lenght %d: ' % num)
    for i in range(num):
        # feat_len = np.random.randint(250, 400)
        # kwds['mat%s'%str(i)] = np.random.rand(feat_len, 64).astype(np.float32)

        # feat_len = np.random.randint(2, 20) * 0.5 + 1
        # Scalar draw: calling int() on a 1-element array is deprecated in NumPy.
        feat_len = abs(np.random.randn() + 0.1) * 20
        kwds['mat%s'%str(i)] = np.random.randint(-32768, 32767, int(feat_len*16000), dtype='int16')#.astype(np.float32)

        utts.append('mat%s'%str(i))

    # Every utterance is read back 20 times, in random order.
    utts = utts*20
    random.shuffle(utts)

#     if os.path.exists(npz_file):
#         os.remove(npz_file)

#     t1,npzsize, deviation = npz_save(npz_file, kwds)
#     npz_point.append([t1, npzsize, deviation])

#     if os.path.exists(np_dir):
#         shutil.rmtree(np_dir)

#     os.makedirs(np_dir)
#     t2,npysize, deviation = npy_save(np_dir, kwds)
#     npy_point.append([t2, npysize, deviation])


#     if os.path.exists(mat_file):
#         os.remove(mat_file)
#     t3,matsize, deviation = mat_save(mat_file, kwds, utts)
#     mat_point.append([t3, matsize, deviation])

    # Kaldi_io
#     if os.path.exists(ark_dir):
#         shutil.rmtree(ark_dir)

#     os.makedirs(ark_dir)
#     t4,kaldisize, deviation = kalid_io_save(ark_dir, kwds, utts)
#     kaldi_io_point.append([t4, kaldisize, deviation])

#     # kaldiio mat
#     if os.path.exists(kaldiio_mat_dir):
#         shutil.rmtree(kaldiio_mat_dir)

#     os.makedirs(kaldiio_mat_dir)
#     t4,kaldisize, deviation = kaldiio_mat_save(kaldiio_mat_dir, kwds, utts)
#     kaldi_io_mat_point.append([t4, kaldisize, deviation])

    # Kaldiio
    if os.path.exists(kaldiio_dir):
        shutil.rmtree(kaldiio_dir)

    os.makedirs(kaldiio_dir)
    t5, kaldiiosize, deviation = kaldiio_save(kaldiio_dir, kwds, utts)
    # Stored under kaldiio_point: that is the list the plotting cell reads.
    kaldiio_point.append([t5, kaldiiosize, deviation])

    if os.path.exists(pick_dir):
        shutil.rmtree(pick_dir)
    os.makedirs(pick_dir)

    t6,picksize, deviation = pickle_save(pick_dir, kwds, utts)
    pick_point.append([t6, picksize, deviation])

    if os.path.exists(lmdir_dir):
        shutil.rmtree(lmdir_dir)
    os.makedirs(lmdir_dir)

    t7,lmdbsize, deviation = lmdb_save(lmdir_dir, kwds, utts)
    lmdb_point.append([t7, lmdbsize, deviation])

    if os.path.exists(h5py_dir):
        shutil.rmtree(h5py_dir)
    os.makedirs(h5py_dir)
    t8,h5pysize, deviation = h5py_save(h5py_dir, kwds, utts, compression_opts=0)
    h5py_point.append([t8, h5pysize, deviation])

# plt.savefig("PythonProgramDesign/IOTest/mat.io.all.png")
# plt.show()


# Randomly generate array with lenght 100: 
#  Running    kalid_io_save:    0.1537 seconds,   21.0074 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:    0.0437 seconds,   10.5080 MB, deviation: 1376666.3105.
#  Running     kaldiio_save:    0.0181 seconds,   21.0080 MB, deviation:     0.0000.
#  Running        lmdb_save:    0.6035 seconds,   10.7422 MB, deviation:     0.0000.
#  Running        h5py_save:    0.5740 seconds,   10.7527 MB, deviation:     0.0000.

# Randomly generate array with lenght 1000: 
#  Running    kalid_io_save:    0.9837 seconds,  209.7400 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:    0.2518 seconds,  104.9154 MB, deviation: 13741173.6377.
#  Running     kaldiio_save:    0.1764 seconds,  209.7476 MB, deviation:     0.0000.
#  Running        lmdb_save:    4.6234 seconds,  107.0898 MB, deviation:     0.0000.
#  Running        h5py_save:    3.2097 seconds,  107.3484 MB, deviation:     0.0000.

# Randomly generate array with lenght 10000: 
#  Running    kalid_io_save:    9.9929 seconds, 2104.4081 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:    2.5101 seconds, 1052.6769 MB, deviation: 137866423.9150.
#  Running     kaldiio_save:    4.6262 seconds, 2104.4929 MB, deviation:     0.0000.
#  Running        lmdb_save:   45.2163 seconds, 1073.8281 MB, deviation:     0.0000.
#  Running        h5py_save:   30.5054 seconds, 1076.9591 MB, deviation:     0.0000.

# Randomly generate array with lenght 100: 
#  Running    kalid_io_save:    0.3140 seconds,   88.7046 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:    0.0832 seconds,   44.3567 MB, deviation: 5812296.4434.
#  Running     kaldiio_save:    0.0644 seconds,   88.7052 MB, deviation:     0.0000.
#  Running        lmdb_save:    1.0934 seconds,   44.5664 MB, deviation:     0.0000.
#  Running        h5py_save:    0.4625 seconds,   44.3868 MB, deviation:     0.0000.

# Randomly generate array with lenght 1000: 
#  Running    kalid_io_save:    3.4349 seconds,  978.7751 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:    0.8629 seconds,  489.4335 MB, deviation: 64139335.1338.
#  Running     kaldiio_save:    0.6808 seconds,  978.7826 MB, deviation:     0.0000.
#  Running        lmdb_save:   10.2313 seconds,  491.3594 MB, deviation:     0.0000.
#  Running        h5py_save:    5.0313 seconds,  489.7213 MB, deviation:     0.0000.

# Randomly generate array with lenght 10000: 
#  Running    kalid_io_save:  442.8909 seconds, 9780.4763 MB, deviation:     0.0000.
#  Running kaldiio_mat_save:   66.2275 seconds, 4890.7165 MB, deviation: 640922628.8037.
#  Running     kaldiio_save:  133.9422 seconds, 9780.5611 MB, deviation:     0.0000.
#  Running        lmdb_save:  438.9209 seconds, 4910.1250 MB, deviation:     0.0000.
#  Running        h5py_save:   70.8426 seconds, 4893.5340 MB, deviation:     0.0000.


Randomly generate array with lenght 100: 
 Running     kaldiio_save:    0.0657 seconds,   85.7937 MB, deviation:     0.0000.
 Running      pickle_save:    0.4366 seconds,   42.8981 MB, deviation:     0.0000.
 Running        lmdb_save:    1.2660 seconds,   43.0977 MB, deviation:     0.0000.
 Running        h5py_save:    0.7973 seconds,   42.9320 MB, deviation:     0.0000.

Randomly generate array with lenght 1000: 
 Running     kaldiio_save:    0.6763 seconds,  920.7798 MB, deviation:     0.0000.
 Running      pickle_save:    4.7119 seconds,  460.4020 MB, deviation:     0.0000.
 Running        lmdb_save:    9.6857 seconds,  462.4766 MB, deviation:     0.0000.
 Running        h5py_save:    8.2041 seconds,  460.7205 MB, deviation:     0.0000.

Randomly generate array with lenght 10000: 
 Running     kaldiio_save:   49.3162 seconds, 9750.9634 MB, deviation:     0.0000.
 Running      pickle_save:   79.9595 seconds, 4875.5963 MB, deviation:     0.0000.
 Running        lmdb_save:  106.6450 s

In [17]:
# Open the HDF5 file produced by the h5py benchmark for interactive inspection.
# NOTE(review): the handle `f` is intentionally left open because the next
# cell reads from it; call f.close() once the inspection is finished.
h5py_file = 'data/io_test/h5py/test.h5py'
f = h5py.File(h5py_file, 'r')

In [19]:
# Look up one dataset by name and display its summary.
# NOTE(review): `i` is not defined in this cell; presumably it is the loop
# index left over from an earlier benchmark cell - confirm before re-running.
f.get('mat%s' % str(i))

<HDF5 dataset "mat9999": shape (260823,), type "<i2">

In [38]:
# Sweep over HDF5 compression settings for two dataset sizes.
# NOTE(review): this cell resets h5py_point and num_sam, so a plotting cell
# run afterwards shows this sweep for 'h5py' rather than the main benchmark.
h5py_point = []
num_sam = np.array([500, 2000])
sets = ['npz', 'npy', 'mat', 'kaldi_io', 'kaldiio', 'pickle', 'lmdb', 'h5py']

h5py_dir = save_dir + '/h5py'

os.makedirs(save_dir, exist_ok=True)

for compression_opts in [0, 1, 5, 7]:
    print('\nCompress: %d' % compression_opts)
    for num in num_sam:
        kwds = {}
        utts = []
        # print('Randomly generate array with lenght %d: ' % num)
        for i in range(num):
            # feat_len = np.random.randint(250, 400)
            # kwds['mat%s'%str(i)] = np.random.rand(feat_len, 64).astype(np.float32)
            feat_len = np.random.randint(10, 20) * 0.1 + 2
            kwds['mat%s'%str(i)] = np.random.randint(-32768, 32767, int(feat_len*16000), dtype='int16').astype(np.float32)

            utts.append('mat%s'%str(i))

        random.shuffle(utts)

        # Start every measurement from an empty directory.
        if os.path.exists(h5py_dir):
            shutil.rmtree(h5py_dir)
        os.makedirs(h5py_dir)
        # h5py_save requires the read-back order `utts` as its third argument;
        # omitting it raises TypeError with the current signature.
        t8,h5pysize, deviation = h5py_save(h5py_dir, kwds, utts, compression_opts=compression_opts)
        h5py_point.append([t8, h5pysize, deviation])

# plt.savefig("PythonProgramDesign/IOTest/mat.io.all.png")
# plt.show()


Compress: 0
 Running        h5py_save:    0.6077 seconds,  106.2885 MB, deviation:     0.0000.
 Running        h5py_save:    2.3266 seconds,  425.9930 MB, deviation:     0.0000.

Compress: 1
 Running        h5py_save:    4.0209 seconds,   76.6369 MB, deviation:     0.0000.
 Running        h5py_save:   16.4079 seconds,  308.5839 MB, deviation:     0.0000.

Compress: 5
 Running        h5py_save:    6.6968 seconds,   76.4792 MB, deviation:     0.0000.
 Running        h5py_save:   26.8814 seconds,  305.9803 MB, deviation:     0.0000.

Compress: 7
 Running        h5py_save:    7.8199 seconds,   76.4706 MB, deviation:     0.0000.
 Running        h5py_save:   31.8503 seconds,  307.3417 MB, deviation:     0.0000.


In [None]:
# Plot size (MB) against time (s) for every backend that has results.
# Each *_point list holds rows of [seconds, size_mb, deviation].
all_lst = {}
all_lst['npz'] = np.array(npz_point)
all_lst['npy'] = np.array(npy_point)
all_lst['mat'] = np.array(mat_point)
all_lst['kaldi_io'] = np.array(kaldi_io_point)
all_lst['kaldiio'] = np.array(kaldiio_point)
all_lst['pickle'] = np.array(pick_point)
all_lst['lmdb'] = np.array(lmdb_point)
all_lst['h5py'] = np.array(h5py_point)

# print(all_lst)

# A single figure; a second plt.figure() call would leave an empty one behind.
plt.rc('font', family='Times New Roman')
plt.figure(figsize=(8, 6))
plt.title('Data IO With 100,500,1k,2k,5k,1w,2w,5w NumPy [x,64] Matrix', fontsize=16)

for k in sets:
    lst = all_lst[k]
    if lst.size == 0:
        # Backend was not benchmarked in this session: an empty list becomes
        # a 1-D array, and indexing it as lst[:, 0] would raise IndexError.
        continue
    # Label each line directly so the legend stays correct when backends are skipped.
    plt.plot(lst[:, 0], lst[:, 1], marker='o', label=k)
    # plt.plot(np.mean(lst[:, 0]/num_sam), np.mean(lst[:, 1]/num_sam), marker='o')

plt.legend(fontsize=15)
plt.xlabel('Time (s)', fontsize=14)
plt.xticks(fontsize=14)
plt.ylabel('Size (MB)', fontsize=14)
plt.yticks(fontsize=14);

In [None]:
# Train Epoch 1: [ 3.0% ] Batch Len: 48000 Accuracy(%):   0.00% Avg Loss: 15.2635: : 54it [07:07,  4.43s/it]