quick h5 speed test
thejohnhoffer committed Apr 17, 2017
1 parent 64fea6e commit e768f28
Showing 4 changed files with 369 additions and 0 deletions.
98 changes: 98 additions & 0 deletions scripts/test/make_nested.py
@@ -0,0 +1,98 @@
from functools import reduce  # reduce is not a builtin on Python 3
import numpy as np
import json

def flatten(output, item):
    # Collect every dict that carries a 'path', at any depth
    if 'path' in item:
        return output + [item]
    if not isinstance(item, dict):
        return output
    return reduce(flatten, item.values(), output)

def values(res, key, nest=[], **keywords):
    name = '{}_{}_{}'.format(*key)
    # Format the nested paths
    paths = '/'.join(nest)
    # add values to dictionary
    if 'path' in keywords:
        fmt = keywords['path']
        # Add the full formatted path
        full = fmt.format(paths, *key)
        keywords['path'] = full
    # Add the position list
    keywords['nest'] = nest + [name]
    # add to parent dictionary
    res[name] = keywords

def add(res, keys, nest=[], **keywords):
    # Only recurse into child dicts; the stored 'nest' lists are not levels
    targets = [t for t in res if isinstance(res[t], dict)]
    if not targets:
        for key in keys:
            # add values to this level
            values(res, key, nest, **keywords)
    else:
        for target in targets:
            # Add to the list of nest levels
            nested = nest + [target]
            # add next levels to this level
            add(res[target], keys, nested, **keywords)

def add_all(result, shape, **keywords):
    # Add all possible keys at this level (a list, so it survives reuse)
    all_keys = np.where(np.ones(shape))
    add(result, list(zip(*all_keys)), **keywords)
    # Get the result
    return result


def nester(tile_shape, group_shapes):

    result = {}
    file_fmt = 'data/{0}/raw_{1:04d}_{2:04d}_{3:04d}.h5'

    # Add all the folder levels
    for shape in group_shapes:
        result = add_all(result, shape)
    # Add all the tile endpoints
    result = add_all(result, tile_shape, path=file_fmt)

    # Get full result
    return result

class Simplifier():
    def __init__(self, nesting):
        self.nesting = nesting

    def simplify(self, item):
        nests = [n.split('_') for n in item['nest']]
        offset = (self.nesting * np.uint32(nests)).sum(0)
        return {
            'path': item['path'],
            # plain ints, so json can serialize the offsets
            'offset': [int(x) for x in offset],
        }

def main(out_file, tile_shape, group_shapes):
    # Get file path structures
    nested = nester(tile_shape, group_shapes)
    listed = reduce(flatten, nested.values(), [])

    # Get the sizes for each nesting
    levels = [tile_shape] + group_shapes[:-1]
    # Get the cumulative effect of each layer
    nesting = np.cumprod(levels, axis=0)
    nesting = np.r_[nesting[::-1], [[1, 1, 1]]]
    # Simplify
    output = list(map(Simplifier(nesting).simplify, listed))

    # Document the model
    with open(out_file, 'w') as fd:
        json.dump(output, fd, indent=4)

if __name__ == '__main__':
    out_file = 'dataset.json'
    tile_shape = [10, 10, 10]
    group_shapes = [
        [2, 2, 2],
    ]
    main(out_file, tile_shape, group_shapes)
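
For reference, with the defaults above each entry written to dataset.json should look something like this (this particular path and offset are illustrative):

    {
        "path": "data/0_0_0/raw_0000_0000_0001.h5",
        "offset": [0, 0, 1]
    }

The offset counts tiles: the folder index scaled by the tile shape, plus the tile index inside that folder.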

85 changes: 85 additions & 0 deletions scripts/test/speed_demo.py
@@ -0,0 +1,85 @@
import make_nested
import test_nested
import numpy as np
import shutil
import json
import os

def get_filename(tile_shape, group_shape):
    t_n = np.prod(tile_shape)
    g_n = np.prod(group_shape)
    return '{}_in_{}.json'.format(t_n, g_n)

def make_one(shapes):
    tile_shape, group_shape = shapes
    out_file = get_filename(tile_shape, group_shape)
    return make_nested.main(out_file, tile_shape, [group_shape])

def test_one(shapes, trials, full_shape, file_block):
    if os.path.exists('data'):
        shutil.rmtree('data')
    tile_shape, group_shape = shapes
    in_file = get_filename(tile_shape, group_shape)
    return test_nested.main(in_file, trials, full_shape, file_block)

if __name__ == '__main__':
    tile_group_shapes = [
        # [ [1,1,1], [2,2,2] ],
        # [ [2,2,2], [2,2,2] ],
        # [ [4,4,4], [2,2,2] ],
        [ [1,1,1], [1,1,1] ],
        [ [1,1,2], [1,1,1] ],
        [ [1,2,2], [1,1,1] ],
        [ [1,2,4], [1,1,1] ],
        [ [1,4,4], [1,1,1] ],
        # [ [8,8,8], [1,1,1] ],
    ]

    # Make a json file for each tile/group shape pair
    # (a loop, so the calls also run under Python 3's lazy map)
    for shapes in tile_group_shapes:
        make_one(shapes)

    trials = 2
    full_shape = [8, 8192, 8192]
    file_blocks = [
        [1, 1, 1],
        [1, 1, 2],
        [1, 2, 2],
        [1, 2, 4],
        [1, 4, 4],
        [1, 4, 8],
        [1, 8, 8],
        [1, 16, 8],
        [1, 16, 16],
        [1, 32, 16],
        [1, 32, 32],
    ]
    speeds = []
    # Make and test hdf5 files
    for t_g in tile_group_shapes:
        for f_b in file_blocks:
            # Set a max cutoff at 1024 blocks
            n_blocks = np.prod(f_b * np.multiply(*t_g))
            if n_blocks > 1024:
                continue
            # Get the speed for the requested sizes
            speed = test_one(t_g, trials, full_shape, f_b)
            if not speed:
                continue
            speed.update({
                'n_folders': t_g[1],
                'n_trials': trials,
            })
            speeds.append(speed)
            msg = """
    mean of {n_trials} trials: {mean_time} for
    {n_tiles_kji} tiles from {n_files_zyx} files
    over {n_folders} folders.""".format(**speed)
            print(msg)

    # Document the model
    speed_fmt = 'speed_{}_{}_{}.json'
    speed_out = speed_fmt.format(*full_shape)
    # Write the model to json
    with open(speed_out, 'w') as fd:
        json.dump(speeds, fd, indent=4)
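
As a sanity check on the 1024-block cutoff above, this sketch repeats the same arithmetic the inner loop performs, for one pair of shapes picked purely as an example:

    import numpy as np

    t_g = [[1, 4, 4], [1, 1, 1]]  # one tile/group shape pair
    f_b = [1, 32, 32]             # one file block shape
    # blocks = file blocks times tiles times groups, per axis
    n_blocks = np.prod(f_b * np.multiply(*t_g))
    print(n_blocks)  # 16384, over the cutoff, so this pair is skipped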

53 changes: 53 additions & 0 deletions scripts/test/speed_graph.py
@@ -0,0 +1,53 @@
import matplotlib.pyplot as plt
import numpy as np
import json

def n_key_filter(n, key):
    def trial_filter(trial):
        return n == np.prod(trial[key])
    return trial_filter

if __name__ == '__main__':

    full_shape = [8, 8192, 8192]
    in_fmt = 'speed_{}_{}_{}.json'
    in_file = in_fmt.format(*full_shape)

    # Load the model
    with open(in_file, 'r') as fd:
        trials = json.load(fd)

    #####
    # TEST 1
    # Plot time by number of h5 files used
    # Plot lines for every different number of tiles
    #####
    fig, ax = plt.subplots()
    # Filter only for trials with files in one folder
    # (keep a list, so the filter survives reuse across loop iterations)
    in_one_folder = n_key_filter(1, 'n_folders')
    one_folder = list(filter(in_one_folder, trials))
    # Filter for trials with 1, 2, 4, 8, or 16 files
    for n_files in [1, 2, 4, 8, 16]:
        has_n_files = n_key_filter(n_files, 'n_files_zyx')
        n_file_dicts = filter(has_n_files, one_folder)
        # Now, get number of tiles and time
        def get_tile_times(d):
            n_tiles = np.prod(d['n_tiles_kji'])
            mean_time = d['mean_time']
            return [n_tiles, mean_time]
        tile_times = list(map(get_tile_times, n_file_dicts))
        # Skip file counts that were never measured
        if not tile_times:
            continue
        cause, result = zip(*tile_times)
        ax.plot(cause, result, label='{} files'.format(n_files))

    legend = ax.legend(loc='upper left')
    # Set the fontsize
    for label in legend.get_texts():
        label.set_fontsize('large')

    # Save to file
    plt.ylabel('time (seconds)')
    plt.xlabel('total number of tiles')
    title = 'time to load {}x{}x{} hdf5'.format(*full_shape)
    plt.title(title)
    plt.xscale('log', basex=2)  # newer matplotlib spells this 'base=2'
    plt.savefig('out.png')
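
For reference, each record in the speed json merges the dict returned by test_nested.main with the keys added in speed_demo.py, so a single trial entry should look roughly like this (all numbers illustrative):

    {
        "mean_time": 0.42,
        "time": [0.43, 0.41],
        "zyx_shape": [8, 2048, 2048],
        "kji_shape": [8, 1024, 1024],
        "n_files_zyx": [1, 4, 4],
        "n_tiles_kji": [1, 8, 8],
        "n_folders": [1, 1, 1],
        "n_trials": 2,
        "in_file": "16_in_1.json"
    }
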
133 changes: 133 additions & 0 deletions scripts/test/test_nested.py
@@ -0,0 +1,133 @@
from numpy.random import randint
from functools import reduce  # reduce is not a builtin on Python 3
import numpy as np
import json
import h5py
import time
import os

class Maker():
    def __init__(self, size, dtype):
        self.size = size
        self.dtype = dtype

    def make(self, filedata):
        filename = filedata['path']
        # Make the parent directory
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Get the datatype, noise range, and size
        dtype = getattr(np, 'uint{}'.format(self.dtype))
        dmax = 2 ** self.dtype
        dsize = self.size
        # Create the file from a path
        with h5py.File(filename, 'w') as fd:
            # Make a random uint array
            pattern = randint(dmax, size=dsize, dtype=dtype)
            fd.create_dataset('stack', data=pattern)

class Hasher():
    def __init__(self):
        self.offsets = []
    @property
    def shape(self):
        max_offset = np.max(self.offsets, 0)
        return (max_offset + 1)

    def hash(self, hashed, filedata):
        offset = filedata['offset']
        # Track all the offsets
        self.offsets.append(offset)
        # Map the offset to the path
        hashed[str(offset)] = filedata['path']
        return hashed

class Tester():
    def __init__(self, files, zyx_shape, kji_shape):
        self.zyx_shape = np.array(zyx_shape)
        self.kji_shape = np.array(kji_shape)
        self.int_shape = np.uint32(kji_shape)
        self.files = files

    def load(self, path, start):
        with h5py.File(path, 'r') as fd:
            dataset = fd[list(fd.keys())[0]]
            k0, j0, i0 = start
            k1, j1, i1 = start + self.int_shape
            # Read (and discard) one block just to time the access
            dataset[k0:k1, j0:j1, i0:i1]

    def test(self, kji):
        # Get xyz tile file
        kji_full = kji * self.kji_shape
        zyx = np.floor(kji_full / self.zyx_shape)
        zyx_full = np.uint32(zyx * self.zyx_shape)
        # Get file path and file start
        # (plain ints, so the key matches the json offsets)
        file_key = str([int(v) for v in zyx])
        file_path = self.files.get(file_key)
        if not file_path:
            raise KeyError(file_key)
        file_start = np.uint32(kji_full - zyx_full)
        # Load file from path and start
        self.load(file_path, file_start)

def main(in_file, trials, full_shape, block_shape):
    # Load the model
    with open(in_file, 'r') as fd:
        listed = json.load(fd)

    # Hash the files
    hasher = Hasher()
    model = reduce(hasher.hash, listed, {})
    tile_shape = np.uint32(hasher.shape)

    # Get the per-file shape (integer division keeps integers on Python 3)
    zyx_shape = np.uint32(full_shape) // tile_shape
    # Get the block shape
    kji_shape = zyx_shape // block_shape
    # Fill the files with 8-bit noise
    dtype = 8

    # Make the files
    make = Maker(zyx_shape, dtype)
    for filedata in listed:
        make.make(filedata)

    # Get all kji blocks
    kji_range = full_shape // kji_shape
    kji_count = np.prod(tile_shape * block_shape)
    # Ensure consistency
    if np.prod(kji_range) != kji_count:
        return {}
    # Get all the block indexes (a list, so it can be reused every trial)
    all_kji = list(zip(*np.where(np.ones(kji_range))))

    # Test the files
    time_results = []
    test = Tester(model, zyx_shape, kji_shape)
    # Run the testing
    for t in range(trials):
        time_start = time.time()
        for kji in all_kji:
            test.test(kji)
        time_end = time.time()
        # Record time difference
        time_diff = time_end - time_start
        time_results.append(time_diff)

    return {
        'time': time_results,
        'mean_time': float(np.mean(time_results)),
        'zyx_shape': [int(v) for v in zyx_shape],
        'kji_shape': [int(v) for v in kji_shape],
        'n_files_zyx': [int(v) for v in tile_shape],
        'n_tiles_kji': [int(v) for v in kji_range],
        'in_file': str(in_file),
    }


if __name__ == '__main__':

    trials = 2
    block_shape = [1, 2, 2]
    in_file = 'dataset.json'
    full_shape = np.uint32([20, 10000, 10000])

    main(in_file, trials, full_shape, block_shape)
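
Taken together, a minimal end-to-end run looks like the sketch below, assuming both scripts sit in the same directory (note these defaults write roughly 2 GB of random hdf5 data under data/):

    import make_nested
    import test_nested

    # 8 folders (2x2x2), each holding 10x10x10 tile files
    make_nested.main('dataset.json', [10, 10, 10], [[2, 2, 2]])
    # Time 2 trials reading every 1x250x250 block of a 20x10000x10000 volume
    result = test_nested.main('dataset.json', 2, [20, 10000, 10000], [1, 2, 2])
    print(result['mean_time'])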

