quick h5 speed test
thejohnhoffer committed Apr 17, 2017
1 parent 64fea6e commit e768f28
Showing 4 changed files with 369 additions and 0 deletions.
98 changes: 98 additions & 0 deletions scripts/test/make_nested.py
@@ -0,0 +1,98 @@
from functools import reduce  # reduce is not a builtin on Python 3
import numpy as np
import json

def flatten(output, item):
    # Collect every dict that carries a 'path', at any depth
    if 'path' in item:
        return output + [item]
    if not isinstance(item, dict):
        return output
    return reduce(flatten, item.values(), output)

def values(res, key, nest=[], **keywords):
    name = '{}_{}_{}'.format(*key)
    # Format the nested paths
    paths = '/'.join(nest)
    # add values to dictionary
    if 'path' in keywords:
        fmt = keywords['path']
        # Add the full formatted path
        full = fmt.format(paths, *key)
        keywords['path'] = full
    # Add the position list
    keywords['nest'] = nest + [name]
    # add to parent dictionary
    res[name] = keywords

def add(res, keys, nest=[], **keywords):
    # Only recurse into child dicts; the stored 'nest' lists are not levels
    targets = [t for t in res if isinstance(res[t], dict)]
    if not targets:
        for key in keys:
            # add values to this level
            values(res, key, nest, **keywords)
    else:
        for target in targets:
            # Add to the list of nest levels
            nested = nest + [target]
            # add next levels to this level
            add(res[target], keys, nested, **keywords)

def add_all(result, shape, **keywords):
    # Add all possible keys at this level (a list, so it survives reuse)
    all_keys = np.where(np.ones(shape))
    add(result, list(zip(*all_keys)), **keywords)
    # Get the result
    return result


def nester(tile_shape, group_shapes):

    result = {}
    file_fmt = 'data/{0}/raw_{1:04d}_{2:04d}_{3:04d}.h5'

    # Add all the folder levels
    for shape in group_shapes:
        result = add_all(result, shape)
    # Add all the tile endpoints
    result = add_all(result, tile_shape, path=file_fmt)

    # Get full result
    return result

class Simplifier():
    def __init__(self, nesting):
        self.nesting = nesting

    def simplify(self, item):
        nests = [n.split('_') for n in item['nest']]
        offset = (self.nesting * np.uint32(nests)).sum(0)
        return {
            'path': item['path'],
            # plain ints, so json can serialize the offsets
            'offset': [int(x) for x in offset],
        }

def main(out_file, tile_shape, group_shapes):
    # Get file path structures
    nested = nester(tile_shape, group_shapes)
    listed = reduce(flatten, nested.values(), [])

    # Get the sizes for each nesting
    levels = [tile_shape] + group_shapes[:-1]
    # Get the cumulative effect of each layer
    nesting = np.cumprod(levels, axis=0)
    nesting = np.r_[nesting[::-1], [[1, 1, 1]]]
    # Simplify
    output = list(map(Simplifier(nesting).simplify, listed))

    # Document the model
    with open(out_file, 'w') as fd:
        json.dump(output, fd, indent=4)

if __name__ == '__main__':
    out_file = 'dataset.json'
    tile_shape = [10, 10, 10]
    group_shapes = [
        [2, 2, 2],
    ]
    main(out_file, tile_shape, group_shapes)
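
For reference, with the defaults above each entry written to dataset.json should look something like this (this particular path and offset are illustrative):

    {
        "path": "data/0_0_0/raw_0000_0000_0001.h5",
        "offset": [0, 0, 1]
    }

The offset counts tiles: the folder index scaled by the tile shape, plus the tile index inside that folder.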

85 changes: 85 additions & 0 deletions scripts/test/speed_demo.py
@@ -0,0 +1,85 @@
import make_nested
import test_nested
import numpy as np
import shutil
import json
import os

def get_filename(tile_shape, group_shape):
    t_n = np.prod(tile_shape)
    g_n = np.prod(group_shape)
    return '{}_in_{}.json'.format(t_n, g_n)

def make_one(shapes):
    tile_shape, group_shape = shapes
    out_file = get_filename(tile_shape, group_shape)
    return make_nested.main(out_file, tile_shape, [group_shape])

def test_one(shapes, trials, full_shape, file_block):
    if os.path.exists('data'):
        shutil.rmtree('data')
    tile_shape, group_shape = shapes
    in_file = get_filename(tile_shape, group_shape)
    return test_nested.main(in_file, trials, full_shape, file_block)

if __name__ == '__main__':
    tile_group_shapes = [
        # [ [1,1,1], [2,2,2] ],
        # [ [2,2,2], [2,2,2] ],
        # [ [4,4,4], [2,2,2] ],
        [ [1,1,1], [1,1,1] ],
        [ [1,1,2], [1,1,1] ],
        [ [1,2,2], [1,1,1] ],
        [ [1,2,4], [1,1,1] ],
        [ [1,4,4], [1,1,1] ],
        # [ [8,8,8], [1,1,1] ],
    ]

    # Make a json file for each tile/group shape pair
    # (a loop, so the calls also run under Python 3's lazy map)
    for shapes in tile_group_shapes:
        make_one(shapes)

    trials = 2
    full_shape = [8, 8192, 8192]
    file_blocks = [
        [1, 1, 1],
        [1, 1, 2],
        [1, 2, 2],
        [1, 2, 4],
        [1, 4, 4],
        [1, 4, 8],
        [1, 8, 8],
        [1, 16, 8],
        [1, 16, 16],
        [1, 32, 16],
        [1, 32, 32],
    ]
    speeds = []
    # Make and test hdf5 files
    for t_g in tile_group_shapes:
        for f_b in file_blocks:
            # Set a max cutoff at 1024 blocks
            n_blocks = np.prod(f_b * np.multiply(*t_g))
            if n_blocks > 1024:
                continue
            # Get the speed for the requested sizes
            speed = test_one(t_g, trials, full_shape, f_b)
            if not speed:
                continue
            speed.update({
                'n_folders': t_g[1],
                'n_trials': trials,
            })
            speeds.append(speed)
            msg = """
    mean of {n_trials} trials: {mean_time} for
    {n_tiles_kji} tiles from {n_files_zyx} files
    over {n_folders} folders.""".format(**speed)
            print(msg)

    # Document the model
    speed_fmt = 'speed_{}_{}_{}.json'
    speed_out = speed_fmt.format(*full_shape)
    # Write the model to json
    with open(speed_out, 'w') as fd:
        json.dump(speeds, fd, indent=4)
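
As a sanity check on the 1024-block cutoff above, this sketch repeats the same arithmetic the inner loop performs, for one pair of shapes picked purely as an example:

    import numpy as np

    t_g = [[1, 4, 4], [1, 1, 1]]  # one tile/group shape pair
    f_b = [1, 32, 32]             # one file block shape
    # blocks = file blocks times tiles times groups, per axis
    n_blocks = np.prod(f_b * np.multiply(*t_g))
    print(n_blocks)  # 16384, over the cutoff, so this pair is skipped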

53 changes: 53 additions & 0 deletions scripts/test/speed_graph.py
@@ -0,0 +1,53 @@
import matplotlib.pyplot as plt
import numpy as np
import json

def n_key_filter(n, key):
    def trial_filter(trial):
        return n == np.prod(trial[key])
    return trial_filter

if __name__ == '__main__':

    full_shape = [8, 8192, 8192]
    in_fmt = 'speed_{}_{}_{}.json'
    in_file = in_fmt.format(*full_shape)

    # Load the model
    with open(in_file, 'r') as fd:
        trials = json.load(fd)

    #####
    # TEST 1
    # Plot time by number of h5 files used
    # Plot lines for every different number of tiles
    #####
    fig, ax = plt.subplots()
    # Filter only for trials with files in one folder
    # (keep a list, so the filter survives reuse across loop iterations)
    in_one_folder = n_key_filter(1, 'n_folders')
    one_folder = list(filter(in_one_folder, trials))
    # Filter for trials with 1, 2, 4, 8, or 16 files
    for n_files in [1, 2, 4, 8, 16]:
        has_n_files = n_key_filter(n_files, 'n_files_zyx')
        n_file_dicts = filter(has_n_files, one_folder)
        # Now, get number of tiles and time
        def get_tile_times(d):
            n_tiles = np.prod(d['n_tiles_kji'])
            mean_time = d['mean_time']
            return [n_tiles, mean_time]
        tile_times = list(map(get_tile_times, n_file_dicts))
        # Skip file counts that were never measured
        if not tile_times:
            continue
        cause, result = zip(*tile_times)
        ax.plot(cause, result, label='{} files'.format(n_files))

    legend = ax.legend(loc='upper left')
    # Set the fontsize
    for label in legend.get_texts():
        label.set_fontsize('large')

    # Save to file
    plt.ylabel('time (seconds)')
    plt.xlabel('total number of tiles')
    title = 'time to load {}x{}x{} hdf5'.format(*full_shape)
    plt.title(title)
    plt.xscale('log', basex=2)  # newer matplotlib spells this 'base=2'
    plt.savefig('out.png')
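
For reference, each record in the speed json merges the dict returned by test_nested.main with the keys added in speed_demo.py, so a single trial entry should look roughly like this (all numbers illustrative):

    {
        "mean_time": 0.42,
        "time": [0.43, 0.41],
        "zyx_shape": [8, 2048, 2048],
        "kji_shape": [8, 1024, 1024],
        "n_files_zyx": [1, 4, 4],
        "n_tiles_kji": [1, 8, 8],
        "n_folders": [1, 1, 1],
        "n_trials": 2,
        "in_file": "16_in_1.json"
    }
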
133 changes: 133 additions & 0 deletions scripts/test/test_nested.py
@@ -0,0 +1,133 @@
from numpy.random import randint
from functools import reduce  # reduce is not a builtin on Python 3
import numpy as np
import json
import h5py
import time
import os

class Maker():
    def __init__(self, size, dtype):
        self.size = size
        self.dtype = dtype

    def make(self, filedata):
        filename = filedata['path']
        # Make the parent directory
        directory = os.path.dirname(filename)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Get the datatype, noise range, and size
        dtype = getattr(np, 'uint{}'.format(self.dtype))
        dmax = 2 ** self.dtype
        dsize = self.size
        # Create the file from a path
        with h5py.File(filename, 'w') as fd:
            # Make a random uint array
            pattern = randint(dmax, size=dsize, dtype=dtype)
            fd.create_dataset('stack', data=pattern)

class Hasher():
    def __init__(self):
        self.offsets = []
    @property
    def shape(self):
        max_offset = np.max(self.offsets, 0)
        return (max_offset + 1)

    def hash(self, hashed, filedata):
        offset = filedata['offset']
        # Track all the offsets
        self.offsets.append(offset)
        # Map the offset to the path
        hashed[str(offset)] = filedata['path']
        return hashed

class Tester():
    def __init__(self, files, zyx_shape, kji_shape):
        self.zyx_shape = np.array(zyx_shape)
        self.kji_shape = np.array(kji_shape)
        self.int_shape = np.uint32(kji_shape)
        self.files = files

    def load(self, path, start):
        with h5py.File(path, 'r') as fd:
            dataset = fd[list(fd.keys())[0]]
            k0, j0, i0 = start
            k1, j1, i1 = start + self.int_shape
            # Read (and discard) one block just to time the access
            dataset[k0:k1, j0:j1, i0:i1]

    def test(self, kji):
        # Get xyz tile file
        kji_full = kji * self.kji_shape
        zyx = np.floor(kji_full / self.zyx_shape)
        zyx_full = np.uint32(zyx * self.zyx_shape)
        # Get file path and file start
        # (plain ints, so the key matches the json offsets)
        file_key = str([int(v) for v in zyx])
        file_path = self.files.get(file_key)
        if not file_path:
            raise KeyError(file_key)
        file_start = np.uint32(kji_full - zyx_full)
        # Load file from path and start
        self.load(file_path, file_start)

def main(in_file, trials, full_shape, block_shape):
    # Load the model
    with open(in_file, 'r') as fd:
        listed = json.load(fd)

    # Hash the files
    hasher = Hasher()
    model = reduce(hasher.hash, listed, {})
    tile_shape = np.uint32(hasher.shape)

    # Get the per-file shape (integer division keeps integers on Python 3)
    zyx_shape = np.uint32(full_shape) // tile_shape
    # Get the block shape
    kji_shape = zyx_shape // block_shape
    # Fill the files with 8-bit noise
    dtype = 8

    # Make the files
    make = Maker(zyx_shape, dtype)
    for filedata in listed:
        make.make(filedata)

    # Get all kji blocks
    kji_range = full_shape // kji_shape
    kji_count = np.prod(tile_shape * block_shape)
    # Ensure consistency
    if np.prod(kji_range) != kji_count:
        return {}
    # Get all the block indexes (a list, so it can be reused every trial)
    all_kji = list(zip(*np.where(np.ones(kji_range))))

    # Test the files
    time_results = []
    test = Tester(model, zyx_shape, kji_shape)
    # Run the testing
    for t in range(trials):
        time_start = time.time()
        for kji in all_kji:
            test.test(kji)
        time_end = time.time()
        # Record time difference
        time_diff = time_end - time_start
        time_results.append(time_diff)

    return {
        'time': time_results,
        'mean_time': float(np.mean(time_results)),
        'zyx_shape': [int(v) for v in zyx_shape],
        'kji_shape': [int(v) for v in kji_shape],
        'n_files_zyx': [int(v) for v in tile_shape],
        'n_tiles_kji': [int(v) for v in kji_range],
        'in_file': str(in_file),
    }


if __name__ == '__main__':

    trials = 2
    block_shape = [1, 2, 2]
    in_file = 'dataset.json'
    full_shape = np.uint32([20, 10000, 10000])

    main(in_file, trials, full_shape, block_shape)
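
Taken together, a minimal end-to-end run looks like the sketch below, assuming both scripts sit in the same directory (note these defaults write roughly 2 GB of random hdf5 data under data/):

    import make_nested
    import test_nested

    # 8 folders (2x2x2), each holding 10x10x10 tile files
    make_nested.main('dataset.json', [10, 10, 10], [[2, 2, 2]])
    # Time 2 trials reading every 1x250x250 block of a 20x10000x10000 volume
    result = test_nested.main('dataset.json', 2, [20, 10000, 10000], [1, 2, 2])
    print(result['mean_time'])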

