In [None]:
from __future__ import print_function
from __future__ import absolute_import

from ku import generators as gr
from ku import generic as gen
from ku import image_utils as iu
from ku import model_helper as mh
from ku import image_augmenter as aug

from munch import Munch
import pandas as pd, numpy as np
import pytest, shutil, os
from matplotlib import pyplot as plt

%load_ext autoreload
%autoreload 2

In [None]:
# Default generator settings shared by the cells below; individual cells copy
# or update this Munch before building a generator.
gen_params = Munch(batch_size    = 2,
                   data_path     = 'images',
                   input_shape   = (224,224,5),
                   inputs        = ['filename'],
                   outputs       = ['score'],
                   shuffle       = False,
                   fixed_batches = True)

# Table of samples: one row per image file name together with its score.
ids = pd.read_csv(u'ids.csv', encoding='latin-1')

# Sanity-check the fixture. These comparisons used to be evaluated and thrown
# away, so a malformed ids.csv would have gone unnoticed.
assert np.all(ids.columns == ['filename', 'score'])
assert np.all(ids.score == range(1,5))

In [None]:
def preproc(im, arg1, arg2):
    """Return a one-element float array holding ``0 + arg1 + arg2``.

    The image `im` is accepted for signature compatibility but is not used.
    """
    zero = np.zeros(1)
    return zero + arg1 + arg2

# Work on a per-cell copy so the shared defaults in `gen_params` stay untouched.
gen_params_local = gen_params.copy()
gen_params_local.update(process_fn   = preproc,
                        process_args = {'filename': ['filename_args','filename_args']},
                        batch_size   = 4)

# Extra column 0..n-1 that is handed to `preproc` through `process_args`.
ids_local = ids.copy()
ids_local['filename_args'] = range(len(ids_local))

g = gr.DataGeneratorDisk(ids_local, **gen_params_local)
x = g[0]
# gen.pretty(g)
# `preproc` returns arg1 + arg2; the assertion expects both to be the row's
# `filename_args` value, i.e. 2*i for row i.
first_input = x[0][0]
assert np.array_equal(np.squeeze(first_input.T), 2*np.arange(4))

In [None]:
g = gr.DataGeneratorDisk(ids, **gen_params)

# The checks below used to be printed (or evaluated and discarded); assert
# them so a regression actually fails the cell.
# Outputs come back as a list holding one array per output entry.
assert isinstance(g[0][1], list)
assert np.array_equal(g[0][1][0], np.array([[1],[2]]))

assert gen.get_sizes(g[0]) == '([array<2,224,224,3>], [array<2,1>])'

In [None]:
# read_fn = lambda p: iu.resize_image(iu.read_image(p), (100,100))
# g = gr.DataGeneratorDisk(ids, read_fn=read_fn, **gen_params)
# gen.get_sizes(g[0]) =='([array<2,100,100,3>], [array<2,1>])'

In [None]:
# # reload(gen)
# x = np.array([[1,2,3]])
# print(gen.get_sizes(([x.T],1,[4,5])))
# y = np.array([[1,[1,2]]])
# print(gen.get_sizes(y))
# z = [g[0],([2],)]
# print(gen.get_sizes(z[1]))

In [None]:
# Request the image column twice; the assertion expects two image inputs of
# identical size. NOTE: this edits the shared `gen_params`, not a copy.
gen_params.inputs = ['filename', 'filename']
g = gr.DataGeneratorDisk(ids, **gen_params)
assert gen.get_sizes(g[0]) == '([array<2,224,224,3>, array<2,224,224,3>], [array<2,1>])'

# From here on the same generator object is reconfigured in place; each step
# relies on the attribute values left behind by the previous one.
g.inputs_df = ['score', 'score']
g.inputs = []
g.outputs = []
# NOTE(review): the result is not compared with anything, so this line only
# verifies that a batch can be built with this configuration.
gen.get_sizes(g[0])

# Nested lists in `inputs_df`: the assertion expects one array per inner list,
# with as many columns as the inner list has names.
g.inputs_df = [['score'], ['score','score']]

assert gen.get_sizes(g[0]) == '([array<2,1>, array<2,2>], [])'

# Outputs only, no inputs of either kind.
g.inputs_df = []
g.outputs = ['score']
assert gen.get_sizes(g[0]) == '([], [array<2,1>])'

# Mixing a bare column name with a list is expected to be rejected.
g.outputs = ['score',['score']]
with pytest.raises(AssertionError): g[0]

# All-list form: one output array per inner list.
g.outputs = [['score'],['score']]
assert gen.get_sizes(g[0]) == '([], [array<2,1>, array<2,1>])'

In [None]:
# Store every score as a one-element row, keyed by its file name, in a fresh
# HDF5 file (the file is also used by later cells).
with gen.H5Helper('data.h5', overwrite=True) as h:
    data = np.array(ids.score)[:, np.newaxis]
    h.write_data(data, list(ids.filename))

# Read the same keys back and compare with the values written above.
with gen.H5Helper('data.h5', 'r') as h:
    data = h.read_data(list(ids.filename))
    expected = np.array([[1],[2],[3],[4]])
    assert all(data == expected)

In [None]:
# Point the shared settings at the HDF5 file written earlier and read one
# input per row from it. NOTE: this edits the shared `gen_params`, not a copy.
gen_params.update(data_path='data.h5', 
                  inputs=['filename'],
                  batch_size=2)
gen.pretty(gen_params)
g = gr.DataGeneratorHDF5(ids, **gen_params)
assert gen.get_sizes(g[0]) == '([array<2,1>], [array<2,1>])'

# From here on the same generator object is reconfigured in place; each step
# relies on the attribute values left behind by the previous one.
# Flat list in `inputs_df`: the assertion expects a single two-column array.
g.inputs_df = ['score', 'score']
g.inputs = []
g.outputs = []
assert gen.get_sizes(g[0]) == '([array<2,2>], [])'

# Nested lists: the assertion expects one array per inner list.
g.inputs_df = [['score'], ['score','score']]
assert gen.get_sizes(g[0]) == '([array<2,1>, array<2,2>], [])'

# Outputs only, no inputs of either kind.
g.inputs_df = []
g.outputs = ['score']
assert gen.get_sizes(g[0]) == '([], [array<2,1>])'

# Mixing a bare column name with a list is expected to be rejected.
g.outputs = ['score',['score']]
with pytest.raises(AssertionError): g[0]

# All-list form: one output array per inner list.
g.outputs = [['score'],['score']]
assert gen.get_sizes(g[0]) == '([], [array<2,1>, array<2,1>])'

In [None]:
# Toy table: five feature values and a 0/1 mask selecting rows 0, 2 and 3.
d = dict(features=[1, 2, 3, 4, 5],
         mask=[1, 0, 1, 1, 0])
df = pd.DataFrame(d)

def filter_features(df):
    """Return the `features` values of the rows whose `mask` equals 1.

    The result is a 2-D array with one column and one row per selected entry.
    """
    keep = df['mask'] == 1
    selected = df.loc[keep, ['features']]
    return np.array(selected)

# Use a callable as `outputs`: per the assertions below, the generator then
# returns whatever `filter_features` produces for the batch.
# NOTE: this edits the shared `gen_params`, not a copy.
gen_params.update(dict(data_path  = None,
                       outputs    = filter_features,
                       inputs     = [],
                       inputs_df  = ['features'],
                       shuffle    = False,
                       batch_size = 5))
# gen.pretty(gen_params)

g = gr.DataGeneratorHDF5(df, **gen_params)
batch = g[0]
assert gen.get_sizes(batch) == '([array<5,1>], array<3,1>)'
# All five feature values go in; only the three masked-in rows come out.
assert all(np.squeeze(batch[0]) == np.arange(1,6))
assert all(np.squeeze(batch[1]) == [1,3,4])

In [None]:
# Expected result: a 5x5 image of zeros whose central 3x3 patch is set to one.
m = np.zeros((5,5,3))
c = np.zeros_like(m)
c[1:4, 1:4] = 1

patched = aug.cropout_patch(m, patch_size=(3,3), patch_position=(0.5,0.5), fill_val=1)
assert np.array_equal(patched, c)

# Visual check: show two independently drawn random patches.
m = np.zeros((256,256,3))
for _ in range(2):
    plt.imshow(aug.cropout_random_patch(m.copy(), patch_size=(128,128), fill_val=1))
    plt.show()

In [None]:
from ku import image_utils as iu
# The augmenter built through `image_utils` must be an instance of the class in `image_augmenter`.
assert isinstance(iu.ImageAugmenter(np.ones(1)), aug.ImageAugmenter)

# m: all zeros; c: zeros with the central 3x3 patch set to one.
m = np.zeros((5,5,3))
c = np.zeros_like(m)
c[1:4, 1:4] = 1

assert np.array_equal(aug.cropout_patch(m, patch_size=(3,3), patch_position=(0.5,0.5), fill_val=1), c)

# Filling the centre of `c` with 1 leaves it unchanged; filling with 0 gives `m`.
for fill_val, expected in [(1, c), (0, m)]:
    assert np.array_equal(aug.ImageAugmenter(c).cropout((3,3), crop_pos=(0.5,0.5), fill_val=fill_val).result, expected)

# Cropping the centre of `c` returns the all-ones patch.
assert np.array_equal(aug.ImageAugmenter(c).crop((3,3), crop_pos=(0.5,0.5)).result, np.ones((3,3,3)))

In [None]:
m = np.zeros((5,5,3))
# Independent copies are required here: `[m]*2` would bind `ml` and `mr` to the
# very same array as `m`, so both corner writes below would land in one
# (left-right symmetric) image and the flip assertions could never fail.
ml, mr = m.copy(), m.copy()
ml[0:2,0:2,:] = 1   # ones in the top-left corner
mr[0:2,-2:,:] = 1   # ones in the top-right corner

# An all-zero image is unchanged by a horizontal flip ...
assert np.array_equal(iu.ImageAugmenter(m).fliplr().result, m)
# ... and flipping the top-left marker must move it to the top-right.
assert np.array_equal(iu.ImageAugmenter(ml).fliplr().result, mr)

In [None]:
# reload(gr)

def preproc(im, *arg):
    """Return `im` untouched when called without extra arguments.

    With extra arguments, return an array of the same shape as `im` holding
    zeros plus the tuple of extra arguments (added with NumPy broadcasting).
    """
    if not arg:
        return im
    return np.zeros(im.shape) + arg

# Per-cell copy of the shared settings: two HDF5 inputs, of which only
# 'filename' receives an extra argument column through `process_args`.
gen_params_local = gen_params.copy()
gen_params_local.update(dict(process_fn   = preproc,
                             data_path    = 'data.h5',
                             inputs       = ['filename', 'filename1'],
                             process_args = {'filename' :'args'},
                             batch_size   = 4,
                             shuffle      = False))

# Duplicate the key column and add an ascending and a descending counter.
ids_local = ids.copy()
num_rows = len(ids_local)
ids_local['filename1'] = ids_local['filename']
ids_local['args'] = range(num_rows)
ids_local['args1'] = range(num_rows, 0, -1)

g = gr.DataGeneratorHDF5(ids_local, **gen_params_local)

# The first input is expected to carry the 'args' values (0..3); the second
# input and the output are expected to carry the stored scores (1..4).
assert np.array_equal(np.squeeze(g[0][0][0]), np.arange(4))
assert np.array_equal(np.squeeze(g[0][0][1]), np.arange(1,5))
assert np.array_equal(np.squeeze(g[0][1]), np.arange(1,5))

In [None]:
# Micro-benchmark: build one 2-D float32 array from a list of 1-D arrays,
# once with np.stack and once by filling a pre-allocated array row by row.
# Original note: np.stack is much faster on float32, and still faster for float16 data
# NOTE(review): only float32 is exercised here; the list alone holds
# 10000 x 100000 float32 values, so this cell needs several GB of memory.
data_elem = np.arange(100000, dtype=np.float32)
data = [data_elem.copy() for i in range(10000)]

# Variant 1: stack all rows at once, then convert to float32.
with gen.Timer('stack, convert float32'):
    data_new_stack = np.float32(np.stack(data))

# Variant 2: allocate the destination on the first iteration, then copy row by row.
with gen.Timer('iterate, init float32'):
    data_new = None
    for i, d in enumerate(data):
        if data_new is None:
            data_new = np.zeros((len(data),)+d.shape, dtype=np.float32)
        data_new[i, ...] = d

# Both variants must produce the same array.
assert np.array_equal(data_new, data_new_stack)
gen.print_sizes(data_new)
gen.print_sizes(data_new_stack)

In [None]:
# No explicit reload here: the bare `reload` builtin does not exist on
# Python 3 (NameError), and `%autoreload 2` from the first cell already
# re-imports edited modules before each cell runs.

# A process_fn that returns a list turns one image column into two inputs.
gen_params_ = gen_params.copy()
gen_params_.process_fn = lambda im: [im, im+1]

g = gr.DataGeneratorDisk(ids, **gen_params_)
gen.print_sizes(g[0])
# The second input must be the first one shifted by +1; outputs are unaffected.
assert np.array_equal(g[0][0][0], g[0][0][1]-1)
assert np.array_equal(g[0][1][0], np.array([[1],[2]]))

In [None]:
def read_fn(*args):
    """Stand-in reader: return a 3x3 array filled with the score of a file.

    `args[0]` is the file name and `args[1]` an object whose `ids` table has
    `filename` and `score` columns; any further arguments are ignored.
    """
    file_name, generator = args[0], args[1]
    matching_rows = generator.ids[generator.ids.filename == file_name]
    score = np.float32(matching_rows.score)
    return np.ones((3,3)) * score

# Per-cell copy: custom reader plus a process_fn that yields two inputs.
gen_params_local = gen_params.copy()
gen_params_local.update(batch_size = 3,
                        read_fn    = read_fn,
                        process_fn = lambda im: [im, im+1])

g = gr.DataGeneratorDisk(ids, **gen_params_local)
gen.print_sizes(g[0])
print(g[0][0][1])
# Second input is the first shifted by +1 ...
assert np.array_equal(g[0][0][1]-1, g[0][0][0])
# ... so for the first sample the shifted input is expected to be all twos.
assert np.array_equal(g[0][0][1][0,...], np.full((3,3), 2.))

In [None]:
# Shuffling the patches of a constant image cannot change it.
m = np.ones((4,4))
shuffled = aug.imshuffle(m, [2,2])
assert np.array_equal(shuffled, np.ones((4,4)))

# With one zero column, a shuffle over a 4x4 grid must keep exactly four zeros,
# and a 1x1 grid (a single patch) must return the image as is.
m[:,0] = 0
assert np.count_nonzero(aug.imshuffle(m, [4,4]) == 0) == 4
assert np.array_equal(aug.imshuffle(m, [1,1]), m)

# Randomised checks on a 2x2 image containing a single one.
m = np.zeros((2,2))
m[0,0] = 1
for _ in range(1000):
    mixed_same = aug.imshuffle_pair(m, m, [2,2])
    assert np.sum(mixed_same) <= 2
    mixed_inverse = aug.imshuffle_pair(m, 1-m, [2,2])
    assert np.sum(mixed_inverse) >= 1

In [None]:
m1, m2 = np.ones((4,4)), np.zeros((4,4))

# On a 4x4 grid the sum of the mix is expected to be exactly ratio * 16.
for _ in range(1000):
    for ratio in [0,0.25,0.5,0.75,1]:
        mixed = aug.imshuffle_pair(m1, m2, [4,4], ratio)
        assert np.sum(mixed) == ratio*m1.size

# On a 1x4 grid the expected sums move in steps of 4 (one patch each).
for ratio, total in [(0.6, 8), (0.7, 12), (0.75, 12), (0.8, 12), (0.9, 16), (1, 16)]:
    assert np.sum(aug.imshuffle_pair(m1, m2, [1,4], ratio)) == total

In [None]:
m1, m2 = np.ones((4,4)), np.zeros((4,4))

# Mix once with and once without `flip`; the totals are expected to match.
mix1, mix2 = [aug.imshuffle_pair(m1, m2, [2,2], flip=flip)
              for flip in (True, False)]

assert np.sum(mix2) == np.sum(mix1)

In [None]:
# Build a table of `size` rows; the image names cycle through the files listed in ids.csv.
# NOTE: this cell rebinds the notebook-wide `ids` and `gen_params`.
size = 10
ids_defa = pd.read_csv(u'ids.csv', encoding='latin-1')
fnames = np.concatenate([ids_defa.filename.values]*3)[:size]
# `group` uses `size` like every other column; the previous hard-coded
# range(10) would produce a length mismatch as soon as `size` is changed.
ids  = pd.DataFrame(dict(cats  = ['cat{}'.format(i) for i in range(size)],
                        dogs  = ['dog{}'.format(i) for i in range(size)],
                        image_name = fnames,
                        group = [i//4 for i in range(size)]))

gen_params = Munch(batch_size    = 1,
                   inputs        = ['image_name'],
                   outputs       = ['dogs'],
                   data_path     = 'images',
                   group_by      = 'group',
                   shuffle       = False,
                   fixed_batches = True)


# Expected number of batches per batch size (these values assume size == 10).
for batch_size, len_g in zip(range(1, 5), [10, 5, 5, 3]):
    gen_params.batch_size = batch_size
    g = gr.DataGeneratorDisk(ids, **gen_params)
    # gen.print_sizes(g[0])
#     print('num batches:',len(g))
    assert len(g)==len_g
    # Every batch must hold a single group: the mean of the group ids within a
    # batch then equals the last group id of that batch.
    a = g.ids_index.groupby('batch_index').group_by.mean().values
    b = g.ids_index.groupby('batch_index').group_by.last().values
    assert np.array_equal(a, b)

In [None]:
# Without grouping and with fixed batches, the assertions expect
# 10 // batch_size batches (the incomplete last batch does not count).
gen_params.group_by = None
for batch_size, len_g in enumerate([10, 5, 3, 2], start=1):
    gen_params['batch_size'] = batch_size
    g = gr.DataGeneratorDisk(ids, **gen_params)
#     print('num batches:',len(g))
    assert len_g == len(g)
#     display(g.ids_index)

# With variable batches the incomplete last batch is counted as well.
gen_params.fixed_batches = False
for batch_size, len_g in enumerate([10, 5, 4, 3], start=1):
    gen_params['batch_size'] = batch_size
    g = gr.DataGeneratorDisk(ids, **gen_params)
#     print('num batches:',len(g))
    assert len_g == len(g)
#     display(g.ids_index)

# g.ids_index.sort_values('batch_index')
# iu.view_stack(gen.mapmm(g[0][0][0]))

In [None]:
# Resize every image into a scratch folder. The keyword is spelled `overwrite`
# to match the other `resize_folder` calls in this notebook (this call
# previously used `over_write`).
iu.resize_folder('images/', 'images_temp/',
                 image_size_dst=(50,50), overwrite=True)
image_list = iu.glob_images('images_temp', verbose=False)
assert image_list
ims = iu.read_image_batch(image_list)
assert ims.shape == (4, 50, 50, 3)

# All resized images must pass the integrity check.
failed_images, all_images = iu.check_images('images_temp/')
assert len(failed_images)==0
assert len(all_images)==4

# Store the images in an HDF5 file; its keys are expected to be the sorted image names.
iu.save_images_to_h5('images_temp', 'images.h5', 
                     overwrite=True)
# H5Helper is accessed through `gen`, as in the other cells that use it.
with gen.H5Helper('images.h5') as h:
    assert list(h.hf.keys()) == sorted(all_images)

# Remove the scratch artefacts.
shutil.rmtree('images_temp')
os.unlink('images.h5')

In [None]:
# Source folder and scratch destination for the augmentation run below.
path_src, path_dst = 'images/', 'images_aug/'

def process_gen():
    """Yield ``(fn, params)`` pairs, one per patch grid in {1,2,4,8} x {1,2,4,8}.

    `fn` takes an image and shuffles it with `aug.imshuffle` using that grid;
    `params` is ``dict(num_patch=<grid>)`` describing the setting.
    """
    def make_shuffler(num_patch):
        # Bind the grid per function. A lambda written directly in the loop
        # looks `num_patch` up when it is *called*, so any function kept
        # around while the generator advances would use a later grid.
        return lambda im: aug.imshuffle(im, num_patch)

    for num_patch in [(i,j) for i in [1,2,4,8] for j in [1,2,4,8]]:
        yield make_shuffler(num_patch), dict(num_patch=num_patch)
        
ids_aug, errors = iu.augment_folder(path_src, path_dst, process_gen,
                                    verbose=False)

# 16 grid settings come from process_gen; 64 rows presumably means four
# source images per setting -- TODO confirm against augment_folder.
assert len(errors) == 0
assert len(ids_aug) == 64

# os.path.split returns (directory, last component). Despite its name, `ext`
# receives the last path component, not a file extension; it is unused below.
first_row = ids_aug.iloc[0]
(image_path, ext) = os.path.split(first_row.image_path)
_, file_names = iu.glob_images('{}{}/'.format(path_dst, image_path), split=True)

# The files on disk for the first setting must match the first group of the table.
first_group = next(iter(ids_aug.groupby('num_patch')))[1]
first_group_names = first_group.image_name
assert sorted(file_names) == sorted(first_group_names)

# Remove the scratch folder.
shutil.rmtree(path_dst)

In [None]:
# Second copy of the images at 100x100, so the two folders differ in size.
iu.resize_folder('images/', 'images1/', image_size_dst=(100,100), overwrite=True)

gp = gen_params.copy()
gp.inputs = ['filename']
gp.group_names = ['images/']
gp.data_path   = ''
g = gr.DataGeneratorDisk(ids, **gp)
assert gen.get_sizes(g[0]) == '([array<2,224,224,3>], [array<2,1>])'

# Two group names: the assertion expects one input per folder.
gp.group_names = ['images/', 'images1/']
g = gr.DataGeneratorDisk(ids, **gp)
assert gen.get_sizes(g[0]) == '([array<2,224,224,3>, array<2,100,100,3>], [array<2,1>])'

# Nested group names: over 100 fresh generators, the input size is expected to
# vary, i.e. both folders get used.
gp.group_names = [['images/'], ['images1/']]
sizes = []
for i in range(100):
    g = gr.DataGeneratorDisk(ids, **gp)
    sizes.append(g[0][0][0].shape[1])

assert np.unique(sizes).shape[0]>1

# Clean up with shutil like the other cells do, instead of shelling out to
# `rm -R` (which needs IPython and a POSIX shell).
shutil.rmtree('images1')

In [None]:
# Two resized copies of the images under a common parent folder.
iu.resize_folder('images/', 'base/images100/', image_size_dst=(100,100), overwrite=True)
iu.resize_folder('images/', 'base/images50/', image_size_dst=(50,50), overwrite=True)

gp = gen_params.copy()
gp.inputs       = ['filename']
gp.data_path    = ''
gp.group_names  = ['base']
gp.random_group = True
g = gr.DataGeneratorDisk(ids, **gp)
# Over 100 passes through the generator, both image sizes (and only those)
# are expected to show up.
assert np.array_equal(np.unique([x[0][0].shape[1] 
                          for i in range(100) for x in g]), [50,100])

# Clean up with shutil like the other cells do, instead of shelling out to
# `rm -R` (which needs IPython and a POSIX shell).
shutil.rmtree('base')

In [None]:
# Ten rows: inputs count up (0..9), outputs count down (9..0).
# NOTE: this cell rebinds the notebook-wide `ids` and `gen_params`.
ids = pd.DataFrame(dict(a = range(10), 
                        b = list(range(9,-1,-1))))
gen_params = Munch(batch_size    = 4,
                   data_path     = None,
                   input_shape   = None,
                   inputs_df     = ['a'],
                   outputs       = ['b'],
                   shuffle       = False,
                   fixed_batches = True)

# check fixed batches switch
g = gr.DataGeneratorDisk(ids, **gen_params)
assert np.array_equal([gen.get_sizes(x) for x in g], 
                      ['([array<4,1>], [array<4,1>])', 
                       '([array<4,1>], [array<4,1>])'])
assert np.array_equal(g[0][0][0].squeeze(), range(4))

gen_params.fixed_batches = False
g = gr.DataGeneratorDisk(ids, **gen_params)
assert np.array_equal([gen.get_sizes(x) for x in g], 
                      ['([array<4,1>], [array<4,1>])',
                       '([array<4,1>], [array<4,1>])',
                       '([array<2,1>], [array<2,1>])'])
assert np.array_equal(g[2][0][0].squeeze(), [8, 9])

# check randomized
gen_params.shuffle = True
gen_params.fixed_batches = False # maintain
g = gr.DataGeneratorDisk(ids, **gen_params)

# check if it returns all items
data = list(zip(*list(g)))
data0 = np.concatenate([l[0] for l in data[0]], axis=0).squeeze()
data1 = np.concatenate([l[0] for l in data[1]], axis=0).squeeze()
assert np.array_equal(np.sort(data0), np.arange(10))
assert np.array_equal(np.sort(data1), np.arange(10))

# Lookup table a -> b, used to verify that inputs and outputs stay paired.
ids_ = ids.copy()
ids_.index = ids_.a

# check if randomization is applied, consistently
num_randoms0 = 0
num_randoms1 = 0
for i in range(100):
    g = gr.DataGeneratorDisk(ids, **gen_params)
    data = list(zip(*list(g)))
    data0 = np.concatenate([l[0] for l in data[0]], axis=0).squeeze()
    data1 = np.concatenate([l[0] for l in data[1]], axis=0).squeeze()

    # check consistency: each output must belong to the input in the same
    # position (this comparison used to be computed but never asserted)
    assert np.array_equal(ids_.loc[data0].b, data1)

    # count the runs whose order differs from the unshuffled one; the
    # unshuffled outputs run 9..0, not 0..9
    num_randoms0 += not np.array_equal(data0, np.arange(10))
    num_randoms1 += not np.array_equal(data1, np.arange(9,-1,-1))

# check randomization, at least once, for inputs and for outputs
assert num_randoms0
assert num_randoms1

In [None]:
from keras.layers import Input
from keras.models import Model
from ku import applications as apps

# Toy regression data: b is a reversed, so the model has to learn b = 99 - a.
# NOTE: this cell rebinds the notebook-wide `ids` and `gen_params`.
ids = pd.DataFrame(dict(a = np.arange(100), 
                        b = np.flip(np.arange(100))))
ids = apps.get_train_test_sets(ids)
# display(ids)

# Small fully connected head on a single scalar input.
X = Input(shape=(1,), dtype='float32')
y = apps.fc_layers(X, name = 'head',
                   fc_sizes      = [5, 1],
                   dropout_rates = [0, 0],
                   batch_norm    = 0)
model = Model(inputs=X, outputs=y)

gen_params = Munch(batch_size   = 4,
                  data_path     = '',
                  input_shape   = (1,),
                  inputs_df     = ['a'],
                  outputs       = ['b'])

helper = mh.ModelHelper(model, 'test_model', ids, 
                        loss       = 'MSE',
                        metrics    = ['mean_absolute_error'],
                        monitor_metric = 'val_mean_absolute_error',
                        multiproc  = False, workers = 2,
                        logs_root  = 'logs',
                        models_root= 'models',
                        gen_params = gen_params)

print('Model name:', helper.model_name(test='on'))
helper.update_name()

# One batch holding the whole validation set.
valid_gen = helper.make_generator(ids[ids.set == 'validation'], 
                                  shuffle     =  False)
valid_gen.batch_size = len(valid_gen.ids)
valid_gen.on_epoch_end()
assert valid_gen.ids_index.batch_index.unique().size == 1

helper.train(lr=1e-1, epochs=50, verbose=False, valid_in_memory=True);

# Training must have created the log folder. `os.path.exists` replaces the
# bare `path.exists`, which referenced a name that is never imported.
assert os.path.exists(helper.params.logs_root + '/' + helper.model_name())

helper.load_model(); # best
valid_best1 = helper.validate(verbose=1)

helper.train(lr=1, epochs=10, verbose=False, valid_in_memory=True);

# validate final model
valid_res_fin = helper.validate(verbose=1)

helper.load_model(); # best
valid_best2 = helper.validate(verbose=1)

# If the second training run ended worse than the earlier best, reloading the
# best model must give the earlier result again.
if valid_res_fin['loss'] > valid_best1['loss']:
    assert valid_best1['loss'] == valid_best2['loss']

# The MAE computed from predictions must agree with the reported metric.
y_pred = helper.predict(valid_gen)
y_true = ids[ids.set=='validation'].b.values
_, _, val_mae, _ = apps.rating_metrics(y_true, y_pred, show_plot=False);
print(valid_best2)
assert np.abs(val_mae - valid_best2['mean_absolute_error']) < 1e-2

In [None]:
import glob
res = (256, 192)

# (URL template, expected number of extracted entries) for a tar and a zip
# archive; both are expected to unpack into the folder '256x192'.
archives = [("http://datasets.vqa.mmsp-kn.de/archives/koniq10k_{}x{}.tar", 10373),
            ("http://datasets.vqa.mmsp-kn.de/archives/koniq10k_{}x{}_test.zip", 2015)]

for url_template, num_files in archives:
    archive_url = url_template.format(*res)
    print('download URL:', archive_url)
    gen.download_archive(archive_url,'./')
    assert os.path.exists('256x192')
    assert len(glob.glob('256x192/*')) == num_files
    # Remove the extracted folder before the next archive is unpacked.
    shutil.rmtree('256x192')

In [None]:
ids_local = ids.copy()

def ids_fn():
    """Flip the sign of every score in `ids_local` (in place) and return the table."""
    ids_local['score'] = -ids_local['score']
    return ids_local

# Per-cell copy of the shared settings with the ids callback attached.
gen_params_local = gen_params.copy()
gen_params_local.update(ids_fn=ids_fn, batch_size=4)

g = gr.DataGeneratorDisk(ids, **gen_params_local)
x = g[0][1][0]
# Presumably on_epoch_end() refreshes the ids through `ids_fn` -- the
# assertion expects the outputs to change sign between the two reads.
g.on_epoch_end()
y = g[0][1][0]
assert np.array_equal(y, -x)