In [1]:
# This notebook is about using shared variables.
# The upside of shared variables is that they can remain on the GPU between calls,
# effectively removing the GPU transfer bottleneck when you have enough memory.
#
# The downside is that many libraries provide precompiled function expressions,
# where we need to substitute real variables with shared variables somehow.
#
# Theano does not provide an elegant mechanism for doing so.
#
# There appears to be a way around this, assuming we *know* what input variable was used.
# 
# Unfortunately I was not able to find a way to retrieve the input variables of a function.
#
# FunctionMaker.inputs does exist, but seems to be a long way from what theano.function actually returns.
# However, a more elegant solution than the one presented here may exist.

In [2]:
import theano 
import theano.tensor as T
import numpy as np

Using gpu device 0: Quadro K1000M (CNMeM is disabled, CuDNN not available)


Couldn't import dot_parser, loading of dot files will not be possible.


In [3]:
# Symbolic int32 vector and a compiled function that returns its sum.
x = T.ivector('x')
expr = x.sum()
# The outputs are given as a list, so calling f returns a one-element list.
f = theano.function([x], [expr])

In [4]:
# 0..99 as int32, matching the dtype of the ivector input.
v = np.arange(100, dtype=np.int32)

In [5]:
# Plain call: the data is passed in as an argument on every invocation.
f(v)

[array(4950)]

In [6]:
# Compile the same outputs again, but feed them from a shared variable:
# `givens` substitutes x by a slice of v_shared, so the caller only supplies
# the slice bounds b (begin) and e (end).
b = T.iscalar('b')
e = T.iscalar('e')
v_shared = theano.shared(v)
f_batched = theano.function(
    inputs=[b, e],
    outputs=f.outputs,
    givens={x: v_shared[b:e]},
)

In [8]:
# Whole range, first ten and last ten elements; only the slice bounds are passed in.
print(f_batched(0, 100), f_batched(0,10), f_batched(90,100))

([array(4950)], [array(45)], [array(945)])


In [None]:
# Using the same trick on nolearn.lasagne's Neural network.
#
# The main work occurs in the train_loop function, and looks like this:

 # (quoted from nolearn) One pass over the training batches: whatever the batch
 # iterator yields as (Xb, yb) is forwarded to the compiled training function.
 for Xb, yb in self.batch_iterator_train(X_train, y_train):
     batch_train_loss, accuracy = self.apply_batch_func(
         self.train_iter_, Xb, yb)
     train_losses.append(batch_train_loss)
     train_accuracies.append(accuracy)

     # Per-batch callbacks receive the net and its training history.
     for func in on_batch_finished:
         func(self, self.train_history_)

# apply_batch_func seems to do nothing, other than simply calling the function:
    # (quoted from nolearn) Calls `func` with the batch: a dict batch is expanded
    # into keyword arguments (the target is added under the key 'y' when given);
    # anything else is passed positionally, leaving yb out when it is None.
    @staticmethod
    def apply_batch_func(func, Xb, yb=None):
        if isinstance(Xb, dict):
            # Copy so the caller's dict is not modified when 'y' is added.
            kwargs = dict(Xb)
            if yb is not None:
                kwargs['y'] = yb
            return func(**kwargs)
        else:
            return func(Xb) if yb is None else func(Xb, yb)

# The self.train_iter_ function (together with eval_iter_ and predict_iter_) is initialized here:

        iter_funcs = self._create_iter_funcs(
            self.layers_, self.objective, self.update,
            self.y_tensor_type,
            )
        self.train_iter_, self.eval_iter_, self.predict_iter_ = iter_funcs

# And self._create_iter_funcs looks like this:

    # (quoted from nolearn) Builds the symbolic losses and compiles the three
    # Theano functions used for training, evaluation and prediction.
    def _create_iter_funcs(self, layers, objective, update, output_type):
        # Symbolic target; its tensor type is supplied by the caller.
        y_batch = output_type('y_batch')

        output_layer = layers[-1]
        objective_kw = self._get_params_for('objective')

        # Two losses over the same layers: the evaluation one is built with
        # deterministic=True.
        loss_train = objective(
            layers, target=y_batch, **objective_kw)
        loss_eval = objective(
            layers, target=y_batch, deterministic=True, **objective_kw)
        predict_proba = get_output(output_layer, None, deterministic=True)
        if not self.regression:
            # Classification: accuracy is the mean agreement between argmax and target.
            predict = predict_proba.argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            # Regression: the evaluation loss is reported in place of an accuracy.
            accuracy = loss_eval

        all_params = self.get_all_params(trainable=True)
        update_params = self._get_params_for('update')
        updates = update(loss_train, all_params, **update_params)

        input_layers = [layer for layer in layers.values()
                        if isinstance(layer, InputLayer)]

        # The compiled functions take one input per input layer, followed by y.
        # These are the variables that have to be substituted to use shared data.
        X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name)
                    for input_layer in input_layers]
        inputs = X_inputs + [theano.Param(y_batch, name="y")]

        # Only the training function applies the parameter updates.
        train_iter = theano.function(
            inputs=inputs,
            outputs=[loss_train, accuracy],
            updates=updates,
            allow_input_downcast=True,
            )
        eval_iter = theano.function(
            inputs=inputs,
            outputs=[loss_eval, accuracy],
            allow_input_downcast=True,
            )
        # Prediction takes the X inputs only (no target).
        predict_iter = theano.function(
            inputs=X_inputs,
            outputs=predict_proba,
            allow_input_downcast=True,
            )

        return train_iter, eval_iter, predict_iter

# The inputs are as follows:
# [inputs of input layers, input for y]
#
# self.batch_iterator_train needs to take two parameters and return (Xb, yb)
# The output types do not matter.
#
# The functions in turn will be called as f(Xb, yb), 
# so whatever we compile our GPU functions to needs to take two arguments.
#
# Presumably we can make Xb and yb our batch indexes. They will be identical but nevertheless must be duplicated.
#
# The batch_iterator is set during construction:

        batch_iterator_train=BatchIterator(batch_size=128),
        batch_iterator_test=BatchIterator(batch_size=128),

# Here BatchIterator is a class that will return batches based on slices.
# We are better off writing our own BatchIterator.
#
# The BatchIterator is called on X_train,X_valid,y_train,y_valid,
# which is a split produced by self.train_split(X,y,self)
#
# self.train_split is set in the constructor:
        train_split=TrainSplit(eval_size=0.2),
#
# TrainSplit is a simple class that splits our train and test set.
#
# It would appear we are better off passing in indices instead of data.
# Splitting and iteration will then occur on these indices.
# These indices are then passed to our custom iterators, which use shared variables and indices as inputs.
#
# So all we have to do is overwrite:
self.train_iter_, self.eval_iter_, self.predict_iter_
# which can only be done after .initialize() is called,
# which is normally done when .fit is called,
# but we can call it manually beforehand since it sets a flag whether it has been called already,
# and subsequent calls in .fit will not overwrite our behavior.

In [106]:
import nolearn.lasagne
import lasagne

class GpuNeuralNet(nolearn.lasagne.NeuralNet):
    """NeuralNet variant that keeps the whole data set in Theano shared variables.

    ``X`` and ``y`` are wrapped in shared variables when the net is constructed.
    The compiled train/eval/predict functions take integer index vectors instead
    of data and pick the batch rows out of the shared variables via ``givens``.
    ``fit`` is therefore called with sample indices, not with the data itself.

    All remaining constructor arguments are forwarded unchanged, in the same
    order, to ``nolearn.lasagne.NeuralNet``.
    """

    def __init__(
        self,
        X,
        y,
        layers,
        update=lasagne.updates.nesterov_momentum,
        loss=None,  # BBB
        objective=nolearn.lasagne.objective,
        objective_loss_function=None,
        batch_iterator_train=nolearn.lasagne.BatchIterator(batch_size=128),
        batch_iterator_test=nolearn.lasagne.BatchIterator(batch_size=128),
        regression=False,
        max_epochs=100,
        train_split=nolearn.lasagne.TrainSplit(eval_size=0.2),
        custom_score=None,
        X_tensor_type=None,
        y_tensor_type=None,
        use_label_encoder=False,
        on_batch_finished=None,
        on_epoch_finished=None,
        on_training_started=None,
        on_training_finished=None,
        more_params=None,
        verbose=0,
        **kwargs
    ):
        """Forward the NeuralNet arguments and wrap ``X``/``y`` in shared variables.

        Parameters
        ----------
        X, y
            Complete inputs and targets; stored as ``self.X_shared`` and
            ``self.y_shared``.
            NOTE(review): ``givens`` requires the shared variables to have the
            same type as the variables they replace, so X and y presumably need
            to already have the dtype the network expects -- confirm.
        layers, update, ..., verbose, **kwargs
            Passed positionally, in this order, to the base class constructor.
        """
        super(GpuNeuralNet, self).__init__(
            layers,
            update,
            loss,
            objective,
            objective_loss_function,
            batch_iterator_train,
            batch_iterator_test,
            regression,
            max_epochs,
            train_split,
            custom_score,
            X_tensor_type,
            y_tensor_type,
            use_label_encoder,
            on_batch_finished,
            on_epoch_finished,
            on_training_started,
            on_training_finished,
            more_params,
            verbose,
            **kwargs
        )
        self.X_shared = theano.shared(X)
        self.y_shared = theano.shared(y)

    def _create_iter_funcs(self, layers, objective, update, output_type):
        """Compile the train, eval and predict functions on top of the shared data.

        The losses, accuracy and updates are built as in the base class. The
        difference is in the compiled functions: their inputs are int32 index
        vectors, and the network input and target variables are substituted by
        ``X_shared[ix]`` and ``y_shared[iy]``.

        Returns
        -------
        tuple
            ``(train_iter, eval_iter, predict_iter)``; the first two take
            ``(Xidx, yidx)``, the last one takes ``Xidx`` only.
        """
        # Resolved through the imported package so that the class does not depend
        # on a star import of lasagne.layers having been executed elsewhere.
        get_output = lasagne.layers.get_output
        InputLayer = lasagne.layers.InputLayer

        y_batch = output_type('y_batch')

        output_layer = layers[-1]
        objective_kw = self._get_params_for('objective')

        loss_train = objective(
            layers, target=y_batch, **objective_kw)
        loss_eval = objective(
            layers, target=y_batch, deterministic=True, **objective_kw)
        predict_proba = get_output(output_layer, None, deterministic=True)
        if not self.regression:
            predict = predict_proba.argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            accuracy = loss_eval

        all_params = self.get_all_params(trainable=True)
        update_params = self._get_params_for('update')
        updates = update(loss_train, all_params, **update_params)

        input_layers = [layer for layer in layers.values()
                        if isinstance(layer, InputLayer)]
        assert len(input_layers) == 1, 'Multiple input layers not supported'

        # FIXME: Original code wraps all variables in params.
        # For whatever reason Theano does not want this.
        # Not sure what the effects are of not wrapping Variable in Param.
        Xvar = input_layers[0].input_var
        yvar = y_batch

        # FIXME: using index vectors here is the easiest solution since the batch
        # iterator does not need to change.
        # However, it might not be faster than the original code.
        # Should this not work, the BatchIterator needs to be adapted such that the
        # Xb, yb returned by its __iter__ are scalars indicating the start of the
        # batch, and the givens below become slices of the shared variables.
        batch_idxs = [T.ivector('Xidx'), T.ivector('yidx')]
        ix, iy = batch_idxs

        train_iter = theano.function(
            inputs=batch_idxs,
            outputs=[loss_train, accuracy],
            updates=updates,
            givens={
                Xvar: self.X_shared[ix],
                yvar: self.y_shared[iy],
            },
            allow_input_downcast=True,
            )

        eval_iter = theano.function(
            inputs=batch_idxs,
            outputs=[loss_eval, accuracy],
            givens={
                Xvar: self.X_shared[ix],
                yvar: self.y_shared[iy],
            },
            allow_input_downcast=True,
            )

        # Prediction has no target, so it only takes the X index vector.
        predict_iter = theano.function(
            inputs=[ix],
            outputs=predict_proba,
            givens={
                Xvar: self.X_shared[ix],
            },
            allow_input_downcast=True,
            )

        return train_iter, eval_iter, predict_iter

In [107]:
import vizlib

In [108]:
# Data set from vizlib; display the shape of X and the number of distinct labels.
ds = vizlib.data.counting_2d()
ds.X.shape, len(set(ds.y))

((10000, 1, 32, 32), 6)

In [109]:
from lasagne.layers import *

In [110]:
# Small network: 1x32x32 inputs -> a single 3x3 conv filter -> 6 dense units
# (the data set has 6 distinct labels).
input_layer = InputLayer((None, 1, 32, 32))
conv_layer = Conv2DLayer(input_layer, num_filters=1, filter_size=(3,3))
dense_layer = DenseLayer(conv_layer, num_units=6)

In [111]:
# The data goes to the constructor, where it becomes the shared variables;
# fit() only receives indices afterwards.
nn = GpuNeuralNet(ds.X, ds.y, layers=dense_layer, update_learning_rate=1e-2, update_momentum=0.9)

In [113]:
import time

# Wall-clock time of a full fit() of the shared-variable net.
t0 = time.time()
# fit() is given sample indices; the compiled functions look the rows up in the
# shared variables.
# NOTE(review): the X indices are created as floatX and are only turned into
# int32 by allow_input_downcast on the compiled functions -- presumably to get
# past nolearn's input handling; confirm this is intended.
Xidx = np.arange(len(ds), dtype=theano.config.floatX)
yidx = Xidx.astype(np.int32)
nn.fit(Xidx, yidx)
t1 = time.time()
print(t1 - t0)

47.7457330227


In [114]:
# Baseline: the stock nolearn net.
# NOTE(review): update_learning_rate is 1e-3 here but 1e-2 for the GpuNeuralNet
# run, and the same dense_layer object is reused -- keep in mind when comparing.
nn2 = nolearn.lasagne.NeuralNet(layers=dense_layer, update_learning_rate=1e-3, update_momentum=0.9)

In [116]:
# Wall-clock time of the baseline; here fit() gets the actual data arrays.
t00 = time.time()
nn2.fit(ds.X, ds.y)
t01 = time.time()
print(t01 - t00)

112.071755171


  for input_layer in input_layers]
  inputs = X_inputs + [theano.Param(y_batch, name="y")]


In [119]:
# So it seems the speedup is alright, but rather marginal.
# I wonder if we can improve by switching to integer input.
import nolearn.lasagne
import lasagne

# This batch iterator is rather silly
# It has to match the interface used by nolearn.lasagne,
# but really it is rather simply looping over the start indices of the batches.
class DummyBatchIterator(object):
    """Batch iterator that hands out batch numbers instead of data.

    Calling the instance stores the ``X`` and ``y`` it was given and returns
    the instance itself. Iterating then yields the pair ``(k, k)`` for every
    batch number ``k``; a trailing partial batch counts as a batch. Of the
    stored data only the length of ``X`` is ever used.
    """

    def __init__(self, batch_size):
        # Number of samples that one batch number stands for.
        self.batch_size = batch_size

    def __call__(self, X, y=None):
        """Store ``X`` and ``y`` and return this same object for iteration."""
        self.X = X
        self.y = y
        return self

    def __iter__(self):
        """Yield ``(k, k)`` for k = 0 .. ceil(n_samples / batch_size) - 1."""
        size = self.batch_size
        total = self.n_samples
        # Ceiling division: an incomplete last batch still gets a number.
        n_batches = (total + size - 1) // size
        for batch_no in range(n_batches):
            yield (batch_no, batch_no)

    @property
    def n_samples(self):
        """Length of the stored ``X``; for a dict, the length of its first value."""
        data = self.X
        if not isinstance(data, dict):
            return len(data)
        return len(list(data.values())[0])

class GpuNeuralNet2(nolearn.lasagne.NeuralNet):
    """Shared-variable NeuralNet whose compiled functions take batch numbers.

    ``X`` and ``y`` are wrapped in Theano shared variables at construction time.
    The compiled train/eval/predict functions take int32 scalars (batch numbers)
    and read the contiguous slice ``[i * bs, (i + 1) * bs)`` of the shared
    variables, where ``bs`` is the batch size of the matching batch iterator.

    NOTE(review): batch numbers address positions in the full shared arrays, and
    the default DummyBatchIterator derives them only from the length of what it
    is given. The training and validation passes therefore both start at row 0
    of the shared data, whatever ``train_split`` produced -- confirm this is
    acceptable beyond timing experiments.

    All remaining constructor arguments are forwarded unchanged, in the same
    order, to ``nolearn.lasagne.NeuralNet``.
    """

    def __init__(
        self,
        X,
        y,
        layers,
        update=lasagne.updates.nesterov_momentum,
        loss=None,  # BBB
        objective=nolearn.lasagne.objective,
        objective_loss_function=None,
        batch_iterator_train=DummyBatchIterator(batch_size=128),
        batch_iterator_test=DummyBatchIterator(batch_size=128),
        regression=False,
        max_epochs=100,
        train_split=nolearn.lasagne.TrainSplit(eval_size=0.2),
        custom_score=None,
        X_tensor_type=None,
        y_tensor_type=None,
        use_label_encoder=False,
        on_batch_finished=None,
        on_epoch_finished=None,
        on_training_started=None,
        on_training_finished=None,
        more_params=None,
        verbose=0,
        **kwargs
    ):
        """Forward the NeuralNet arguments and wrap ``X``/``y`` in shared variables.

        Parameters
        ----------
        X, y
            Complete inputs and targets; stored as ``self.X_shared`` and
            ``self.y_shared``.
        batch_iterator_train, batch_iterator_test
            Must expose ``batch_size``; it is baked into the compiled slices.
        layers, update, ..., verbose, **kwargs
            Passed positionally, in this order, to the base class constructor.
        """
        super(GpuNeuralNet2, self).__init__(
            layers,
            update,
            loss,
            objective,
            objective_loss_function,
            batch_iterator_train,
            batch_iterator_test,
            regression,
            max_epochs,
            train_split,
            custom_score,
            X_tensor_type,
            y_tensor_type,
            use_label_encoder,
            on_batch_finished,
            on_epoch_finished,
            on_training_started,
            on_training_finished,
            more_params,
            verbose,
            **kwargs
        )
        self.X_shared = theano.shared(X)
        self.y_shared = theano.shared(y)

    def _create_iter_funcs(self, layers, objective, update, output_type):
        """Compile the train, eval and predict functions on slices of the shared data.

        The losses, accuracy and updates are built as in the base class. The
        compiled functions take scalar batch numbers; the network input and
        target variables are substituted by slices of ``X_shared``/``y_shared``.
        The training function uses the training iterator's batch size; the
        evaluation and prediction functions use the test iterator's.

        Returns
        -------
        tuple
            ``(train_iter, eval_iter, predict_iter)``; the first two take
            ``(Xidx, yidx)``, the last one takes ``Xidx`` only.
        """
        # Resolved through the imported package so that the class does not depend
        # on a star import of lasagne.layers having been executed elsewhere.
        get_output = lasagne.layers.get_output
        InputLayer = lasagne.layers.InputLayer

        y_batch = output_type('y_batch')

        output_layer = layers[-1]
        objective_kw = self._get_params_for('objective')

        loss_train = objective(
            layers, target=y_batch, **objective_kw)
        loss_eval = objective(
            layers, target=y_batch, deterministic=True, **objective_kw)
        predict_proba = get_output(output_layer, None, deterministic=True)
        if not self.regression:
            predict = predict_proba.argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            accuracy = loss_eval

        all_params = self.get_all_params(trainable=True)
        update_params = self._get_params_for('update')
        updates = update(loss_train, all_params, **update_params)

        input_layers = [layer for layer in layers.values()
                        if isinstance(layer, InputLayer)]
        assert len(input_layers) == 1, 'Multiple input layers not supported'

        # FIXME: Original code wraps all variables in params.
        # For whatever reason Theano does not want this.
        # Not sure what the effects are of not wrapping Variable in Param.
        Xvar = input_layers[0].input_var
        yvar = y_batch

        batch_idxs = [T.iscalar('Xidx'), T.iscalar('yidx')]
        ix, iy = batch_idxs

        # The batch sizes are read once here and become constants of the graphs.
        bs_train = self.batch_iterator_train.batch_size
        bs_test = self.batch_iterator_test.batch_size

        train_iter = theano.function(
            inputs=batch_idxs,
            outputs=[loss_train, accuracy],
            updates=updates,
            givens={
                Xvar: self.X_shared[ix * bs_train: (ix + 1) * bs_train],
                yvar: self.y_shared[iy * bs_train: (iy + 1) * bs_train],
            },
            allow_input_downcast=True,
            )

        eval_iter = theano.function(
            inputs=batch_idxs,
            outputs=[loss_eval, accuracy],
            givens={
                Xvar: self.X_shared[ix * bs_test: (ix + 1) * bs_test],
                yvar: self.y_shared[iy * bs_test: (iy + 1) * bs_test],
            },
            allow_input_downcast=True,
            )

        # Prediction has no target: it takes the X batch number only and slices
        # with the same (test) batch size as eval_iter.
        predict_iter = theano.function(
            inputs=[ix],
            outputs=predict_proba,
            givens={
                Xvar: self.X_shared[ix * bs_test: (ix + 1) * bs_test],
            },
            allow_input_downcast=True,
            )

        return train_iter, eval_iter, predict_iter

In [120]:
# Wall-clock time of a full fit() of the slice-based net.
# NOTE(review): this rebinds nn2, which previously held the stock NeuralNet
# baseline, and uses update_learning_rate=1e-3 like that baseline.
nn2 = GpuNeuralNet2(ds.X, ds.y, layers=dense_layer, update_learning_rate=1e-3, update_momentum=0.9)
t10 = time.time()
# NOTE(review): fit() is given the raw arrays here rather than index arrays;
# DummyBatchIterator only looks at their length, but the train split still has
# to handle the full data -- confirm this is what should be timed.
nn2.fit(ds.X, ds.y)
t11 = time.time()
print(t11 - t10)

112.897287846


In [121]:
# Seems even slower.
# Try to get a definite answer:

In [126]:
%%timeit
# Index-vector variant, timed from scratch: fresh layers, fresh net, full fit().
input_layer = InputLayer((None, 1, 32, 32))
conv_layer = Conv2DLayer(input_layer, num_filters=1, filter_size=(3,3))
dense_layer = DenseLayer(conv_layer, num_units=6)
nn = GpuNeuralNet(ds.X, ds.y, layers=dense_layer, update_learning_rate=1e-2, update_momentum=0.9)
# fit() gets sample indices; the data itself was handed to the constructor.
X = np.arange(len(ds.X), dtype=theano.config.floatX)
y = np.arange(len(ds.y), dtype=np.int32)
nn.fit(X, y)

1 loops, best of 3: 46.5 s per loop


In [129]:
%%timeit
# Slice variant, timed from scratch: fresh layers, fresh net, full fit().
input_layer = InputLayer((None, 1, 32, 32))
conv_layer = Conv2DLayer(input_layer, num_filters=1, filter_size=(3,3))
dense_layer = DenseLayer(conv_layer, num_units=6)
nn2 = GpuNeuralNet2(ds.X, ds.y, layers=dense_layer, update_learning_rate=1e-2, update_momentum=0.9)
# The index arrays are created inside this cell: names assigned in another
# %%timeit cell do not reach the notebook namespace, so relying on them would
# only work with leftover kernel state.
X = np.arange(len(ds.X), dtype=theano.config.floatX)
y = np.arange(len(ds.y), dtype=np.int32)
nn2.fit(X, y)

1 loops, best of 3: 44.1 s per loop


In [128]:
%%timeit
# Baseline, timed from scratch: the stock nolearn net fed with the actual arrays.
input_layer = InputLayer((None, 1, 32, 32))
conv_layer = Conv2DLayer(input_layer, num_filters=1, filter_size=(3,3))
dense_layer = DenseLayer(conv_layer, num_units=6)
nn3 = nolearn.lasagne.NeuralNet(layers=dense_layer, update_learning_rate=1e-2, update_momentum=0.9)
nn3.fit(ds.X, ds.y)

1 loops, best of 3: 1min 51s per loop


In [None]:
# Local machine (Quadro K1000M) (single filter):
# GpuNeuralNet  |  46.5
# GpuNeuralNet2 |  44.1
# NeuralNet     | 111.0
#
#
# Romulus (K40c) (20 filters):
# GpuNeuralNet    | 104s
# GpuNeuralNet2   | 333s
# NeuralNet       | 340s
#
#
# So GpuNeuralNet is the fastest implementation (which is using VECTORS instead of SLICES -- rather strange).
# Nevertheless, this is the one I implemented in vizlib.