Timings here are for a machine with a K80 GPU (specifically an Azure NC6). Running on a CPU-only machine is going to be quite a bit slower.

In [1]:
import matplotlib
%matplotlib inline

import sys
sys.path.append('../src')
sys.path.append('../../DNSGP/GPflow')

import numpy as np
import tensorflow as tf

from GPflow.likelihoods import MultiClass
from GPflow.kernels import RBF, White, Linear, Matern32, Matern52
from GPflow.svgp import SVGP
from GPflow.gpr import GPR

from GPflow.param import AutoFlow

from scipy.stats import mode
from scipy.cluster.vq import kmeans2

from get_data import get_mnist_data
from dgp import DGP

import time

# Load MNIST. X, Y are used below to build and train the models and to report
# training metrics; Xs, Ys are only used for the final test metrics.
X, Y, Xs, Ys = get_mnist_data()

Extracting /vol/bitbucket/hrs13/dnsgpfiles/data/MNIST_data/train-images-idx3-ubyte.gz
Extracting /vol/bitbucket/hrs13/dnsgpfiles/data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting /vol/bitbucket/hrs13/dnsgpfiles/data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting /vol/bitbucket/hrs13/dnsgpfiles/data/MNIST_data/t10k-labels-idx1-ubyte.gz


We'll use 100 inducing points 

In [2]:
# Number of inducing points.
M = 100
# kmeans2 returns (centroids, labels); only the M centroids are kept and passed
# to the models below as Z.
Z = kmeans2(X, M, minit='points')[0]

Slightly annoyingly, `AutoFlow` takes `Ynew` as a `float_type` in `predict_density`, but for the multiclass likelihood the input is `tf.int32` (the number of dimensions is also different). We defined both versions in our `DGP` class, but as a workaround for `SVGP` we just override the behaviour:


In [3]:
class MultiClassSVPG(SVGP):
    """SVGP whose `predict_density` accepts integer class labels.

    The only change from the parent class is the AutoFlow signature of
    `predict_density`: `Ynew` is declared as a rank-1 `tf.int32` tensor.
    """

    @AutoFlow((tf.float64, [None, None]), (tf.int32, [None,]))
    def predict_density(self, Xnew, Ynew):
        """Evaluate the likelihood's predictive density of `Ynew` at `Xnew`."""
        latent_mean, latent_var = self.build_predict(Xnew)
        return self.likelihood.predict_density(latent_mean, latent_var, Ynew)

We'll compare three models: an ordinary sparse GP and DGPs with 2 and 3 layers. 

We'll use the same minibatch size for all models. Note that the cells below set `minibatch_size=1000`; the experiments in the paper used a batch size of 10000 (it works fine with smaller batch sizes).


In [4]:
# Baseline model: the SVGP subclass defined above, with one RBF kernel over the
# 784 input dimensions, a 10-class likelihood, and Z as the inducing points.
m_sgp = MultiClassSVPG(X, Y, RBF(784, lengthscales=2, variance=2), 
             MultiClass(10), Z, 
             num_latent=10, minibatch_size=1000)

def make_dgp(L):
    """Build a DGP classifier with L kernels on the notebook-level X, Y, Z.

    The first kernel is an RBF over 784 dimensions; each of the remaining
    L-1 kernels is an RBF over 30 dimensions.
    """
    first_kernel = RBF(784, lengthscales=2., variance=2.)
    later_kernels = [RBF(30, lengthscales=2., variance=2.)
                     for _ in range(L - 1)]

    dgp = DGP(X, Y, Z, [first_kernel] + later_kernels, MultiClass(10),
              num_samples=1,
              minibatch_size=1000,
              num_latent_Y=10)

    # scale q_sqrt down by 1e-5 in every layer except the last one
    for inner_layer in dgp.layers[:-1]:
        inner_layer.q_sqrt = inner_layer.q_sqrt.value * 1e-5

    return dgp

# The two deep models to compare against m_sgp: make_dgp with L=2 and L=3.
m_dgp2 = make_dgp(2)
m_dgp3 = make_dgp(3)

We'll train for 20000 iterations, printing every 1000 to see how convergence is doing. We'll also predict at the training data to see what's going on (we don't use a validation set).

For the SGP model we'll calculate accuracy by simply taking the max mean prediction:

In [5]:
def assess_model_sgp(model, X_batch, Y_batch):
    """Score one batch with a (non-stochastic) sparse GP model.

    Returns a pair `(density, correct)`: whatever `model.predict_density`
    returns for the batch, and a boolean array that is True where the argmax
    over axis 1 of the predictive mean equals `Y_batch`.
    """
    # the predictive variance is not needed for the accuracy
    pred_mean, _ = model.predict_y(X_batch)
    density = model.predict_density(X_batch, Y_batch)
    predicted_class = np.argmax(pred_mean, 1)
    return density, predicted_class == Y_batch

For the DGP models we have stochastic predictions. We need a single prediction for each datum, so to do this we take $S$ samples for the one-hot predictions ($(S, N, 10)$ matrices for mean and var), then we take the max over the class means (to give a $(S, N)$ matrix), and finally we take the modal class over the samples (to give a vector of length $N$):

We'll use 100 samples

In [6]:
# number of samples drawn for each stochastic DGP prediction
S = 100
def assess_model_dgp(model, X_batch, Y_batch):
    """Score one batch with a stochastic DGP model using S samples.

    Returns a pair `(density, correct)`: whatever
    `model.predict_density_multiclass` returns for the batch, and a boolean
    array that is True where the modal predicted class equals `Y_batch`.
    """
    sample_means, _ = model.predict_y(X_batch, S)
    density = model.predict_density_multiclass(X_batch, Y_batch, S)
    # argmax over the class axis gives one class per (sample, datum) ...
    class_per_sample = np.argmax(sample_means, 2)
    # ... then the mode over the sample axis gives one class per datum
    modal_class = mode(class_per_sample, 0)[0].flatten()
    return density, modal_class == Y_batch

We need batch predictions (we might run out of memory as `Xs` is 10,000 points)

In [7]:
def batch_assess(model, assess_model, X, Y, batch_size=100):
    """Evaluate `assess_model` over (X, Y) in batches and average the results.

    Args:
        model: passed through unchanged as the first argument of `assess_model`.
        assess_model: callable `(model, X_batch, Y_batch) -> (lik, acc)` that
            returns two arrays for the batch.
        X, Y: inputs and targets, split along the first axis.
        batch_size: approximate number of rows per batch (default 100).

    Returns:
        `(mean_lik, mean_acc)`: averages of the concatenated per-batch arrays.
    """
    # Always at least one batch, so fewer than `batch_size` rows still works.
    n_batches = max(1, len(X) // batch_size)
    lik, acc = [], []
    # array_split (unlike split) tolerates a length that is not an exact
    # multiple of n_batches; for exact multiples the batches are the same.
    for X_batch, Y_batch in zip(np.array_split(X, n_batches),
                                np.array_split(Y, n_batches)):
        l, a = assess_model(model, X_batch, Y_batch)
        lik.append(l)
        acc.append(a)
    lik = np.concatenate(lik, 0)
    # cast so that boolean "correct" flags average to a fraction
    acc = np.array(np.concatenate(acc, 0), dtype=float)
    return np.average(lik), np.average(acc)

Finally, we'll use the following callback to log what's going on 

In [8]:
class CB(object):
    """Optimiser callback that logs training progress every 1000 calls.

    On every 1000th call it records the training-set likelihood and accuracy
    (via `batch_assess` on the notebook-level `X`, `Y`), the objective averaged
    over `S` evaluations, and the wall-clock time spent between evaluations.

    Attributes:
        i: number of times `cb` has been called.
        train_time: accumulated seconds of training, excluding the time spent
            evaluating and logging inside this callback.
        ob, train_lik, train_acc: one entry per logging step.
    """

    def __init__(self, model, assess_model):
        self.model = model
        self.assess_model = assess_model
        self.i = 0
        self.t = time.time()
        self.train_time = 0
        self.ob = []
        self.train_lik = []
        self.train_acc = []

    def cb(self, x):
        """Count one optimiser step; `x` is forwarded to `model.set_state`."""
        self.i += 1
        if self.i % 1000 == 0:
            # time how long we've been training since the clock last started
            self.train_time += time.time() - self.t

            # assess the model on the training data
            self.model.set_state(x)
            lik, acc = batch_assess(self.model, self.assess_model, X, Y)
            self.train_lik.append(lik)
            self.train_acc.append(acc)

            # calculate the objective, averaged over S samples
            ob = 0
            for _ in range(S):
                ob += self.model.compute_log_likelihood()/float(S)
            self.ob.append(ob)

            st = 'it: {}, ob: {:.1f}, train lik: {:.4f}, train acc {:.4f}'
            # single-argument parenthesised print works on Python 2 and 3
            print(st.format(self.i, ob, lik, acc))

            # Restart the clock only after evaluation and logging are done, so
            # that time is not counted as training time at the next step.
            self.t = time.time()

Now we're ready to go

The sparse GP:

In [9]:
# Train the sparse GP with Adam, logging through the callback.
cb_sgp = CB(m_sgp, assess_model_sgp)
m_sgp.optimize(tf.train.AdamOptimizer(0.01), maxiter=20000, callback=cb_sgp.cb)
# single-argument parenthesised print works on Python 2 and 3
print('sgp total train time {:.4f}'.format(cb_sgp.train_time))
# Final metrics on the test data.
l, a = batch_assess(m_sgp, assess_model_sgp, Xs, Ys)
print('sgp test lik: {:.4f}, test acc {:.4f}'.format(l, a))

it: 1000, ob: -38777.3, train lik: -0.1424, train acc 0.9634
it: 2000, ob: -34423.9, train lik: -0.1268, train acc 0.9691
it: 3000, ob: -32992.6, train lik: -0.1179, train acc 0.9717
it: 4000, ob: -32617.8, train lik: -0.1178, train acc 0.9725
it: 5000, ob: -32131.9, train lik: -0.1129, train acc 0.9738
it: 6000, ob: -31842.0, train lik: -0.1150, train acc 0.9740
it: 7000, ob: -31593.7, train lik: -0.1127, train acc 0.9735
it: 8000, ob: -32258.5, train lik: -0.1106, train acc 0.9746
it: 9000, ob: -31711.1, train lik: -0.1091, train acc 0.9751
it: 10000, ob: -31647.1, train lik: -0.1093, train acc 0.9748
it: 11000, ob: -31358.6, train lik: -0.1079, train acc 0.9750
it: 12000, ob: -31733.9, train lik: -0.1066, train acc 0.9753
it: 13000, ob: -31090.2, train lik: -0.1066, train acc 0.9754
it: 14000, ob: -31161.8, train lik: -0.1063, train acc 0.9753
it: 15000, ob: -31063.1, train lik: -0.1033, train acc 0.9759
it: 16000, ob: -30912.5, train lik: -0.1050, train acc 0.9758
it: 17000, ob: -3

As expected, it scores a respectable 97%, but nothing spectacular. Using more inducing points improves things, but at the expense of very slow computation (500 inducing points takes about a day).

Here is the two layer DGP

In [10]:
# Train the two-layer DGP with Adam, logging through the callback.
cb_dgp2 = CB(m_dgp2, assess_model_dgp)
m_dgp2.optimize(tf.train.AdamOptimizer(0.01), maxiter=20000, callback=cb_dgp2.cb)
# single-argument parenthesised print works on Python 2 and 3
print('dgp2 total train time {:.4f}'.format(cb_dgp2.train_time))
# Final metrics on the test data.
l, a = batch_assess(m_dgp2, assess_model_dgp, Xs, Ys)
print('dgp2 test lik: {:.4f}, test acc {:.4f}'.format(l, a))

it: 1000, ob: -37047.9, train lik: -0.1161, train acc 0.9708
it: 2000, ob: -31068.3, train lik: -0.0927, train acc 0.9771
it: 3000, ob: -27779.5, train lik: -0.0836, train acc 0.9802
it: 4000, ob: -25298.2, train lik: -0.0790, train acc 0.9815
it: 5000, ob: -25096.4, train lik: -0.0774, train acc 0.9817
it: 6000, ob: -23652.5, train lik: -0.0742, train acc 0.9828
it: 7000, ob: -22826.9, train lik: -0.0730, train acc 0.9831
it: 8000, ob: -22949.9, train lik: -0.0699, train acc 0.9838
it: 9000, ob: -22131.3, train lik: -0.0696, train acc 0.9844
it: 10000, ob: -22001.0, train lik: -0.0687, train acc 0.9845
it: 11000, ob: -21834.3, train lik: -0.0681, train acc 0.9849
it: 12000, ob: -22019.1, train lik: -0.0669, train acc 0.9850
it: 13000, ob: -21296.0, train lik: -0.0662, train acc 0.9854
it: 14000, ob: -21743.0, train lik: -0.0651, train acc 0.9851
it: 15000, ob: -21310.5, train lik: -0.0642, train acc 0.9856
it: 16000, ob: -21324.8, train lik: -0.0645, train acc 0.9855
it: 17000, ob: -2

In [11]:
# And the three-layer DGP (narrative text kept as a comment so this cell runs without a SyntaxError)

In [None]:
# Train the three-layer DGP with Adam, logging through the callback.
cb_dgp3 = CB(m_dgp3, assess_model_dgp)
m_dgp3.optimize(tf.train.AdamOptimizer(0.01), maxiter=20000, callback=cb_dgp3.cb)
# single-argument parenthesised print works on Python 2 and 3
print('dgp3 total train time {:.4f}'.format(cb_dgp3.train_time))
# Final metrics on the test data.
l, a = batch_assess(m_dgp3, assess_model_dgp, Xs, Ys)
print('dgp3 test lik: {:.4f}, test acc {:.4f}'.format(l, a))

The 3 layer DGP wins! 

We can see how they've done over time

In [None]:
# The visible cells only import the top-level matplotlib package, so bring
# pyplot into scope here before using it.
import matplotlib.pyplot as plt

# One point per logging step of each callback.
plt.plot(cb_sgp.train_acc, label='sgp')
plt.plot(cb_dgp2.train_acc, label='dgp2')
plt.plot(cb_dgp3.train_acc, label='dgp3')
plt.title('train accuracy')
# Without a legend the labels passed to plot() are never displayed.
plt.legend()
plt.show()