Timings here are for a machine with a K80 (specifically an Azure NC6). Running with a CPU only machine is going to quite a bit slower. 

In [1]:
import matplotlib
%matplotlib inline

import sys
sys.path.append('../src')
sys.path.append('../../DNSGP/GPflow/')

import numpy as np
import tensorflow as tf

from GPflow.likelihoods import MultiClass
from GPflow.kernels import RBF, White, Linear, Matern32, Matern52
from GPflow.svgp import SVGP
from GPflow.gpr import GPR

from GPflow.param import AutoFlow

from scipy.stats import mode
from scipy.cluster.vq import kmeans2

from get_data import get_mnist_data
from dgp import DGP

import time

X, Y, Xs, Ys = get_mnist_data()

Extracting ../data/MNIST_data/train-images-idx3-ubyte.gz
Extracting ../data/MNIST_data/train-labels-idx1-ubyte.gz
Extracting ../data/MNIST_data/t10k-images-idx3-ubyte.gz
Extracting ../data/MNIST_data/t10k-labels-idx1-ubyte.gz


We'll use 100 inducing points 

In [2]:
M = 100
Z = kmeans2(X, M, minit='points')[0]

In [12]:
print Z.shape

(100, 784)


Slightly annoyingly,  `AutoFlow` takes `Ynew` as a `float_type` in `predict_density`, but for the mutliclass likelihood the input is `tf.int32` (also the number of dimensions are different). We defined both versions in out `DGP` class, but as a workaround for `SVGP` we just override the behaviour:


In [3]:
class MultiClassSVPG(SVGP):
    @AutoFlow((tf.float64, [None, None]), (tf.int32, [None,]))
    def predict_density(self, Xnew, Ynew):
        pred_f_mean, pred_f_var = self.build_predict(Xnew)
        return self.likelihood.predict_density(pred_f_mean, pred_f_var, Ynew)

We'll compare three models: an ordinary sparse GP and DGPs with 2 and 3 layers. 

We'll use a batch size of 10000 for all models 

In [4]:
m_sgp = MultiClassSVPG(X, Y, RBF(784, lengthscales=2, variance=2), 
             MultiClass(10), Z, 
             num_latent=10, minibatch_size=1000, whiten=True)

def make_dgp(L):
    kernels = [RBF(784, lengthscales=2., variance=2.)]
    for l in range(L-1):
        kernels.append(RBF(30, lengthscales=2., variance=2.))
    model = DGP(X, Y, Z, kernels, MultiClass(10), 
                num_samples=1,
                minibatch_size=1000,
                num_latent_Y=10)

    for layer in model.layers[:-1]:
        layer.q_sqrt = layer.q_sqrt.value * 1e-5 
    
    return model

m_dgp2 = make_dgp(2)
m_dgp3 = make_dgp(3)

For the SGP model we'll calcuate accuracy by simply taking the max mean prediction:

In [5]:
def assess_model_sgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch)
    l = model.predict_density(X_batch, Y_batch)
    a = (np.argmax(m, 1)==Y_batch)
    return l, a

For the DGP models we have stochastic predictions. We need a single prediction for each datum, so to do this we take $S$ samples for the one-hot predictions ($(S, N, 10)$ matrices for mean and var), then we take the max over the class means (to give a $(S, N)$ matrix), and finally we take the modal class over the samples (to give a vector of length $N$):

We'll use 1000 samples

In [6]:
S = 1000
def assess_model_dgp(model, X_batch, Y_batch):
    m, v = model.predict_y(X_batch, S)
    l = model.predict_density_multiclass(X_batch, Y_batch, S)
    a = (mode(np.argmax(m, 2), 0)[0].flatten()==Y_batch)
    return l, a

We need batch predictions (we might run out of memory as `Xs` is 10,000 points)

In [7]:
def batch_assess(model, assess_model, X, Y):
    n_batches = int(len(X)/100)
    lik, acc = [], []
    for X_batch, Y_batch in zip(np.split(X, n_batches), np.split(Y, n_batches)):
        l, a = assess_model(model, X_batch, Y_batch)
        lik.append(l)
        acc.append(a)
    lik = np.concatenate(lik, 0)
    acc = np.array(np.concatenate(acc, 0), dtype=float)
    return np.average(lik), np.average(acc)

Finally, we'll use the following callback to log what's going on 

We'll train for 10000 iterations, printing every 1000 to see how convergence is doing. We'll predict also at the training data to see what's going (we don't use a validation set). 

In [10]:
class CB(object):
    def __init__(self, model, assess_model):
        self.model = model
        self.assess_model = assess_model
        self.i = 0
        self.t = time.time()
        self.train_time = 0
        self.ob = []
        self.train_lik = []
        self.train_acc = []
    def cb(self, x):
        self.i += 1
        if self.i % 1 == 0:
            # time how long we've be training 
            self.train_time += time.time() - self.t
            self.t = time.time()
            
            # assess the model on the training data
            self.model.set_state(x)
            lik, acc = batch_assess(self.model, self.assess_model, X, Y)
            self.train_lik.append(lik)
            self.train_acc.append(acc)
            
            # calculate the objective, averaged over S samples 
            ob = 0
            for _ in range(1):
                ob += self.model.compute_log_likelihood()/float(1)
            self.ob.append(ob)
            
            st = 'it: {}, ob: {:.1f}, train lik: {:.4f}, train acc {:.4f}'
            print st.format(self.i, ob, lik, acc)

Now we're ready to go

The sparse GP:

In [11]:
cb_sgp = CB(m_sgp, assess_model_sgp)
m_sgp.optimize(tf.train.AdamOptimizer(0.01), maxiter=10, callback=cb_sgp.cb)
print 'sgp total train time {:.4f}'.format(cb_sgp.train_time)
l, a = batch_assess(m_sgp, assess_model_sgp, Xs, Ys)
print 'spg test lik: {:.4f}, test acc {:.4f}'.format(l, a)

it: 1, ob: -486895.3, train lik: -2.2299, train acc 0.8340
it: 2, ob: -485875.8, train lik: -2.2204, train acc 0.8342
it: 3, ob: -485273.1, train lik: -2.2106, train acc 0.8349
it: 4, ob: -485001.6, train lik: -2.2008, train acc 0.8351
it: 5, ob: -483369.6, train lik: -2.1907, train acc 0.8353
it: 6, ob: -482753.4, train lik: -2.1805, train acc 0.8358
it: 7, ob: -480550.8, train lik: -2.1703, train acc 0.8370
it: 8, ob: -480483.6, train lik: -2.1599, train acc 0.8374
it: 9, ob: -479240.3, train lik: -2.1494, train acc 0.8380
it: 10, ob: -477229.9, train lik: -2.1388, train acc 0.8387
sgp total train time 173.0981
spg test lik: -2.1352, test acc 0.8445


In [None]:
from GPflow.gpr import GPR
X = np.linspace(0, 1, 100)[:, None]
Y = np.sin(10*X)
Xs = np.random.uniform(0, 1, 100)[:, None]
Ys = np.sin(10*X)

model = GPR(X, Y, RBF(1))
m, v = model.predict_y(Xs)
print  np.average((m - Xs)**2)**0.5


In [28]:
from GPflow.kernels import White, RBF
from GPflow.likelihoods import Gaussian
from GPflow.sgpr import SGPR
from GPflow.gpr import GPR

svgp = SVGP(X, np.array(Y[:, None], dtype=float), RBF(784, lengthscales=20, variance=2.), Gaussian(), Z, num_latent=10)
svgp = SVGP(X, Y, RBF(784, lengthscales=20, variance=2.), MultiClass(10), Z, num_latent=10)
# svgp = SVGP(X[:10], np.array(Y[:10, None], dtype=float), White(784), Gaussian(), X[:10])#, num_latent=10)
# svgp = GPR(X[:100], np.array(Y[:100, None], dtype=float), RBF(784))#, num_latent=10)
# svgp = SGPR(X[:1000], np.array(Y[:1000, None], dtype=float), RBF(784), X[:100])#, num_latent=10)
t = time.time()
print svgp.compute_log_likelihood()


svgp.optimize(maxiter=2)
print time.time() - t
# -1506.89385332
# print Y[:10, None]

-491708.145615
34.8770301342


In [29]:
print svgp.compute_log_likelihood()
t = time.time()
m, v = svgp.predict_f(Xs[:2])
print time.time() - t
print m


-334113.725606
0.505330085754
[[-1.43891014 -3.46352971  3.98614993 -0.7374889   2.92201216 -9.4036516
   9.11916085  0.54827626 -1.24523868 -0.28678017]
 [-0.67047457  1.99230342  5.58693348  1.35280026 -0.65118404 -9.5814999
   4.00512604 -0.09307582  0.42266724 -2.36359611]]


In [30]:
print svgp.kern
print svgp.q_mu.value.shape
print svgp.q_sqrt.value.shape

kern.[1mlengthscales[0m transform:+ve prior:None
[ 16.38015575]
kern.[1mvariance[0m transform:+ve prior:None
[ 2.00002661]
(100, 10)
(100, 100, 10)


Using more inducing points improves things, but at the expense of very slow computation (500 inducing points takes about a day)

The two layer DGP:

In [None]:
cb_dgp2 = CB(m_dgp2, assess_model_dgp)
m_dgp2.optimize(tf.train.AdamOptimizer(0.01), maxiter=20000, callback=cb_dgp2.cb)
print 'dgp2 total train time {:.4f}'.format(cb_dgp2.train_time)
l, a = batch_assess(m_dgp2, assess_model_dgp, Xs, Ys)
print 'dgp2 test lik: {:.4f}, test acc {:.4f}'.format(l, a)

In [None]:
And the three layer:

In [None]:
cb_dgp3 = CB(m_dgp3, assess_model_dgp)
m_dgp3.optimize(tf.train.AdamOptimizer(0.01), maxiter=20000, callback=cb_dgp3.cb)
print 'dgp3 total train time {:.4f}'.format(cb_dgp3.train_time)
l, a = batch_assess(m_dgp3, assess_model_dgp, Xs, Ys)
print 'dgp3 test lik: {:.4f}, test acc {:.4f}'.format(l, a)

The 3 layer DGP wins! 

We can see how they've done over time

In [None]:
plt.plot(cb_sgp.train_acc, label='sgp')
plt.plot(cb_dgp2.train_acc, label='dgp2')
plt.plot(cb_dgp3.train_acc, label='dgp3')
plt.title('train accuray')
plt.show()