In [1]:
from rtbm.riemann_theta.riemann_theta import RiemannTheta

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.metrics import confusion_matrix, classification_report

import theano

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, Reshape  
from keras.optimizers import SGD

from sklearn.preprocessing import LabelBinarizer

import time

from rtbm import RTBM, minimizer

import rtbm.layers as layers
import rtbm.model as mdl

from rtbm.costfunctions import mse
from rtbm.activations import sigmoid

Using Theano backend.


In [2]:
# Load MNIST dataset from headerless CSV files; column 0 is used as the digit
# label and the remaining columns as the input features.
MNIST_train = pd.read_csv('~/data/mnist_train.csv', delimiter=",",header=None).values
MNIST_test  = pd.read_csv('~/data/mnist_test.csv', delimiter=",",header=None).values

# Prepare data (normalized onto [0,1]); only the first 10000 training rows are used
Y_train = MNIST_train[0:10000,0]
X_train = MNIST_train[0:10000,1:]/255.0

Y_test = MNIST_test[:,0]
X_test = MNIST_test[:,1:]/255.0

# One-hot targets for the softmax models. Fit the binarizer directly on the ten
# digit labels instead of fitting on an identity matrix and overwriting
# `classes_` by hand: the resulting T is the same, but the encoder stays in a
# consistent multiclass state (so e.g. inverse_transform remains usable).
enc = LabelBinarizer()
enc.fit(np.arange(10))
T = enc.transform(Y_train)

# Keras (200 sigmoids + 10 sigmoids + 1 linear + MSE)

In [48]:
# Keras baseline that regresses the digit label as one real number:
# 784 inputs -> 200 sigmoid -> 10 sigmoid -> 1 linear output, trained with MSE.
model = Sequential()

model.add(Dense(200, input_dim=784))
model.add(Activation('sigmoid'))
# Only the first layer declares an input size; the stray input_dim=784 that
# used to sit on this layer did not match its real input (200 units).
model.add(Dense(10))
model.add(Activation('sigmoid'))
# Positional size, consistent with the other Dense calls (the `output_dim`
# keyword is the legacy spelling).
model.add(Dense(1))
model.add(Activation('linear'))


sgd = SGD(lr=0.1)

# Use wall-clock time: time.clock() is deprecated (removed in Python 3.8) and
# on Unix reports CPU time rather than elapsed time.
tic = time.time()

model.compile(loss='mse', optimizer=sgd)

toc = time.time()

print("Compile time: ",toc-tic)

tic = time.time()

# Full-batch order is kept fixed (shuffle=False); no validation data, no progress output.
model.fit(X_train, Y_train, batch_size=1000, nb_epoch=100, validation_data=None, shuffle=False, verbose=0)
toc = time.time()

print("Run time: ",toc-tic)

  import sys


('Compile time: ', 0.005583000000001448)
('Run time: ', 45.66247800000008)


In [49]:
# On train set
# The model outputs one real number per sample, so map it to the nearest valid
# digit: round, then clip into [0, 9]. Clipping replaces the former abs():
# abs() folded a negative output such as -1 onto class 1, and nothing stopped
# an output above 9.5 from showing up as a spurious extra class.
P = np.clip(np.round(np.real(model.predict(X_train))), 0, 9).astype(int).flatten()

print(classification_report(Y_train, P))
print(confusion_matrix(Y_train, P))

             precision    recall  f1-score   support

          0       1.00      0.65      0.78      1001
          1       0.74      0.66      0.70      1127
          2       0.52      0.44      0.48       991
          3       0.50      0.51      0.51      1032
          4       0.03      0.02      0.02       980
          5       0.26      0.52      0.35       863
          6       0.52      0.71      0.60      1014
          7       0.47      0.36      0.40      1070
          8       0.26      0.34      0.29       944
          9       0.66      0.52      0.59       978

avg / total       0.50      0.48      0.48     10000

[[646 217  71  36  20   8   3   0   0   0]
 [  0 739 267  71  23  19   4   1   3   0]
 [  0  30 435 356  80  61  20   4   4   1]
 [  0   2  52 527 241 116  50  22  16   6]
 [  0   1   0   4  17 771 128  40  16   3]
 [  1   3  11  28  59 452 230  63  15   1]
 [  1   2   1   4  23 200 725  54   3   1]
 [  0   0   0  12  18  49  82 380 526   3]
 [  0   0   0  12

In [50]:
# On test set
# The model outputs one real number per sample, so map it to the nearest valid
# digit: round, then clip into [0, 9]. Clipping replaces the former abs():
# abs() folded a negative output such as -1 onto class 1, and nothing stopped
# an output above 9.5 from showing up as a spurious extra class.
P = np.clip(np.round(np.real(model.predict(X_test))), 0, 9).astype(int).flatten()

print(classification_report(Y_test, P))
print(confusion_matrix(Y_test, P))

             precision    recall  f1-score   support

          0       0.99      0.62      0.77       980
          1       0.73      0.66      0.70      1135
          2       0.50      0.39      0.44      1032
          3       0.46      0.50      0.48      1010
          4       0.02      0.01      0.01       982
          5       0.22      0.43      0.29       892
          6       0.44      0.60      0.51       958
          7       0.44      0.34      0.38      1028
          8       0.26      0.33      0.29       974
          9       0.62      0.50      0.56      1009

avg / total       0.47      0.44      0.45     10000

[[610 220  75  33  22  17   1   1   1   0]
 [  0 749 268  66  28  18   4   1   1   0]
 [  2  31 404 406  89  57  23  13   7   0]
 [  0   8  35 508 236 108  60  30  22   3]
 [  0   0   1   5   9 733 163  46  18   7]
 [  1   3  10  40  73 381 259  85  36   4]
 [  0   4   6  12  24 281 576  54   1   0]
 [  0   1   5  16  22  42  79 350 503  10]
 [  0   2   3   7

# With Theta and SGD

In [51]:
# rtbm counterpart of the Keras regression network:
# 784 -> 200 (sigmoid) -> 10 (sigmoid) -> 1 (linear), trained with MSE and SGD.
M = mdl.Model()
M.add(layers.NonLinear(784, 200, sigmoid()))
M.add(layers.NonLinear(200, 10, sigmoid()))
M.add(layers.Linear(10, 1))

minim = minimizer.SGD()
# Inputs are handed over transposed and the labels as a single row, so both
# arrays have the samples along the second axis.
sol = minim.train(
    mse(), M,
    X_train.T, Y_train.reshape(1, -1),
    lr=0.1, maxiter=100, batch_size=1000, log_step=10,
)


Iteration 0 in 0.39(s), cost = 4.186906
Iteration 10 in 4.28(s), cost = 4.007818
Iteration 20 in 8.17(s), cost = 2.072728
Iteration 30 in 12.06(s), cost = 1.707744
Iteration 40 in 15.95(s), cost = 1.607710
Iteration 50 in 19.84(s), cost = 1.546584
Iteration 60 in 23.73(s), cost = 1.499166
Iteration 70 in 27.62(s), cost = 1.454172
Iteration 80 in 31.51(s), cost = 1.403379
Iteration 90 in 35.40(s), cost = 1.340367
('Cost: ', 1.2735204231210209)
('Sol: ', array([-0.04635235, -0.02339451, -0.05133954, ...,  0.97528025,
        0.55323013,  1.55437903]))
Time: 38 s


In [52]:
# On train set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_train)))))

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

          0       0.96      0.24      0.38      1001
          1       0.39      0.35      0.37      1127
          2       0.26      0.31      0.28       991
          3       0.34      0.32      0.33      1032
          4       0.10      0.09      0.10       980
          5       0.21      0.29      0.25       863
          6       0.25      0.40      0.31      1014
          7       0.23      0.37      0.28      1070
          8       0.17      0.21      0.19       944
          9       0.00      0.00      0.00       978

avg / total       0.30      0.26      0.25     10000

[[236 443 169  66  43  24  17   3   0   0]
 [  0 400 457 142  69  37  16   5   1   0]
 [  6 130 303 259 156  80  40  15   2   0]
 [  0  30 179 330 255 115  64  38  20   1]
 [  1   1   5  33  91 247 359 202  41   0]
 [  4   6  13  61 152 249 234 111  33   0]
 [  0   3   7  25  81 230 403 258   7   0]
 [  0   1   6  16  17  63 144 400 423   0]
 [  0   1   3  23

In [53]:
# On test set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_test)))))

print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))


             precision    recall  f1-score   support

          0       0.92      0.23      0.36       980
          1       0.41      0.37      0.39      1135
          2       0.25      0.28      0.26      1032
          3       0.33      0.32      0.33      1010
          4       0.10      0.09      0.09       982
          5       0.20      0.25      0.22       892
          6       0.23      0.38      0.29       958
          7       0.20      0.36      0.26      1028
          8       0.20      0.24      0.22       974
          9       0.00      0.00      0.00      1009

avg / total       0.29      0.25      0.24     10000

[[221 413 188  79  37  20  17   4   1   0]
 [  0 418 449 149  61  34  18   5   1   0]
 [ 16 143 288 269 155  91  48  17   5   0]
 [  1  32 173 322 230 130  67  40  15   0]
 [  0   0   1  16  85 253 357 243  27   0]
 [  0   8  17  53 134 227 251 150  51   1]
 [  1   5  17  44  98 221 365 204   3   0]
 [  0   0  10  15  38  55 144 370 389   7]
 [  0   3   6  15

# Softmax with Keras (200 sigmoids + 10 softmax + MSE)

In [54]:
# Keras classifier on the one-hot targets T:
# 784 inputs -> 200 sigmoid -> 10 softmax outputs, trained with MSE.
model = Sequential()

model.add(Dense(200, input_dim=784))
model.add(Activation('sigmoid'))
# Only the first layer declares an input size; the stray input_dim=784 that
# used to sit on this layer did not match its real input (200 units).
model.add(Dense(10))
model.add(Activation('softmax'))


sgd = SGD(lr=0.1)

# Use wall-clock time: time.clock() is deprecated (removed in Python 3.8) and
# on Unix reports CPU time rather than elapsed time.
tic = time.time()

model.compile(loss='mse', optimizer=sgd)

toc = time.time()

print("Compile time: ",toc-tic)

tic = time.time()

# Batch order is kept fixed (shuffle=False); no validation data, no progress output.
model.fit(X_train, T, batch_size=1000, nb_epoch=100, validation_data=None, shuffle=False, verbose=0)
toc = time.time()

print("Run time: ",toc-tic)

('Compile time: ', 0.004078000000163229)
('Run time: ', 45.71273099999985)


In [55]:
# On train set
P=np.argmax(model.predict(X_train),axis=1)

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

          0       0.47      0.97      0.63      1001
          1       0.46      0.98      0.63      1127
          2       0.88      0.49      0.63       991
          3       0.73      0.45      0.56      1032
          4       0.83      0.53      0.65       980
          5       0.00      0.00      0.00       863
          6       0.74      0.72      0.73      1014
          7       0.37      0.94      0.53      1070
          8       1.00      0.01      0.03       944
          9       0.00      0.00      0.00       978

avg / total       0.55      0.53      0.45     10000

[[ 973    8    2    1    1    0    8    8    0    0]
 [   1 1107    3    1    0    0    0   15    0    0]
 [  82  227  490   21   14    0   77   80    0    0]
 [ 189  178   18  464    3    0   11  169    0    0]
 [  59   40    2    5  520    0   40  314    0    0]
 [ 356  252    1   33    7    0   51  163    0    0]
 [ 116  109    8    1   15    0  735   30  

In [56]:
# On test set
P=np.argmax(model.predict(X_test),axis=1)

print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))

             precision    recall  f1-score   support

          0       0.44      0.98      0.61       980
          1       0.50      0.99      0.66      1135
          2       0.89      0.47      0.62      1032
          3       0.73      0.48      0.58      1010
          4       0.80      0.50      0.61       982
          5       0.00      0.00      0.00       892
          6       0.72      0.69      0.71       958
          7       0.34      0.93      0.50      1028
          8       1.00      0.03      0.05       974
          9       0.00      0.00      0.00      1009

avg / total       0.55      0.52      0.44     10000

[[ 963    6    1    0    0    0    3    7    0    0]
 [   1 1124    2    1    0    0    0    7    0    0]
 [  95  265  487   31   10    0   49   95    0    0]
 [ 200  151   13  487    0    0   21  138    0    0]
 [  58   36    1    5  489    0   48  345    0    0]
 [ 436  194    4   21   15    0   42  179    0    1]
 [ 158   72    8    1   23    0  663   33  

In [61]:
# rtbm counterpart of the Keras softmax network:
# 784 -> 200 (sigmoid) -> 10 (linear) -> softmax over 10, trained with MSE and SGD.
# NOTE(review): in the recorded run the logged cost stays at ~0.4498 for all
# 100 iterations, i.e. this configuration did not visibly learn; investigate
# before comparing it with the Keras result.
M = mdl.Model()
M.add(layers.NonLinear(784, 200, sigmoid()))
M.add(layers.Linear(200, 10))
M.add(layers.SoftMaxLayer(10))

minim = minimizer.SGD()
# Inputs and one-hot targets are both handed over transposed, so the samples
# run along the second axis.
sol = minim.train(
    mse(), M,
    X_train.T, T.T,
    lr=0.1, maxiter=100, batch_size=1000, log_step=10,
)


Iteration 0 in 0.41(s), cost = 0.449818
Iteration 10 in 4.49(s), cost = 0.449799
Iteration 20 in 8.58(s), cost = 0.449759
Iteration 30 in 12.67(s), cost = 0.449722
Iteration 40 in 16.75(s), cost = 0.449676
Iteration 50 in 20.84(s), cost = 0.449616
Iteration 60 in 24.94(s), cost = 0.449547
Iteration 70 in 29.02(s), cost = 0.449488
Iteration 80 in 33.11(s), cost = 0.449472
Iteration 90 in 37.20(s), cost = 0.449502
('Cost: ', 0.44954842310257515)
('Sol: ', array([ 0.04898311,  0.04930198,  0.04964121, ..., -0.20971305,
       -0.20161145, -0.19696665]))
Time: 40 s
