In [1]:
from rtbm.riemann_theta.riemann_theta import RiemannTheta

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.metrics import confusion_matrix, classification_report

import theano

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, Reshape  
from keras.optimizers import SGD

from sklearn.preprocessing import LabelBinarizer

import time

from rtbm import RTBM, minimizer

import rtbm.layers as layers
import rtbm.model as mdl

from rtbm.costfunctions import mse
from rtbm.activations import sigmoid, linear

Using Theano backend.


In [2]:
# Load MNIST dataset
MNIST_train = pd.read_csv('~/data/mnist_train.csv', delimiter=",",header=None).values
MNIST_test  = pd.read_csv('~/data/mnist_test.csv', delimiter=",",header=None).values

# Prepare data (normalized onto [0,1])
Y_train = MNIST_train[0:10000,0]
X_train = MNIST_train[0:10000,1:]/255.0

Y_test = MNIST_test[:,0]
X_test = MNIST_test[:,1:]/255.0

enc = LabelBinarizer()
enc.fit(np.diag([1,1,1,1,1,1,1,1,1,1]))
enc.classes_ = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
T=enc.transform(Y_train)

# Keras (500 linear + 100 linear + 1 linear + MSE)

In [None]:
model = Sequential() 

model.add(Dense(500,  input_dim=784))
model.add(Activation('linear'))
#model.add(Dense(100,  input_dim=784))
#model.add(Activation('linear'))
model.add(Dense(output_dim=1))
model.add(Activation('linear'))


sgd = SGD(lr=0.001)

tic = time.clock()

model.compile(loss='mse', optimizer=sgd)

toc = time.clock()

print("Compile time: ",toc-tic)

tic = time.clock()

model.fit(X_train, Y_train, batch_size=10000, nb_epoch=100, validation_data=None, shuffle=False, verbose=1)  
toc = time.clock()

print("Run time: ",toc-tic)

In [None]:
# On train set
P=np.abs(np.round(np.real(model.predict(X_train)))).flatten()


print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

In [None]:
# On test set
P=np.abs(np.round(np.real(model.predict(X_test)))).flatten()


print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))

## With Theta and SGD


In [3]:
M = mdl.Model()
M.add(layers.NonLinear(784,500,linear()))
#M.add(layers.NonLinear(500,100,linear()))
M.add(layers.NonLinear(500,1,linear()))

minim = minimizer.SGD()
sol=minim.train(mse(), M, np.transpose(X_train), Y_train.reshape(1,len(Y_train)), lr=0.001, maxiter=100, batch_size=10000, log_step=10)


('Batch C: ', 26.949287783655517)
Iteration 0 in 0.49(s), cost = 19.760788
('Batch C: ', 7.6847045878354505)
Iteration 10 in 3.63(s), cost = 7.536740
('Batch C: ', 6.691516365115832)
Iteration 20 in 6.71(s), cost = 6.619935
('Batch C: ', 6.0888996304586165)
Iteration 30 in 9.77(s), cost = 6.040583
('Batch C: ', 5.6793073799621965)
Iteration 40 in 12.82(s), cost = 5.646176
('Batch C: ', 5.396317650266337)
Iteration 50 in 15.89(s), cost = 5.373155
('Batch C: ', 5.196349806893555)
Iteration 60 in 18.95(s), cost = 5.179716
('Batch C: ', 5.050745746312988)
Iteration 70 in 22.00(s), cost = 5.038390
('Batch C: ', 4.940835938722546)
Iteration 80 in 25.05(s), cost = 4.931301
('Batch C: ', 4.854576459623521)
Iteration 90 in 28.10(s), cost = 4.846926
('Cost: ', 4.7907242554340375)
('Sol: ', array([0.00214954, 0.00696797, 0.00887331, ..., 0.05668997, 0.00395233,
       0.08101485]))
Time: 30 s


In [4]:
# On train set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_train)))))

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

        0.0       0.88      0.14      0.25      1001
        1.0       0.22      0.12      0.15      1127
        2.0       0.16      0.21      0.18       991
        3.0       0.18      0.29      0.22      1032
        4.0       0.09      0.16      0.11       980
        5.0       0.09      0.17      0.12       863
        6.0       0.10      0.13      0.11      1014
        7.0       0.26      0.21      0.23      1070
        8.0       0.15      0.06      0.09       944
        9.0       0.47      0.07      0.13       978
       10.0       0.00      0.00      0.00         0
       11.0       0.00      0.00      0.00         0
       12.0       0.00      0.00      0.00         0

avg / total       0.26      0.16      0.16     10000

[[144 319 244 130  89  42  17  12   4   0   0   0   0]
 [  0 131 464 333 133  50  11   3   2   0   0   0   0]
 [ 13  74 207 302 214 123  44  12   2   0   0   0   0]
 [  3  40 196 295 251 148  62  21  13

  'recall', 'true', average, warn_for)


In [5]:
# On train set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_train)))))

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

        0.0       0.88      0.14      0.25      1001
        1.0       0.22      0.12      0.15      1127
        2.0       0.16      0.21      0.18       991
        3.0       0.18      0.29      0.22      1032
        4.0       0.09      0.16      0.11       980
        5.0       0.09      0.17      0.12       863
        6.0       0.10      0.13      0.11      1014
        7.0       0.26      0.21      0.23      1070
        8.0       0.15      0.06      0.09       944
        9.0       0.47      0.07      0.13       978
       10.0       0.00      0.00      0.00         0
       11.0       0.00      0.00      0.00         0
       12.0       0.00      0.00      0.00         0

avg / total       0.26      0.16      0.16     10000

[[144 319 244 130  89  42  17  12   4   0   0   0   0]
 [  0 131 464 333 133  50  11   3   2   0   0   0   0]
 [ 13  74 207 302 214 123  44  12   2   0   0   0   0]
 [  3  40 196 295 251 148  62  21  13

# Keras (200 sigmoids + 10 sigmoids + 1 linear + MSE)

In [44]:
model = Sequential() 

model.add(Dense(10,  input_dim=784))
model.add(Activation('sigmoid'))
#model.add(Dense(10))
#model.add(Activation('sigmoid'))
model.add(Dense(output_dim=1))
model.add(Activation('linear'))


sgd = SGD(lr=0.1)

tic = time.clock()

model.compile(loss='mse', optimizer=sgd)

toc = time.clock()

print("Compile time: ",toc-tic)

tic = time.clock()

model.fit(X_train, Y_train, batch_size=10000, nb_epoch=200, validation_data=None, shuffle=False, verbose=0)  
toc = time.clock()

print("Run time: ",toc-tic)

  import sys


('Compile time: ', 0.003959000000008928)
('Run time: ', 43.429708000000005)


In [45]:
# On train set
P=np.abs(np.round(np.real(model.predict(X_train)))).flatten()


print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

          0       0.91      0.47      0.62      1001
          1       0.62      0.54      0.58      1127
          2       0.40      0.60      0.48       991
          3       0.38      0.30      0.34      1032
          4       0.03      0.02      0.03       980
          5       0.20      0.54      0.29       863
          6       0.11      0.12      0.12      1014
          7       0.35      0.27      0.31      1070
          8       0.17      0.25      0.20       944
          9       0.00      0.00      0.00       978

avg / total       0.33      0.31      0.30     10000

[[466 302 125  49  36  16   6   1   0   0]
 [ 38 606 275 122  56  20   8   2   0   0]
 [  1  51 591 189  79  60  14   6   0   0]
 [  1   8 430 311 112 101  40  13  16   0]
 [  0   1   0   9  21 571 323  44  11   0]
 [  5   4  36  60 133 464 100  43  18   0]
 [  0   3   5  20  86 770 121   8   1   0]
 [  0   1   7  18  40 105 192 290 417   0]
 [  0   0  11  31

In [46]:
# On test set
P=np.abs(np.round(np.real(model.predict(X_test)))).flatten()


print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))

             precision    recall  f1-score   support

          0       0.94      0.45      0.61       980
          1       0.61      0.55      0.58      1135
          2       0.40      0.59      0.48      1032
          3       0.34      0.27      0.30      1010
          4       0.03      0.02      0.02       982
          5       0.18      0.46      0.26       892
          6       0.07      0.08      0.07       958
          7       0.33      0.27      0.30      1028
          8       0.21      0.29      0.24       974
          9       0.00      0.00      0.00      1009

avg / total       0.32      0.30      0.29     10000

[[439 297 134  63  25  15   5   1   1   0]
 [ 23 625 294 115  32  41   4   1   0   0]
 [  2  61 610 193  78  58  22   5   3   0]
 [  1  17 408 272 132 105  40  23  12   0]
 [  0   0   0   6  20 551 346  47  12   0]
 [  2  12  46  48 158 410 125  62  29   0]
 [  1   5  12  38  86 733  77   6   0   0]
 [  0   2  11  24  47  97 171 276 400   0]
 [  0   3  11  26

# With Theta and SGD

In [30]:
M = mdl.Model()
M.add(layers.NonLinear(784,10,sigmoid()))
#M.add(layers.NonLinear(200,10,sigmoid()))
M.add(layers.Linear(10,1))

minim = minimizer.SGD()
sol=minim.train(mse(), M, np.transpose(X_train), Y_train.reshape(1,len(Y_train)), lr=0.1, maxiter=200, batch_size=10000, log_step=10)


('Batch C: ', 31.87743043119631)
Iteration 0 in 0.06(s), cost = 8.117631
('Batch C: ', 6.057554959999378)
Iteration 10 in 0.54(s), cost = 5.860846
('Batch C: ', 4.612933130291003)
Iteration 20 in 1.02(s), cost = 4.520134
('Batch C: ', 3.936395437084709)
Iteration 30 in 1.50(s), cost = 3.891226
('Batch C: ', 3.7096171514261966)
Iteration 40 in 1.98(s), cost = 3.777202
('Batch C: ', 4.234339501213176)
Iteration 50 in 2.46(s), cost = 3.894293
('Batch C: ', 3.3857447499345703)
Iteration 60 in 2.93(s), cost = 3.368551
('Batch C: ', 3.235045523283662)
Iteration 70 in 3.41(s), cost = 3.221552
('Batch C: ', 3.1083328177461995)
Iteration 80 in 3.89(s), cost = 3.096569
('Batch C: ', 2.9972193636453244)
Iteration 90 in 4.34(s), cost = 2.986890
('Batch C: ', 2.9012325880601773)
Iteration 100 in 4.75(s), cost = 2.892743
('Batch C: ', 2.8367884411591033)
Iteration 110 in 5.18(s), cost = 2.834637
('Batch C: ', 2.898089346268338)
Iteration 120 in 5.59(s), cost = 2.913072
('Batch C: ', 2.90410953671692

In [31]:
# On train set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_train)))))

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

             precision    recall  f1-score   support

          0       0.94      0.49      0.64      1001
          1       0.48      0.54      0.51      1127
          2       0.37      0.35      0.36       991
          3       0.50      0.58      0.54      1032
          4       0.16      0.15      0.15       980
          5       0.21      0.36      0.27       863
          6       0.28      0.46      0.35      1014
          7       0.37      0.34      0.35      1070
          8       0.16      0.18      0.17       944
          9       0.00      0.00      0.00       978

avg / total       0.35      0.35      0.34     10000

[[490 307 102  46  33  15   8   0   0   0]
 [ 22 607 309 107  55  16  11   0   0   0]
 [  2 287 348 199  85  45  19   6   0   0]
 [  1  34 121 601 129  80  39  13  14   0]
 [  0   1   3  23 143 330 426  46   8   0]
 [  4   8  20 118 203 313 151  35  11   0]
 [  1   3  10  29 108 394 467   2   0   0]
 [  0   2   6  23  45  91 198 361 344   0]
 [  0   2  12  40

In [32]:
# On test set
P=np.abs(np.round(np.real(M.predict(np.transpose(X_test)))))

print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))


             precision    recall  f1-score   support

          0       0.95      0.46      0.62       980
          1       0.50      0.56      0.52      1135
          2       0.37      0.36      0.36      1032
          3       0.48      0.56      0.52      1010
          4       0.15      0.13      0.14       982
          5       0.22      0.33      0.26       892
          6       0.25      0.47      0.33       958
          7       0.32      0.32      0.32      1028
          8       0.18      0.19      0.19       974
          9       0.00      0.00      0.00      1009

avg / total       0.34      0.34      0.33     10000

[[452 307 124  46  27  17   6   1   0   0]
 [ 18 631 298 117  36  30   4   1   0   0]
 [  6 286 369 198  95  43  28   5   2   0]
 [  2  24 130 567 151  77  32  21   6   0]
 [  0   0   2  12 127 314 471  49   7   0]
 [  0  11  20 118 204 296 169  61  13   0]
 [  0   6  23  48 105 329 446   1   0   0]
 [  0   3  13  31  48  83 179 334 337   0]
 [  0   5   9  32

# SoftmaxKeras (200 sigmoids + 10 Softmax + MSE)¶

In [None]:
model = Sequential() 

model.add(Dense(200,  input_dim=784))
model.add(Activation('sigmoid'))
model.add(Dense(10,  input_dim=784))
model.add(Activation('softmax'))


sgd = SGD(lr=0.1)

tic = time.clock()

model.compile(loss='mse', optimizer=sgd)

toc = time.clock()

print("Compile time: ",toc-tic)

tic = time.clock()

model.fit(X_train, T, batch_size=1000, nb_epoch=100, validation_data=None, shuffle=False, verbose=0)  
toc = time.clock()

print("Run time: ",toc-tic)

In [None]:
# On train set
P=np.argmax(model.predict(X_train),axis=1)

print(classification_report(Y_train,P.T))
print(confusion_matrix(Y_train, P.T))

In [None]:
# On test set
P=np.argmax(model.predict(X_test),axis=1)

print(classification_report(Y_test,P.T))
print(confusion_matrix(Y_test, P.T))

In [None]:
M = mdl.Model()
M.add(layers.NonLinear(784,200,sigmoid()))
M.add(layers.Linear(200,10))
M.add(layers.SoftMaxLayer(10))

minim = minimizer.SGD()
sol=minim.train(mse(), M, np.transpose(X_train), T.T, lr=0.1, maxiter=100, batch_size=1000, log_step=10)
