In [1]:
from keras.datasets import mnist
from keras.engine.topology import Layer
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, TimeDistributed, LSTM
from keras.utils import to_categorical
from keras import optimizers

import t3f
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Fetch the MNIST train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Affine rescale of the pixel values: 0 maps to -1.0 and 255 maps to 1.0.
x_train, x_test = (pixels / 127.5 - 1.0 for pixels in (x_train, x_test))

# One-hot encode the class labels into 10 columns, matching the 10-way
# softmax output of the models below.
y_train, y_test = (
    to_categorical(labels, num_classes=10) for labels in (y_train, y_test)
)

## LSTM

In [3]:
# Baseline classifier: every sample is fed to the LSTM as a sequence of
# 28 steps with 28 values each, and the final state is mapped to 10 classes.
model = Sequential([
    # Flattening a single 28-value step does not change the (28, 28) shape
    # (see the summary output); this wrapper also declares the input shape.
    TimeDistributed(Flatten(), input_shape=(28, 28)),
    LSTM(25),
    Dense(10),
    Activation('softmax'),
])

In [4]:
# Print per-layer output shapes and parameter counts for the baseline LSTM;
# the totals are the reference point for the TT-LSTM comparison further down.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 28, 28)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 25)                5400      
_________________________________________________________________
dense_1 (Dense)              (None, 10)                260       
_________________________________________________________________
activation_1 (Activation)    (None, 10)                0         
Total params: 5,660
Trainable params: 5,660
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Adam with learning rate 1e-3; categorical cross-entropy pairs with the
# one-hot labels and the softmax output, and accuracy is tracked as a metric.
optimizer = optimizers.Adam(lr=0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [6]:
# Train the baseline for 3 epochs in mini-batches of 64, evaluating on the
# test split after every epoch. The returned History object is the cell output.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_test, y_test),
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ffa7e82ae48>

## TT LSTM

#### In contrast to the paper 'Tensor-Train Recurrent Neural Networks for Video Classification', we can compress both the kernel and the recurrent kernel of our recurrent cells. The basic implementation from the paper would leave us with many more parameters.

In [7]:
from lstm import TT_LSTM

In [8]:
# Same architecture as the baseline, with the LSTM swapped for the
# tensor-train variant imported from the local `lstm` module.
model = Sequential([
    TimeDistributed(Flatten(), input_shape=(28, 28)),
    # NOTE(review): presumably row_dims factorizes the 28 input features
    # (4 * 7) and column_dims the 25 hidden units (5 * 5) -- confirm against
    # lstm.TT_LSTM. tt_rank=4 matches the TT-ranks (1, 4, 1) printed below.
    TT_LSTM(row_dims=[4, 7], column_dims=[5, 5], tt_rank=4),
    Dense(10),
    Activation('softmax'),
])

(?, 100)
(?, 25)
A TT-Matrix variable of size 25 x 100, underlying tensor shape: (5, 5) x (20, 5), TT-ranks: (1, 4, 1)
(?, 100)
(?, 100)
(?, 25)
A TT-Matrix variable of size 25 x 100, underlying tensor shape: (5, 5) x (20, 5), TT-ranks: (1, 4, 1)
(?, 100)


In [9]:
# Print per-layer output shapes and parameter counts for the TT-LSTM model,
# to compare its size against the baseline LSTM summary above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_2 (TimeDist (None, 28, 28)            0         
_________________________________________________________________
tt_lstm_1 (TT_LSTM)          (None, 25)                960       
_________________________________________________________________
dense_2 (Dense)              (None, 10)                260       
_________________________________________________________________
activation_2 (Activation)    (None, 10)                0         
Total params: 1,220
Trainable params: 1,220
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Adam again, but with a 10x larger learning rate (1e-2) than the baseline
# run; loss and metric are the same as for the plain LSTM.
optimizer = optimizers.Adam(lr=0.01)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [11]:
# Train the TT-LSTM model with the same schedule as the baseline: 3 epochs,
# batch size 64, validation on the test split after every epoch.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_data=(x_test, y_test),
)

Train on 60000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ffa7f0ccdd8>

#### TTLSTM gives results similar to the basic LSTM with about 5x fewer parameters (this is just a toy example; in real cases we may reach compression rates of 100x and more).

## TTLSTM + YOLO

Using such layers allows us to train recurrent detectors end-to-end, in contrast to https://arxiv.org/abs/1607.04648, where precomputed features from convolutional networks are used.
Furthermore, even with pretrained features, a default detection network such as YOLO with a plain LSTM layer would have ~200 million parameters in the LSTM alone. With TTLSTM we add only several tens of thousands of weights.

If we used the implementation from the TT-RNN classification paper, we would have ~100 million parameters, because the internal recurrent kernel is not compressed.