In [5]:
# -*- coding: utf-8 -*-
'''An implementation of sequence to sequence learning for performing addition

Input: "535+61"
Output: "596"
Padding is handled by using a repeated sentinel character (space)

Input may optionally be reversed, shown to increase performance in many tasks in:
"Learning to Execute"
http://arxiv.org/abs/1410.4615
and
"Sequence to Sequence Learning with Neural Networks"
http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf
Theoretically it introduces shorter term dependencies between source and target.

Two digits reversed:
+ One layer LSTM (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs

Three digits reversed:
+ One layer LSTM (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs

Four digits reversed:
+ One layer LSTM (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs

Five digits reversed:
+ One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs
'''  # noqa

from __future__ import print_function

# TensorFlow supplies the Keras API (tensorflow.keras) imported below.
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
import numpy as np
from six.moves import range


In [None]:
class CharacterTable(object):
    """Bidirectional mapping between characters and one-hot rows.

    Given a set of characters:
    + Encode them to a one-hot integer representation
    + Decode the one-hot or integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.

        # Arguments
            chars: Characters that can appear in the input. Duplicates are
                dropped and the remainder is sorted, so the index assigned
                to a character does not depend on the order given here.
        """
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = dict(enumerate(self.chars))

    def encode(self, C, num_rows):
        """One-hot encode given string C.

        # Arguments
            C: string, to be encoded.
            num_rows: Number of rows in the returned one-hot encoding. This is
                used to keep the # of rows for each data the same.

        # Returns
            A float array of shape `(num_rows, len(self.chars))`. Row `i` has
            a single 1 in the column of `C[i]`; rows beyond `len(C)` stay
            all-zero.

        # Raises
            KeyError: if `C` contains a character that is not in the table.
            IndexError: if `C` is longer than `num_rows`.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        """Decode the given vector or 2D array to their character output.

        # Arguments
            x: A vector or a 2D array of probabilities or one-hot representations;
                or a vector of character indices (used with `calc_argmax=False`).
            calc_argmax: Whether to find the character index with maximum
                probability, defaults to `True`.

        # Returns
            The decoded string: one character per row of a 2D `x`, a single
            character for a 1D probability vector, or one character per
            entry when `x` already holds indices.
        """
        if calc_argmax:
            # argmax over a 1D vector yields a scalar, which cannot be
            # iterated; promote it to a length-1 array so a lone probability
            # vector decodes to a single character as documented.
            x = np.atleast_1d(np.asarray(x).argmax(axis=-1))
        return ''.join(self.indices_char[index] for index in x)


class colors:
    """ANSI terminal escape sequences used to colour the per-sample report."""
    # Switch the foreground to bright green (printed before a correct guess).
    ok = '\033[92m'
    # Switch the foreground to bright red (printed before a wrong guess).
    fail = '\033[91m'
    # Reset all terminal attributes back to the default.
    close = '\033[0m'


In [None]:

# Parameters for the model and dataset.
TRAINING_SIZE = 50000
DIGITS = 3
REVERSE = True

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
MAXLEN = DIGITS + 1 + DIGITS

# All the numbers, plus sign and space for padding.
chars = '0123456789+ '
ctable = CharacterTable(chars)


def random_operand():
    """Return a random non-negative integer written with 1 to DIGITS digits.

    Leading zeros are possible in the sampled digit string and are dropped by
    `int`, so every value from 0 to 10**DIGITS - 1 can be produced.
    """
    num_digits = np.random.randint(1, DIGITS + 1)
    return int(''.join(np.random.choice(list('0123456789'))
                       for _ in range(num_digits)))


# The loop below only keeps distinct unordered pairs (a, b). If more questions
# are requested than such pairs exist, it would spin forever, so fail early.
num_operands = 10 ** DIGITS
max_questions = num_operands * (num_operands + 1) // 2
if TRAINING_SIZE > max_questions:
    raise ValueError(
        'TRAINING_SIZE={} exceeds the {} distinct questions available with '
        'DIGITS={}'.format(TRAINING_SIZE, max_questions, DIGITS))

questions = []
expected = []
seen = set()
print('Generating data...')
while len(questions) < TRAINING_SIZE:
    a, b = random_operand(), random_operand()
    # Skip any addition questions we've already seen
    # Also skip any such that x+Y == Y+x (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the data with spaces such that it is always MAXLEN.
    q = '{}+{}'.format(a, b)
    query = q + ' ' * (MAXLEN - len(q))
    ans = str(a + b)
    # Answers can be of maximum size DIGITS + 1.
    ans += ' ' * (DIGITS + 1 - len(ans))
    if REVERSE:
        # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
        # space used for padding.)
        query = query[::-1]
    questions.append(query)
    expected.append(ans)
print('Total addition questions:', len(questions))

print('Vectorization...')
# One-hot tensors: x is (samples, MAXLEN, vocabulary), y is
# (samples, DIGITS + 1, vocabulary). The builtin `bool` is used as the dtype
# because the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool)
y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=bool)
for i, sentence in enumerate(questions):
    x[i] = ctable.encode(sentence, MAXLEN)
for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS + 1)

# Shuffle (x, y) in unison as the later parts of x will almost all be larger
# digits. The same permutation is applied to both arrays so every question
# stays paired with its answer.
indices = np.arange(len(y))
np.random.shuffle(indices)
x = x[indices]
y = y[indices]

# Explicitly set apart 10% for validation data that we never train over.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print('Training Data:')
print(x_train.shape)
print(y_train.shape)

print('Validation Data:')
print(x_val.shape)
print(y_val.shape)

# Recurrent cell type shared by encoder and decoder; layers.GRU or
# layers.SimpleRNN can be substituted here.
RNN = layers.LSTM
HIDDEN_SIZE = 128
BATCH_SIZE = 128
LAYERS = 1

print('Build model...')
# Encoder: reads the whole input sequence and emits one vector of size
# HIDDEN_SIZE. For variable-length input sequences, use
# input_shape=(None, num_feature) instead.
encoder = RNN(HIDDEN_SIZE, input_shape=(MAXLEN, len(chars)))

# Bridge: feed the encoder's final output to the decoder at every time step.
# It is repeated DIGITS + 1 times because that is the longest possible
# answer, e.g., when DIGITS=3, max output is 999+999=1998.
bridge = layers.RepeatVector(DIGITS + 1)

# Decoder: one or more stacked recurrent layers. return_sequences=True makes
# each layer emit its output at every time step, shaped
# (num_samples, timesteps, output_dim), which the TimeDistributed layer
# below requires.
decoder_stack = [RNN(HIDDEN_SIZE, return_sequences=True)
                 for _ in range(LAYERS)]

# Classifier: the same dense softmax layer is applied to every time step of
# the decoder output to choose the character for that position.
classifier = layers.TimeDistributed(
    layers.Dense(len(chars), activation='softmax'))

model = Sequential([encoder, bridge] + decoder_stack + [classifier])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# Train the model one epoch per iteration and, after each epoch, show
# predictions against the validation dataset.
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=1,
              validation_data=(x_val, y_val))
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        # Index with a length-1 array to keep the leading batch dimension.
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        # `Sequential.predict_classes` was removed in TensorFlow 2.6. Taking
        # the argmax of the softmax output over the last axis gives the same
        # per-position class indices.
        preds = np.argmax(model.predict(rowx, verbose=0), axis=-1)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        # Questions were stored reversed when REVERSE is set; flip them back
        # for display.
        print('Q', q[::-1] if REVERSE else q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print(colors.ok + '☑' + colors.close, end=' ')
        else:
            print(colors.fail + '☒' + colors.close, end=' ')
        print(guess)


Generating data...
Total addition questions: 50000
Vectorization...
Training Data:
(45000, 7, 12)
(45000, 4, 12)
Validation Data:
(5000, 7, 12)
(5000, 4, 12)
Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 128)               72192     
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 4, 128)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 4, 128)            131584    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 4, 12)             1548      
Total params: 205,324
Trainable params: 205,324
Non-trainable params: 0
_________________________________________________________________

--------------------------------------------------
Iteration 1
Train on 45000 samples, validat

Q 206+500 T 706  [91m☒[0m 616 
Q 361+19  T 380  [92m☑[0m 380 
Q 89+29   T 118  [92m☑[0m 118 
Q 5+596   T 601  [91m☒[0m 502 
Q 283+952 T 1235 [92m☑[0m 1235
Q 823+58  T 881  [92m☑[0m 881 
Q 421+345 T 766  [92m☑[0m 766 
Q 138+24  T 162  [91m☒[0m 163 
Q 2+947   T 949  [91m☒[0m 948 
Q 423+380 T 803  [92m☑[0m 803 

--------------------------------------------------
Iteration 14
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 576+743 T 1319 [92m☑[0m 1319
Q 223+631 T 854  [91m☒[0m 844 
Q 887+918 T 1805 [91m☒[0m 1705
Q 704+251 T 955  [92m☑[0m 955 
Q 219+0   T 219  [92m☑[0m 219 
Q 436+924 T 1360 [92m☑[0m 1360
Q 41+502  T 543  [91m☒[0m 544 
Q 75+250  T 325  [92m☑[0m 325 
Q 45+43   T 88   [91m☒[0m 87  
Q 44+59   T 103  [92m☑[0m 103 

--------------------------------------------------
Iteration 15
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 532+12  T 544  [91m☒[0m 545 
Q 67+476  T 543  [92m☑[0m 543 
Q 269+119 T 388  [92m☑[0

Q 346+5   T 351  [92m☑[0m 351 
Q 291+47  T 338  [92m☑[0m 338 
Q 980+37  T 1017 [92m☑[0m 1017
Q 13+328  T 341  [92m☑[0m 341 
Q 0+201   T 201  [92m☑[0m 201 
Q 415+99  T 514  [92m☑[0m 514 
Q 40+894  T 934  [92m☑[0m 934 
Q 32+642  T 674  [92m☑[0m 674 
Q 470+0   T 470  [92m☑[0m 470 
Q 350+62  T 412  [92m☑[0m 412 

--------------------------------------------------
Iteration 28
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 693+1   T 694  [92m☑[0m 694 
Q 592+510 T 1102 [92m☑[0m 1102
Q 752+8   T 760  [92m☑[0m 760 
Q 32+778  T 810  [92m☑[0m 810 
Q 473+5   T 478  [92m☑[0m 478 
Q 831+405 T 1236 [92m☑[0m 1236
Q 155+833 T 988  [92m☑[0m 988 
Q 1+597   T 598  [92m☑[0m 598 
Q 229+217 T 446  [92m☑[0m 446 
Q 224+308 T 532  [92m☑[0m 532 

--------------------------------------------------
Iteration 29
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 581+64  T 645  [92m☑[0m 645 
Q 53+960  T 1013 [92m☑[0m 1013
Q 883+587 T 1470 [92m☑[0

Q 85+61   T 146  [92m☑[0m 146 
Q 516+8   T 524  [92m☑[0m 524 
Q 640+5   T 645  [92m☑[0m 645 
Q 435+9   T 444  [92m☑[0m 444 
Q 160+97  T 257  [92m☑[0m 257 
Q 371+62  T 433  [92m☑[0m 433 
Q 913+210 T 1123 [92m☑[0m 1123
Q 807+26  T 833  [92m☑[0m 833 
Q 926+2   T 928  [92m☑[0m 928 
Q 97+516  T 613  [92m☑[0m 613 

--------------------------------------------------
Iteration 42
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 932+98  T 1030 [92m☑[0m 1030
Q 29+963  T 992  [92m☑[0m 992 
Q 94+494  T 588  [92m☑[0m 588 
Q 86+324  T 410  [92m☑[0m 410 
Q 52+29   T 81   [92m☑[0m 81  
Q 99+25   T 124  [92m☑[0m 124 
Q 491+0   T 491  [92m☑[0m 491 
Q 532+1   T 533  [92m☑[0m 533 
Q 713+93  T 806  [92m☑[0m 806 
Q 276+2   T 278  [92m☑[0m 278 

--------------------------------------------------
Iteration 43
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 67+287  T 354  [92m☑[0m 354 
Q 72+598  T 670  [92m☑[0m 670 
Q 75+250  T 325  [92m☑[0

Q 906+73  T 979  [92m☑[0m 979 
Q 133+988 T 1121 [92m☑[0m 1121
Q 951+508 T 1459 [92m☑[0m 1459
Q 347+893 T 1240 [92m☑[0m 1240
Q 290+442 T 732  [92m☑[0m 732 
Q 833+8   T 841  [92m☑[0m 841 
Q 637+98  T 735  [92m☑[0m 735 
Q 630+585 T 1215 [92m☑[0m 1215
Q 271+1   T 272  [92m☑[0m 272 
Q 773+148 T 921  [92m☑[0m 921 

--------------------------------------------------
Iteration 56
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 932+3   T 935  [92m☑[0m 935 
Q 279+67  T 346  [92m☑[0m 346 
Q 82+42   T 124  [92m☑[0m 124 
Q 635+496 T 1131 [92m☑[0m 1131
Q 532+14  T 546  [92m☑[0m 546 
Q 69+471  T 540  [92m☑[0m 540 
Q 699+723 T 1422 [92m☑[0m 1422
Q 283+1   T 284  [92m☑[0m 284 
Q 953+76  T 1029 [92m☑[0m 1029
Q 750+83  T 833  [92m☑[0m 833 

--------------------------------------------------
Iteration 57
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 343+34  T 377  [92m☑[0m 377 
Q 0+145   T 145  [92m☑[0m 145 
Q 46+641  T 687  [92m☑[0

Q 207+62  T 269  [92m☑[0m 269 
Q 242+351 T 593  [92m☑[0m 593 
Q 5+116   T 121  [92m☑[0m 121 
Q 735+88  T 823  [92m☑[0m 823 
Q 810+793 T 1603 [92m☑[0m 1603
Q 425+888 T 1313 [92m☑[0m 1313
Q 49+828  T 877  [92m☑[0m 877 
Q 402+794 T 1196 [92m☑[0m 1196
Q 222+978 T 1200 [92m☑[0m 1200
Q 211+207 T 418  [92m☑[0m 418 

--------------------------------------------------
Iteration 70
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 52+37   T 89   [92m☑[0m 89  
Q 61+47   T 108  [92m☑[0m 108 
Q 3+15    T 18   [92m☑[0m 18  
Q 10+389  T 399  [92m☑[0m 399 
Q 655+77  T 732  [92m☑[0m 732 
Q 218+47  T 265  [92m☑[0m 265 
Q 751+26  T 777  [92m☑[0m 777 
Q 77+271  T 348  [92m☑[0m 348 
Q 633+190 T 823  [92m☑[0m 823 
Q 10+205  T 215  [92m☑[0m 215 

--------------------------------------------------
Iteration 71
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 83+46   T 129  [92m☑[0m 129 
Q 570+60  T 630  [92m☑[0m 630 
Q 178+606 T 784  [92m☑[0

Q 657+6   T 663  [92m☑[0m 663 
Q 823+122 T 945  [92m☑[0m 945 
Q 9+555   T 564  [92m☑[0m 564 
Q 570+365 T 935  [92m☑[0m 935 
Q 221+824 T 1045 [92m☑[0m 1045
Q 6+678   T 684  [92m☑[0m 684 
Q 56+176  T 232  [92m☑[0m 232 
Q 71+460  T 531  [92m☑[0m 531 
Q 710+353 T 1063 [92m☑[0m 1063
Q 6+539   T 545  [92m☑[0m 545 

--------------------------------------------------
Iteration 84
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 643+63  T 706  [92m☑[0m 706 
Q 34+616  T 650  [92m☑[0m 650 
Q 74+6    T 80   [92m☑[0m 80  
Q 402+948 T 1350 [92m☑[0m 1350
Q 573+3   T 576  [92m☑[0m 576 
Q 216+581 T 797  [92m☑[0m 797 
Q 315+352 T 667  [92m☑[0m 667 
Q 14+267  T 281  [92m☑[0m 281 
Q 952+781 T 1733 [92m☑[0m 1733
Q 9+357   T 366  [92m☑[0m 366 

--------------------------------------------------
Iteration 85
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 602+6   T 608  [92m☑[0m 608 
Q 77+205  T 282  [92m☑[0m 282 
Q 603+985 T 1588 [92m☑[0

Q 26+207  T 233  [92m☑[0m 233 
Q 19+43   T 62   [92m☑[0m 62  
Q 76+349  T 425  [92m☑[0m 425 
Q 21+201  T 222  [92m☑[0m 222 
Q 340+8   T 348  [92m☑[0m 348 
Q 423+663 T 1086 [92m☑[0m 1086
Q 306+19  T 325  [92m☑[0m 325 
Q 54+224  T 278  [92m☑[0m 278 
Q 415+479 T 894  [92m☑[0m 894 
Q 46+85   T 131  [92m☑[0m 131 

--------------------------------------------------
Iteration 98
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 797+472 T 1269 [92m☑[0m 1269
Q 99+631  T 730  [92m☑[0m 730 
Q 68+935  T 1003 [92m☑[0m 1003
Q 53+568  T 621  [92m☑[0m 621 
Q 80+31   T 111  [92m☑[0m 111 
Q 20+841  T 861  [92m☑[0m 861 
Q 432+883 T 1315 [92m☑[0m 1315
Q 8+174   T 182  [92m☑[0m 182 
Q 79+154  T 233  [92m☑[0m 233 
Q 55+504  T 559  [92m☑[0m 559 

--------------------------------------------------
Iteration 99
Train on 45000 samples, validate on 5000 samples
Epoch 1/1
Q 0+970   T 970  [92m☑[0m 970 
Q 27+155  T 182  [92m☑[0m 182 
Q 15+764  T 779  [92m☑[0