## Seq-to-Seq RNN to solve equations given in string format

Problem Statement:
Given the string "54+7", the model should return the prediction "61". Subtraction works the same way: given "54-7", it should return "47".

Approach:
In this project, we create an RNN model and train it to learn the meaning of individual characters and to perform simple addition and subtraction. The model must infer what each character means and then learn the arithmetic purely from the example data. RNNs are well suited to a problem like this because both the input and the output are sequences: the model has to read the input sequence and then predict an output sequence.

In [26]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import TimeDistributed, Dense, Dropout, SimpleRNN, RepeatVector
from tensorflow.keras.callbacks import EarlyStopping, LambdaCallback

from termcolor import colored

# Show the TensorFlow version the notebook was written against next to the
# version actually installed, so version-related differences are easy to spot.
print('Tested with tensorflow version 2.0.1')
print('Using tensorflow version:', tf.__version__)

Tested with tensorflow version 2.0.1
Using tensorflow version: 2.9.0


### Generate Data


In [27]:
# Vocabulary: the ten digits plus the two operator symbols. The position of a
# character in this string is used as its one-hot index further down.
all_chars = '0123456789+-'

In [28]:
# One one-hot slot per character of the vocabulary.
num_features = len(all_chars)

# Lookup tables in both directions: character -> slot and slot -> character.
char_to_index = {ch: idx for idx, ch in enumerate(all_chars)}
index_to_char = dict(enumerate(all_chars))

print('Number of features:', num_features)
print(char_to_index)
print(index_to_char)

Number of features: 12
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '+': 10, '-': 11}
{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '+', 11: '-'}


In [29]:
# Scratch check of the random comparison used to choose between '+' and '-':
# draws an integer from 0..99 and tests whether it is greater than 50.
np.random.randint(low=0, high=100)>50

True

In [30]:
def generate_data():
    """Generate one random arithmetic example and its answer as strings.

    Two operands are drawn uniformly from 0..99 and joined by '+' or '-',
    each operator being chosen with probability 1/2.

    Returns:
        tuple[str, str]: ``(example, label)``, e.g. ``('58+38', '96')``.
        The label of a subtraction can be negative, e.g. ``'-27'``.
    """
    first_num = np.random.randint(low=0, high=100)
    second_num = np.random.randint(low=0, high=100)
    # Fair coin flip. The earlier `randint(0, 100) > 50` test was true for
    # only 49 of the 100 possible draws, slightly favouring subtraction.
    add = np.random.randint(low=0, high=2) == 1
    if add:
        example = str(first_num) + '+' + str(second_num)
        label = str(first_num + second_num)
    else:
        example = str(first_num) + '-' + str(second_num)
        label = str(first_num - second_num)
    return example, label

# Draw one sample to eyeball the (example, label) format.
generate_data()

('58+38', '96')

### Create the Model

In [31]:
# Size of the recurrent state and the fixed length of every padded sequence.
hidden_units = 128
max_time_steps = 5

model = Sequential()
# Encoder: reads the one-hot input sequence and returns a single vector.
model.add(SimpleRNN(hidden_units, input_shape=(None, num_features)))
# Repeat that vector once per output time step so the decoder has a sequence to read.
model.add(RepeatVector(max_time_steps))
# Decoder: emits one hidden state per output time step.
model.add(SimpleRNN(hidden_units, return_sequences=True))
# Per-step softmax over the character vocabulary.
model.add(TimeDistributed(Dense(num_features, activation='softmax')))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (None, 128)               18048     
                                                                 
 repeat_vector_1 (RepeatVect  (None, 5, 128)           0         
 or)                                                             
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 5, 128)            32896     
                                                                 
 time_distributed_1 (TimeDis  (None, 5, 12)            1548      
 tributed)                                                       
                                                                 
Total params: 52,492
Trainable params: 52,492
Non-trainable params: 0
_________________________________________________________________


### Vectorize and De-Vectorize Data

In [32]:
def vectorize_example(example, label):
    """One-hot encode an example string and its label string.

    Both strings are left-padded with '0' up to ``max_time_steps`` characters
    and every character is then encoded as a one-hot row.

    Args:
        example: Input expression such as ``'43-29'``.
        label: Expected answer such as ``'14'``.

    Returns:
        tuple: ``(x, y)``, two arrays of shape ``(max_time_steps, num_features)``.

    Raises:
        ValueError: If either string is longer than ``max_time_steps``.
            Previously this produced negative row indices and silently
            wrote characters into the wrong time steps.
        KeyError: If a character is not present in ``char_to_index``.
    """
    def _one_hot(text, what):
        # Encode one string, padding on the left with the '0' character.
        if len(text) > max_time_steps:
            raise ValueError(
                '{} {!r} is longer than max_time_steps={}'.format(what, text, max_time_steps)
            )
        encoded = np.zeros((max_time_steps, num_features))
        for step, ch in enumerate(text.rjust(max_time_steps, '0')):
            encoded[step, char_to_index[ch]] = 1
        return encoded

    return _one_hot(example, 'example'), _one_hot(label, 'label')

# Sanity check: encode one random sample and confirm the array shapes.
# `x` and `y` are reused by the de-vectorization cells below.
e, l = generate_data()
print('Text Example and Label:', e, l)
x, y = vectorize_example(e, l)
print('Vectorized Example and Label Shapes:', x.shape, y.shape)

Text Example and Label: 43-29 14
Vectorized Example and Label Shapes: (5, 12) (5, 12)


In [33]:
def devectorize_example(example):
    """Turn a sequence of per-step score/one-hot vectors back into a string.

    For every row the index of the largest entry is looked up in
    ``index_to_char``; the resulting characters are concatenated in order.
    Padding is not removed here.
    """
    decoded = []
    for step_vector in example:
        best_index = np.argmax(step_vector)
        decoded.append(index_to_char[best_index])
    return ''.join(decoded)

def strip_padding(example):
    """Remove the '0' padding from a decoded example or label string.

    Leading zeros of every digit run are dropped. A run that consists only of
    zeros is reduced to a single '0' when it is the whole string (the value
    zero, e.g. ``'00000'`` -> ``'0'``) or when it follows an operator (a zero
    operand, e.g. ``'05+0'`` -> ``'5+0'``). Previously both cases lost their
    only digit and produced ``''`` and ``'5+'``.

    An all-zero run *before* the first operator is still dropped entirely,
    because padding and a leading zero operand are indistinguishable there
    (``'00-27'`` is both the padded label ``'-27'`` and the padded input
    ``'0-27'``); it decodes to ``'-27'`` as before.

    Args:
        example: Decoded string, possibly left-padded with '0'.

    Returns:
        str: The string without padding zeros.
    """
    pieces = []
    seen_operator = False   # an operator has already been emitted
    run_has_digit = False   # current digit run has emitted a significant digit
    run_is_zero = False     # current digit run so far consists only of zeros
    for c in example:
        if c in ('+', '-'):
            # A zero-only operand after an operator is a real 0, not padding.
            if run_is_zero and seen_operator:
                pieces.append('0')
            pieces.append(c)
            seen_operator = True
            run_has_digit = False
            run_is_zero = False
        elif c == '0' and not run_has_digit:
            # Leading zero of the current run: hold it back for now.
            run_is_zero = True
        else:
            run_has_digit = True
            run_is_zero = False
            pieces.append(c)
    # A trailing zero-only run is the value 0 itself (result or last operand).
    if run_is_zero:
        pieces.append('0')
    return ''.join(pieces)

# Round-trip check: decoding the encoded example should give back its text.
devectorize_example(x)

'43-29'

In [34]:
# Show the decoded label with its padding and after stripping the padding.
print(devectorize_example(y), ':', strip_padding(devectorize_example(y)))

00014 : 14


### Create Dataset

In [35]:
def create_dataset(num_examples=2000):
    """Build a randomly generated, one-hot encoded dataset.

    Args:
        num_examples: Number of (example, label) pairs to generate.

    Returns:
        tuple: ``(x_train, y_train)``, each of shape
        ``(num_examples, max_time_steps, num_features)``.
    """
    shape = (num_examples, max_time_steps, num_features)
    x_train = np.zeros(shape)
    y_train = np.zeros(shape)

    for row in range(num_examples):
        text, answer = generate_data()
        x_train[row], y_train[row] = vectorize_example(text, answer)

    return x_train, y_train

# Build the training set (20000 random examples) and confirm its shape.
x_train, y_train = create_dataset(20000)
print(x_train.shape, y_train.shape)

(20000, 5, 12) (20000, 5, 12)


In [36]:
# Decode the first training input to check that it looks sensible.
devectorize_example(x_train[0])

'68-66'

In [37]:
# Decode the matching label; the padding zeros are still present here.
devectorize_example(y_train[0])

'00002'

### Training the Model

In [38]:
# Compact progress log: print only the validation accuracy after each epoch.
simple_logger = LambdaCallback(
    on_epoch_end=lambda epoch, logs: print(
        '{:.2f}'.format(logs['val_accuracy']), end=' _ '
    )
)
# Stop once the validation loss has not improved for 10 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

model.fit(
    x_train,
    y_train,
    epochs=100,
    validation_split=0.2,
    verbose=False,
    batch_size=1024,
    callbacks=[simple_logger, early_stopping],
)

0.57 _ 0.62 _ 0.62 _ 0.64 _ 0.65 _ 0.65 _ 0.66 _ 0.66 _ 0.68 _ 0.69 _ 0.70 _ 0.71 _ 0.72 _ 0.73 _ 0.74 _ 0.75 _ 0.75 _ 0.75 _ 0.76 _ 0.77 _ 0.77 _ 0.77 _ 0.78 _ 0.79 _ 0.78 _ 0.79 _ 0.79 _ 0.79 _ 0.79 _ 0.80 _ 0.80 _ 0.81 _ 0.81 _ 0.82 _ 0.83 _ 0.84 _ 0.85 _ 0.86 _ 0.86 _ 0.88 _ 0.88 _ 0.89 _ 0.90 _ 0.90 _ 0.91 _ 0.92 _ 0.93 _ 0.94 _ 0.94 _ 0.95 _ 0.95 _ 0.95 _ 0.95 _ 0.96 _ 0.96 _ 0.97 _ 0.97 _ 0.96 _ 0.97 _ 0.97 _ 0.97 _ 0.97 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.98 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.98 _ 0.98 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 0.99 _ 

<keras.callbacks.History at 0x7faee46e4bb0>

### Create a test set and look at some predictions

In [39]:
# Draw a small fresh test set and let the model predict on it.
x_test, y_test = create_dataset(num_examples=20)
preds = model.predict(x_test)
full_seq_acc = 0

for i, pred in enumerate(preds):
    # Decode input, target and prediction back to padding-free strings.
    x_test_str = strip_padding(devectorize_example(x_test[i]))
    y_test_str = strip_padding(devectorize_example(y_test[i]))
    pred_str = strip_padding(devectorize_example(pred))

    # A prediction only counts when the whole output sequence matches.
    is_match = pred_str == y_test_str
    full_seq_acc += 1/len(preds) * int(is_match)

    col = 'green' if is_match else 'red'
    outstring = 'Input: {}, Out: {}, Pred: {}'.format(x_test_str, y_test_str, pred_str)
    print(colored(outstring, col))

print('\nFull sequence accuracy: {:.3f} %'.format(100 * full_seq_acc))

[32mInput: 3+59, Out: 62, Pred: 62[0m
[32mInput: 1-28, Out: -27, Pred: -27[0m
[32mInput: 68+5, Out: 73, Pred: 73[0m
[32mInput: 81+55, Out: 136, Pred: 136[0m
[32mInput: 14+94, Out: 108, Pred: 108[0m
[32mInput: 49-93, Out: -44, Pred: -44[0m
[32mInput: 90+36, Out: 126, Pred: 126[0m
[32mInput: 98+50, Out: 148, Pred: 148[0m
[32mInput: 30-71, Out: -41, Pred: -41[0m
[32mInput: 84-4, Out: 80, Pred: 80[0m
[32mInput: 9+55, Out: 64, Pred: 64[0m
[32mInput: 7-70, Out: -63, Pred: -63[0m
[32mInput: 69-61, Out: 8, Pred: 8[0m
[32mInput: 69+84, Out: 153, Pred: 153[0m
[32mInput: 10+32, Out: 42, Pred: 42[0m
[31mInput: 2-93, Out: -91, Pred: -81[0m
[32mInput: 82-33, Out: 49, Pred: 49[0m
[32mInput: 74+92, Out: 166, Pred: 166[0m
[32mInput: 90+87, Out: 177, Pred: 177[0m
[32mInput: 69+6, Out: 75, Pred: 75[0m

Full sequence accuracy: 95.000 %
