In [222]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import RepeatVector, TimeDistributed
from math import sqrt
from sklearn.metrics import mean_squared_error
import operator

MAX_Q_LEN = 20
MAX_A_LEN = 5

In [223]:
# Helper functions to read data

from pathlib import Path
import re
import numpy as np

def ps(s):
    """Process String: convert a string into a list of lowercased words."""
    return [word.strip() for word in re.split(r'([+-/*()?]|\d|\w)', s) if word.strip()]

def read_data(filepath):
    '''
    returns character lists of questions and answers.
    '''
    # q,a lists
    X = []
    y = []

    with open(filepath) as dataset_file:
        # Grabbing a subset of the entire file
        for i in range(100000):
            line_q = dataset_file.readline().strip()
            line_a = dataset_file.readline().strip()
            if(len(line_q) < MAX_Q_LEN and len(line_a) < MAX_A_LEN):
                X.append(ps(line_q))
                y.append(ps(line_a))
            
    return X,y

def pad_data(X,y, max_question_len, max_answer_len):
    # dataset is of form [(q,a)]
    X_padded = list()
    for q in X:
        qpad =  [' ' for _ in range(max_question_len-len(q))] + q
        X_padded.append(qpad)
    y_padded = list()
    for a in y:
        apad =  [' ' for _ in range(max_answer_len-len(a))] + a
        y_padded.append(apad)
    return X_padded,y_padded

In [224]:
def create_alphabet_index(X,pad_char):

    char_to_int = {}

    for q in X:

        for word in q:
            if word not in char_to_int:
                char_to_int[word] = len(char_to_int)
    
    char_to_int[pad_char] = len(char_to_int)
    int_to_char = dict([(char_to_int[char],char) for char in char_to_int])

    return (char_to_int,int_to_char)

def encode_data(X,y,char_to_int):
    Xenc = list()
    for pattern in X:
        integer_encoded = [char_to_int[char] for char in pattern]
        Xenc.append(integer_encoded)
    yenc = list()
    for pattern in y:
        integer_encoded = [char_to_int[char] for char in pattern]
        yenc.append(integer_encoded)
    return Xenc, yenc

# one hot encode
def one_hot_encode(X, y, vocab_size):
	Xenc = list()
	for seq in X:
		pattern = list()
		for index in seq:
			vector = [0 for _ in range(vocab_size)]
			vector[index] = 1
			pattern.append(vector)
		Xenc.append(pattern)
	yenc = list()
	for seq in y:
		pattern = list()
		for index in seq:
			vector = [0 for _ in range(vocab_size)]
			vector[index] = 1
			pattern.append(vector)
		yenc.append(pattern)
	return Xenc, yenc

def one_hot_decode(seq, int_to_char):
	strings = list()
	for pattern in seq:
		string = int_to_char[np.argmax(pattern)]
		strings.append(string)
	return ''.join(strings)

def process_data(dataset_filename):
    X,y = read_data(dataset_filename)
    char_to_int,int_to_char = create_alphabet_index(X,' ')
    X,y = pad_data(X,y,MAX_Q_LEN,MAX_A_LEN)
    X,y = encode_data(X,y,char_to_int)
    X,y = one_hot_encode(X,y,len(char_to_int))
    return (X, y, char_to_int,int_to_char)

In [225]:
# Alternative method to generate dataset, in case our original dataset doesn't work.

valid_characters = '0123456789.+*-/ '
char_to_int = dict((character, index) for index, character in  enumerate(valid_characters))
int_to_char = dict((index, character) for index, character in  enumerate(valid_characters))

number_max = 100 #Up to this number
repeat_steps = len(str(number_max-1)) * 2 + 1
operators = ['+', '*', '-', '/']
operators_dict = { "+":operator.add, 
                  '*':operator.mul, 
                  "-":operator.sub,
                  '/':operator.truediv}

def oper_generator():
    number_1 = np.random.randint(1,number_max)
    operator_index = np.random.randint(0,len(operators))
    operator = operators[operator_index]
    number_2 = np.random.randint(1,number_max)
    number_1= max(number_1,number_2)
    number_2=min(number_1,number_2)
    operation = str(number_1) + operator + str(number_2)
    result = str(operators_dict[operator](number_1,number_2))[:repeat_steps]
    return ps(operation), ps(result)

def data_generator(training_size,test_size):
    x_train = []
    x_test = []
    y_train = []
    y_test = []
    for i in (range(0, training_size)):
        x, y = oper_generator()
        x_train.append(x)
        y_train.append(y)
    for i in (range(0, test_size)):
        x, y = oper_generator()
        x_test.append(x)
        y_test.append(y)
    X,y = pad_data(x_train,y_train,repeat_steps,repeat_steps)
    X,y = encode_data(X,y,char_to_int)
    X,y = one_hot_encode(X,y,len(char_to_int))
    X_test,y_test = pad_data(x_test,y_test,repeat_steps,repeat_steps)
    X_test,y_test = encode_data(X_test,y_test,char_to_int)
    X_test,y_test = one_hot_encode(X_test,y_test,len(char_to_int))
    return X,y,X_test,y_test

data_points = 10000
test_size = 0.2
training_size = int(round(data_points * (1-test_size),0))
test_size = data_points - training_size

In [233]:
n_batch = 256
n_epoch = 20

dataset_filename = Path("../train_data/arithmetic__mixed.txt")
#X,y,char_to_int,int_to_char = process_data(dataset_filename)
X,y, _, _ = data_generator(training_size,test_size)
model = Sequential()
model.add(LSTM(256, input_shape=(None, len(char_to_int))))
model.add(RepeatVector(repeat_steps))
model.add(LSTM(1024, return_sequences=True))
model.add(TimeDistributed(Dense(len(char_to_int), activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# train LSTM
for i in range(n_epoch):
	print(i)
	model.fit(np.array(X), np.array(y), epochs=1, batch_size=n_batch)

Model: "sequential_46"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_88 (LSTM)              (None, 256)               295936    
                                                                 
 repeat_vector_41 (RepeatVec  (None, 5, 256)           0         
 tor)                                                            
                                                                 
 lstm_89 (LSTM)              (None, 5, 1024)           5246976   
                                                                 
 time_distributed_40 (TimeDi  (None, 5, 32)            32800     
 stributed)                                                      
                                                                 
Total params: 5,575,712
Trainable params: 5,575,712
Non-trainable params: 0
_________________________________________________________________
None
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [234]:
# evaluate on some new patterns
X , y, _,_ = process_data(dataset_filename)
result = model.predict(X, batch_size=n_batch, verbose=0)
# calculate error
expected = [one_hot_decode(x, int_to_char) for x in y]
predicted = [one_hot_decode(x, int_to_char) for x in result]
# show some examples
for i in range(100):
	print('Expected=%s, Predicted=%s' % (expected[i], predicted[i]))

KeyboardInterrupt: 