# Data Preprocessing

## Recursive Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [10]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [11]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [12]:
# experiment configuration
# data_size, seq_len and num_size all appear in the input/output folder names
data_size = 10000
seq_len = 5  # also scales the number of '<pos_i>' target tokens
num_size = 10  # also sets how many number tokens the target vocabulary has
method = 'recursion'  # root folder of the preprocessed output

In [13]:
# folder holding the raw text files, keyed by the data configuration
data_src = 'aes'
config_parts = (
    ('num_size', num_size),
    ('seq_len', seq_len),
    ('data_size', data_size),
)
indir = os.path.join(
    data_src,
    *('{}_{}'.format(key, value) for key, value in config_parts))
indir

'aes/num_size_10/seq_len_5/data_size_10000'

In [14]:
# save path: <method>/num_size_<..>/seq_len_<..>/data_size_<..>
outdir = os.path.join(method,
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# folder already exists and cannot fail if the folder is created in between
os.makedirs(outdir, exist_ok=True)
outdir

'recursion/num_size_10/seq_len_5/data_size_10000'

In [126]:
# read the six raw text files (source x / target y for train, val and test)
(raw_train_xs, raw_train_ys,
 raw_val_xs, raw_val_ys,
 raw_test_xs, raw_test_ys) = [
    load_txt(os.path.join(indir, file_name))
    for file_name in (
        'train_x.txt', 'train_y.txt',
        'val_x.txt', 'val_y.txt',
        'test_x.txt', 'test_y.txt',
    )
]

In [127]:
# report how many samples and labels each split holds
size_report = (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
)
for label, lines in size_report:
    print(label, len(lines))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [128]:
# tokenize the raw train targets and sources with the white space tokenizer
train_ys, train_xs = map(white_space_tokenizer, (raw_train_ys, raw_train_xs))

In [104]:
# take a look at the last (up to) ten source/target pairs
# slicing instead of indexing with range(-10, 0) keeps this cell from raising
# IndexError when the split holds fewer than ten samples
for src, tgt in zip(train_xs[-10:], train_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: ['(', '7', '+', '3', ')', '+', '11', '-', '(', '8', '+', '3', ')', '-', '(', '3', '+', '2', ')', '==', '5']
tgt: ['10', '+', '11', '-', '11', '-', '5', '==', '5']

src: ['5', '+', '(', '3', '+', '7', ')', '-', '8', '-', '(', '-', '3', '+', '5', ')', '==', '5']
tgt: ['5', '+', '10', '-', '8', '-', '2', '==', '5']

src: ['9', '-', '3', '/', '6', '*', '10', '==', '4']
tgt: ['9', '-', '3', '/', '6', '*', '10', '==', '4']

src: ['-', '(', '7', '-', '4', ')', '+', '9', '+', '7', '-', '11', '==', '2']
tgt: ['-', '3', '+', '9', '+', '7', '-', '11', '==', '2']

src: ['6', '+', '2', '+', '10', '/', '5', '==', '(', '2', '+', '8', ')']
tgt: ['6', '+', '2', '+', '10', '/', '5', '==', '10']

src: ['(', '2', '+', '8', ')', '*', '7', '/', '(', '5', '+', '2', ')', '-', '6', '==', '4']
tgt: ['10', '*', '7', '/', '7', '-', '6', '==', '4']

src: ['7', '-', '11', '+', '7', '+', '8', '==', '11']
tgt: ['7', '-', '11', '+', '7', '+', '8', '==', '11']

src: ['9', '-', '(', '3', '+', '5', ')', '-', '3', '+

In [109]:
# count how often every token occurs in the tokenized train sources
counter = Counter(token for tokens in train_xs for token in tokens)

print(len(counter))
print(counter.most_common())

17
[('+', 18293), ('-', 15862), ('(', 14072), (')', 14072), ('2', 8981), ('3', 7126), ('==', 7000), ('4', 6445), ('5', 5660), ('6', 5009), ('7', 4365), ('*', 4279), ('9', 4127), ('8', 4059), ('/', 2733), ('10', 1831), ('11', 1469)]


In [110]:
# source tokens in sorted (string) order; iterating a Counter yields its keys
src_vocab_list = sorted(counter)
print(src_vocab_list)

['(', ')', '*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [111]:
# source vocabulary dictionary
# index 0 is reserved for '<pad>' (to pad sequence length);
# the sorted source tokens follow from index 1
src_vocab2idx_dict = {'<pad>': 0}
src_vocab2idx_dict.update(
    (token, idx) for idx, token in enumerate(src_vocab_list, start=1))

print(src_vocab2idx_dict)

{'<pad>': 0, '(': 1, ')': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '11': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '==': 17}


In [118]:
# target tokens: the numbers 2 .. num_size+1, then seq_len*7 position tags,
# then the '<done>' marker
number_tokens = [str(number) for number in range(2, num_size + 2)]
position_tokens = ['<pos_{}>'.format(pos) for pos in range(seq_len * 7)]
tgt_vocab_list = number_tokens + position_tokens + ['<done>']
print(len(tgt_vocab_list))
print(tgt_vocab_list)

46
['2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '<pos_0>', '<pos_1>', '<pos_2>', '<pos_3>', '<pos_4>', '<pos_5>', '<pos_6>', '<pos_7>', '<pos_8>', '<pos_9>', '<pos_10>', '<pos_11>', '<pos_12>', '<pos_13>', '<pos_14>', '<pos_15>', '<pos_16>', '<pos_17>', '<pos_18>', '<pos_19>', '<pos_20>', '<pos_21>', '<pos_22>', '<pos_23>', '<pos_24>', '<pos_25>', '<pos_26>', '<pos_27>', '<pos_28>', '<pos_29>', '<pos_30>', '<pos_31>', '<pos_32>', '<pos_33>', '<pos_34>', '<done>']


In [119]:
# target vocabulary dictionary
# '<pad>' (to pad sequence length) and '<s>' (to mark the start of a sequence)
# take the first two indexes; the target tokens continue from there
tgt_vocab2idx_dict = {'<pad>': 0, '<s>': 1}
first_free_idx = len(tgt_vocab2idx_dict)
tgt_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(tgt_vocab_list, start=first_free_idx))

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9, '10': 10, '11': 11, '<pos_0>': 12, '<pos_1>': 13, '<pos_2>': 14, '<pos_3>': 15, '<pos_4>': 16, '<pos_5>': 17, '<pos_6>': 18, '<pos_7>': 19, '<pos_8>': 20, '<pos_9>': 21, '<pos_10>': 22, '<pos_11>': 23, '<pos_12>': 24, '<pos_13>': 25, '<pos_14>': 26, '<pos_15>': 27, '<pos_16>': 28, '<pos_17>': 29, '<pos_18>': 30, '<pos_19>': 31, '<pos_20>': 32, '<pos_21>': 33, '<pos_22>': 34, '<pos_23>': 35, '<pos_24>': 36, '<pos_25>': 37, '<pos_26>': 38, '<pos_27>': 39, '<pos_28>': 40, '<pos_29>': 41, '<pos_30>': 42, '<pos_31>': 43, '<pos_32>': 44, '<pos_33>': 45, '<pos_34>': 46, '<done>': 47}


### Val

In [121]:
# tokenize the raw val sources and targets with the white space tokenizer
val_xs, val_ys = map(white_space_tokenizer, (raw_val_xs, raw_val_ys))

In [122]:
# take a look at the last (up to) ten source/target pairs
# slicing instead of indexing with range(-10, 0) keeps this cell from raising
# IndexError when the split holds fewer than ten samples
for src, tgt in zip(val_xs[-10:], val_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: ['-', '(', '4', '+', '5', ')', '+', '7', '+', '4', '+', '9', '==', '11']
tgt: ['-', '9', '+', '7', '+', '4', '+', '9', '==', '11']

src: ['-', '8', '/', '(', '7', '-', '3', ')', '+', '9', '+', '(', '8', '-', '5', ')', '==', '(', '5', '*', '2', ')']
tgt: ['-', '8', '/', '4', '+', '9', '+', '3', '==', '10']

src: ['8', '+', '(', '-', '6', '+', '9', ')', '*', '2', '/', '(', '2', '+', '4', ')', '==', '9']
tgt: ['8', '+', '3', '*', '2', '/', '6', '==', '9']

src: ['8', '/', '2', '+', '2', '+', '3', '==', '9']
tgt: ['8', '/', '2', '+', '2', '+', '3', '==', '9']

src: ['-', '8', '+', '11', '-', '9', '+', '10', '==', '4']
tgt: ['-', '8', '+', '11', '-', '9', '+', '10', '==', '4']

src: ['5', '*', '(', '9', '-', '7', ')', '-', '3', '*', '(', '5', '-', '3', ')', '==', '(', '7', '-', '3', ')']
tgt: ['5', '*', '2', '-', '3', '*', '2', '==', '4']

src: ['-', '4', '-', '(', '9', '-', '2', ')', '+', '(', '-', '4', '+', '9', ')', '*', '3', '==', '(', '8', '-', '4', ')']
tgt: ['-', '4', '-', '7', 

### Test

In [123]:
# tokenize the raw test sources and targets with the white space tokenizer
test_xs, test_ys = map(white_space_tokenizer, (raw_test_xs, raw_test_ys))

In [124]:
# take a look at the last (up to) ten source/target pairs
# slicing instead of indexing with range(-10, 0) keeps this cell from raising
# IndexError when the split holds fewer than ten samples
for src, tgt in zip(test_xs[-10:], test_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: ['-', '10', '-', '6', '+', '(', '9', '-', '2', ')', '+', '(', '9', '+', '2', ')', '==', '(', '-', '6', '+', '8', ')']
tgt: ['-', '10', '-', '6', '+', '7', '+', '11', '==', '2']

src: ['5', '-', '(', '2', '*', '5', ')', '+', '4', '+', '7', '==', '6']
tgt: ['5', '-', '10', '+', '4', '+', '7', '==', '6']

src: ['(', '6', '-', '3', ')', '+', '(', '6', '+', '4', ')', '-', '(', '5', '-', '3', ')', '-', '8', '==', '(', '7', '-', '4', ')']
tgt: ['3', '+', '10', '-', '2', '-', '8', '==', '3']

src: ['-', '(', '3', '*', '3', ')', '-', '(', '-', '4', '+', '8', ')', '+', '9', '+', '(', '2', '+', '8', ')', '==', '6']
tgt: ['-', '9', '-', '4', '+', '9', '+', '10', '==', '6']

src: ['3', '+', '10', '-', '11', '+', '4', '==', '6']
tgt: ['3', '+', '10', '-', '11', '+', '4', '==', '6']

src: ['5', '+', '(', '3', '+', '4', ')', '*', '(', '2', '+', '2', ')', '/', '(', '9', '-', '2', ')', '==', '9']
tgt: ['5', '+', '7', '*', '4', '/', '7', '==', '9']

src: ['6', '/', '6', '*', '5', '+', '5', '==', '10

In [125]:
# combine data sets to a dict
# every split stores its tokenized sources under 'xs' and targets under 'ys'
# NOTE: the train split previously carried only 'ys'; 'xs' is added so that it
# has the same layout as val/test and the tokenized train sources are saved too
# (readers that only use train 'ys' are unaffected by the extra key)
train_dict = {}
train_dict['xs'] = train_xs
train_dict['ys'] = train_ys

val_dict = {}
val_dict['xs'] = val_xs
val_dict['ys'] = val_ys

test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

# all splits, keyed by split name
data_dict = dict()
data_dict['train'] = train_dict
data_dict['val'] = val_dict
data_dict['test'] = test_dict

# token -> index mappings for the source ('src') and target ('tgt') side
vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

In [126]:
# write the data and the vocabularies as json files into the output folder
data_path, vocab_path = (
    os.path.join(outdir, file_name)
    for file_name in ('data.json', 'vocab.json'))

for path, payload in ((data_path, data_dict), (vocab_path, vocab_dict)):
    save_json(path, payload)

# Archive Code

In [33]:
# walk through the first train sample: resolve one parenthesised group per step
raw_train_xs = load_txt(os.path.join(indir, 'train_x.txt'))
x = raw_train_xs[0].split()
y = raw_train_ys[0].split()
print(x)
print(y)
print()

xs = [x.copy()]  # the source after 0, 1, 2, ... editing steps
ys_ = []  # per step: [left position tag, right position tag, replacement value]

# always take the left-most remaining group; everything before it has already
# been resolved, so the value to insert sits at the same index in y
for _ in range(x.count('(')):
    open_idx, close_idx = x.index('('), x.index(')')
    value = y[open_idx]
    ys_.append([
        '<pos_{}>'.format(open_idx),
        '<pos_{}>'.format(close_idx),
        value,
    ])
    x = x[:open_idx] + [value] + x[close_idx+1:]
    xs.append(x)

# the fully resolved sequence is labelled with three '<done>' tokens
ys_.append(['<done>'] * 3)

for x, y_ in zip(xs, ys_):
    print(x)
    print(y_)

['2', '+', '(', '6', '-', '4', ')', '*', '(', '2', '+', '4', ')', '-', '(', '7', '+', '2', ')', '==', '(', '9', '-', '4', ')']
['2', '+', '2', '*', '6', '-', '9', '==', '5']

['2', '+', '(', '6', '-', '4', ')', '*', '(', '2', '+', '4', ')', '-', '(', '7', '+', '2', ')', '==', '(', '9', '-', '4', ')']
['<pos_2>', '<pos_6>', '2']
['2', '+', '2', '*', '(', '2', '+', '4', ')', '-', '(', '7', '+', '2', ')', '==', '(', '9', '-', '4', ')']
['<pos_4>', '<pos_8>', '6']
['2', '+', '2', '*', '6', '-', '(', '7', '+', '2', ')', '==', '(', '9', '-', '4', ')']
['<pos_6>', '<pos_10>', '9']
['2', '+', '2', '*', '6', '-', '9', '==', '(', '9', '-', '4', ')']
['<pos_8>', '<pos_12>', '5']
['2', '+', '2', '*', '6', '-', '9', '==', '5']
['<done>', '<done>', '<done>']


In [91]:
# pick the first editing step (original source and its first target)
x, y_ = xs[0], ys_[0]
for item in (x, y_):
    print(item)

['2', '+', '(', '6', '-', '4', ')', '*', '(', '2', '+', '4', ')', '-', '(', '7', '+', '2', ')', '==', '(', '9', '-', '4', ')']
['<pos_2>', '<pos_6>', '2']


In [92]:
# toy batch: ten copies of the first step plus one row labelled '<done>'
np_xs = np.array([x] * 10 + [xs[0]])
np_ys_ = np.array([y_] * 10 + [ys_[-1]])

print(np_xs)
print()
print(np_ys_)

[['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' '4' ')' '-' '(' '7' '+' '2'
  ')' '==' '(' '9' '-' '4' ')']
 ['2' '+' '(' '6' '-' '4' ')' '*' '(' '2' '+' 

In [93]:
# True for the rows in which no target token equals '<done>'
mask = np.all(np_ys_ != '<done>', axis=-1)
mask

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False])

In [94]:
# boolean indexing: keep only the rows whose mask entry is True
np_ys_[mask]

array([['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2'],
       ['<pos_2>', '<pos_6>', '2']], dtype='<U7')

In [95]:
def parse_pos(pos):
    """Return the integer formed by the digit characters of ``pos``.

    For example ``'<pos_12>'`` gives ``12``. ``int`` raises ``ValueError``
    when ``pos`` contains no digit at all (e.g. ``'<done>'``).
    """
    digits = ''.join(filter(str.isdigit, pos))
    return int(digits)

In [96]:
# element-wise version of parse_pos for arrays of position tags;
# otypes=[int] sets the dtype of the returned array
get_pos = np.vectorize(parse_pos, otypes=[int])

In [97]:
# left position index of every kept row (first target column)
left = get_pos(np_ys_[mask, 0])
left

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [98]:
# right position index of every kept row (second target column)
right = get_pos(np_ys_[mask, 1])
right

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6])

In [103]:
# for every kept row, pick the source token at that row's left position
np_xs[mask, left]

array(['(', '(', '(', '(', '(', '(', '(', '(', '(', '('], dtype='<U2')

In [106]:
# check whether the first target token is a position tag
y_[0].startswith('<pos_')

True

In [107]:
# numeric index carried by the first target token
left_idx = parse_pos(y_[0])

In [112]:
# numeric index carried by the second target token
right_idx = parse_pos(y_[1])

In [109]:
# display the current source token list
x

['2',
 '+',
 '(',
 '6',
 '-',
 '4',
 ')',
 '*',
 '(',
 '2',
 '+',
 '4',
 ')',
 '-',
 '(',
 '7',
 '+',
 '2',
 ')',
 '==',
 '(',
 '9',
 '-',
 '4',
 ')']

In [120]:
# apply one editing step: replace tokens left_idx..right_idx (inclusive)
# with the third target token
x[:left_idx] + [y_[2]] + x[right_idx+1:]

['2',
 '+',
 '2',
 '*',
 '(',
 '2',
 '+',
 '4',
 ')',
 '-',
 '(',
 '7',
 '+',
 '2',
 ')',
 '==',
 '(',
 '9',
 '-',
 '4',
 ')']

In [121]:
# sanity check: assigning to the loop variable does not modify the list
a = [1,2,3]
for i in a:
    i = 0  # only rebinds the name i; the elements of a stay as they are
print(a)

[1, 2, 3]
