# Data Preprocessing

## Recursive Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

## Offline

In [3]:
# experiment configuration: these four values select both the raw-data
# folder that is read and the output folder that is written
method = 'recursion'  # top-level folder name of the processed output
num_size = 10  # presumably the size of the number vocabulary -- confirm against the data generator
seq_len = 5  # part of the data path; the raw samples shown below hold five numbers each
data_size = 10_000  # part of the data path; the splits loaded below sum to this many samples

In [4]:
# folder holding the raw data for the configuration chosen above
indir = os.path.join(
    'raw/aoi',
    f'num_size_{num_size}',
    f'seq_len_{seq_len}',
    f'data_size_{data_size}',
)
indir

'raw/aoi/num_size_10/seq_len_5/data_size_10000'

In [6]:
# folder that receives the processed output for the configuration chosen above
outdir = os.path.join(
    method,
    f'num_size_{num_size}',
    f'seq_len_{seq_len}',
    f'data_size_{data_size}',
)
# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(outdir, exist_ok=True)
outdir

'recursion/num_size_10/seq_len_5/data_size_10000'

In [7]:
# load raw dataset: every split is stored in indir as a pair of text files,
# <split>_x.txt (samples) and <split>_y.txt (labels)
raw_train_xs, raw_train_ys = (
    load_txt(os.path.join(indir, f'train_{part}.txt')) for part in 'xy')
raw_val_xs, raw_val_ys = (
    load_txt(os.path.join(indir, f'val_{part}.txt')) for part in 'xy')
raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, f'test_{part}.txt')) for part in 'xy')

In [8]:
# report how many samples and labels each split contains
print('\n'.join(
    f'{title} {len(split)}'
    for title, split in (
        ('train sample size', raw_train_xs),
        ('train label size', raw_train_ys),
        ('val sample size', raw_val_xs),
        ('val label size', raw_val_ys),
        ('test sample size', raw_test_xs),
        ('test label size', raw_test_ys),
    )
))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [30]:
def get_sequence_pair(y: str, rng=None) -> tuple:
    """Turn one target equation into a (source, step label, target) triple.

    The equation is split on white space.  A random number ``k`` of its
    trailing operator tokens (every token for which ``str.isdigit`` is
    false) is removed to form the source sequence, and the step label names
    the leftmost removed operator, i.e. the next token to put back.

    Args:
        y: White-space separated target sequence, e.g. ``'3 + 8 == 11'``.
        rng: Optional object with a ``choice`` method (for example a seeded
            ``np.random.RandomState``) used to draw ``k``.  Defaults to the
            global ``np.random`` state.

    Returns:
        A tuple ``(x, y_, tokens)``: ``x`` is the source token list, ``y_``
        is ``['<insertion>', position, operator]`` or
        ``['<completion>', '<none>', '<none>']`` when nothing was removed,
        and ``tokens`` is the full tokenized target.
    """
    if rng is None:
        rng = np.random
    # white space tokenization
    tokens = y.split()
    # operator positions in ascending order
    operator_idxes = [i for i, token in enumerate(tokens) if not token.isdigit()]
    # decide how many trailing operators to remove (0 .. all of them)
    num_idxes = rng.choice(range(len(operator_idxes) + 1))
    if num_idxes == 0:
        # nothing removed: the source is an independent copy of the target
        return tokens.copy(), ['<completion>', '<none>', '<none>'], tokens
    # the last num_idxes operators are the ones taken out
    removed = operator_idxes[len(operator_idxes) - num_idxes:]
    # every token left of the first removed operator is kept, so its index
    # in the target is also its insertion position in the source
    first_removed = removed[0]
    y_ = ['<insertion>', str(first_removed), tokens[first_removed]]
    # set membership keeps the filter below linear in the sequence length
    removed_set = set(removed)
    x = [token for i, token in enumerate(tokens) if i not in removed_set]
    return x, y_, tokens

In [12]:
# build one (source, step label, full target) triple per training label and
# regroup the triples into three parallel tuples; get_sequence_pair draws
# from np.random, so the result varies between runs unless a seed is set
train_xs, train_ys_, train_ys = zip(*map(get_sequence_pair, raw_train_ys))

In [13]:
# take a look at the last ten training triples
for i in range(-10, 0):
    print(f'src: {train_xs[i]}\ntgt: {train_ys[i]}\npred: {train_ys_[i]}\n')

src: ['7', '9', '3', '3', '3']
tgt: ['-', '7', '+', '9', '+', '3', '/', '3', '==', '3']
pred: ['<insertion>', '0', '-']

src: ['-', '4', '10', '3', '2', '11']
tgt: ['-', '4', '+', '10', '*', '3', '/', '2', '==', '11']
pred: ['<insertion>', '2', '+']

src: ['10', '*', '3', '-', '11', '-', '9', '==', '10']
tgt: ['10', '*', '3', '-', '11', '-', '9', '==', '10']
pred: ['<completion>', '<none>', '<none>']

src: ['3', '*', '5', '+', '6', '10', '11']
tgt: ['3', '*', '5', '+', '6', '-', '10', '==', '11']
pred: ['<insertion>', '5', '-']

src: ['-', '5', '3', '6', '8', '5']
tgt: ['-', '5', '+', '3', '*', '6', '-', '8', '==', '5']
pred: ['<insertion>', '2', '+']

src: ['3', '3', '4', '5', '10']
tgt: ['3', '+', '3', '*', '4', '-', '5', '==', '10']
pred: ['<insertion>', '1', '+']

src: ['-', '8', '+', '5', '+', '11', '-', '4', '==', '4']
tgt: ['-', '8', '+', '5', '+', '11', '-', '4', '==', '4']
pred: ['<completion>', '<none>', '<none>']

src: ['10', '+', '9', '-', '4', '5', '10']
tgt: ['10', '+', '

In [18]:
# source vocabulary frequency distribution:
# count every token across all training source sequences
counter = Counter(token for tokens in train_xs for token in tokens)

print(counter.most_common())

[('-', 5699), ('+', 4997), ('2', 4916), ('3', 4259), ('4', 3984), ('5', 3615), ('6', 3562), ('8', 3157), ('10', 3104), ('7', 3043), ('9', 2976), ('11', 2384), ('*', 1719), ('/', 1538), ('==', 1323)]


In [19]:
# sort the distinct source tokens (string order, so '10' comes before '2')
# to get a stable order for index assignment
src_vocab_list = sorted(counter)
print(src_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [20]:
# source vocabulary dictionary: token -> integer index
src_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '</s>': 1,  # to mark the end of a sequence
}
# regular tokens take the indexes that follow the special symbols
src_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(src_vocab_list, start=len(src_vocab2idx_dict)))

print(src_vocab2idx_dict)

{'<pad>': 0, '</s>': 1, '*': 2, '+': 3, '-': 4, '/': 5, '10': 6, '11': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, '==': 16}


In [21]:
# target vocabulary frequency distribution:
# count every token across all training step labels
counter = Counter(token for label in train_ys_ for token in label)

print(counter.most_common())

[('<insertion>', 5677), ('<none>', 2646), ('-', 1656), ('+', 1583), ('<completion>', 1323), ('==', 1311), ('1', 926), ('7', 902), ('5', 886), ('3', 865), ('*', 584), ('/', 543), ('4', 441), ('6', 434), ('0', 422), ('8', 409), ('2', 392)]


In [22]:
# sort the distinct target tokens (string order) to get a stable order
# for index assignment
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '<completion>', '<insertion>', '<none>', '==']


In [23]:
# target vocabulary dictionary: token -> integer index
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '</s>': 1,  # to mark the end of a sequence
    '<s>': 2,  # to mark the start of a sequence
}
# regular tokens take the indexes that follow the special symbols
tgt_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(tgt_vocab_list, start=len(tgt_vocab2idx_dict)))

print(tgt_vocab2idx_dict)

{'<pad>': 0, '</s>': 1, '<s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '0': 7, '1': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '<completion>': 16, '<insertion>': 17, '<none>': 18, '==': 19}


### Val

In [24]:
# take a look at the last ten raw validation pairs
for i in range(-10, 0):
    print(f'src: {raw_val_xs[i]}\ntgt: {raw_val_ys[i]}\n')

src: 8 6 4 3 3
tgt: 8 - 6 + 4 - 3 == 3

src: 6 6 9 3 6
tgt: 6 / 6 * 9 - 3 == 6

src: 2 3 6 10 10
tgt: - 2 * 3 + 6 + 10 == 10

src: 11 3 4 2 2
tgt: 11 - 3 - 4 - 2 == 2

src: 6 3 2 4 4
tgt: - 6 + 3 * 2 + 4 == 4

src: 10 11 11 2 8
tgt: 10 * 11 / 11 - 2 == 8

src: 10 2 9 9 8
tgt: 10 - 2 * 9 / 9 == 8

src: 7 7 5 7 3
tgt: 7 / 7 - 5 + 7 == 3

src: 3 8 8 11 8
tgt: - 3 + 8 - 8 + 11 == 8

src: 3 8 2 2 11
tgt: 3 + 8 + 2 - 2 == 11



In [25]:
# white space tokenization of the validation samples, then the labels
val_xs, val_ys = map(white_space_tokenizer, (raw_val_xs, raw_val_ys))

### Test

In [26]:
# take a look at the last ten raw test pairs
for i in range(-10, 0):
    print(f'src: {raw_test_xs[i]}\ntgt: {raw_test_ys[i]}\n')

src: 2 9 3 6 9
tgt: 2 * 9 * 3 / 6 == 9

src: 5 10 10 5 10
tgt: - 5 + 10 + 10 - 5 == 10

src: 7 4 7 11 7
tgt: 7 - 4 - 7 + 11 == 7

src: 5 5 8 7 2
tgt: 5 / 5 + 8 - 7 == 2

src: 2 7 4 6 7
tgt: 2 + 7 + 4 - 6 == 7

src: 4 8 8 6 6
tgt: - 4 + 8 + 8 - 6 == 6

src: 2 4 7 6 3
tgt: - 2 + 4 + 7 - 6 == 3

src: 9 4 3 6 2
tgt: 9 * 4 / 3 / 6 == 2

src: 4 2 7 3 6
tgt: 4 / 2 + 7 - 3 == 6

src: 3 11 6 9 11
tgt: - 3 + 11 - 6 + 9 == 11



In [27]:
# white space tokenization of the test samples, then the labels
test_xs, test_ys = map(white_space_tokenizer, (raw_test_xs, raw_test_ys))

In [28]:
# combine data sets to a dict: one dict per split, keyed by field name
# (only the training split carries the step labels under 'ys_')
train_dict = {'xs': train_xs, 'ys_': train_ys_, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# all splits under one root
data_dict = {'train': train_dict, 'val': val_dict, 'test': test_dict}

# source and target vocabularies side by side
vocab_dict = {'src': src_vocab2idx_dict, 'tgt': tgt_vocab2idx_dict}

In [29]:
# save output as json: data splits and vocabularies go to separate files
# inside outdir
data_path, vocab_path = (
    os.path.join(outdir, filename) for filename in ('data.json', 'vocab.json'))

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

# Archive Code

In [26]:
# # a function to generate a sequence pair
# # given a label sequence
# def get_sequence_pair(y: list) -> list:
#     # white space tokenization
#     y = y.split()
#     x = y.copy()
#     # get operator indexes
#     operator_idxes = list(range(1, len(x), 2))
#     # decide how many operators to remove
#     num_idxes = np.random.choice(range(len(operator_idxes)+1))
#     if num_idxes == 0:
#         return x, ['<completion>', '<none>', '<none>'], y
#     else:
#         # decide operators to remove
#         idxes_to_remove = sorted(np.random.choice(operator_idxes, num_idxes, replace=False))
#         # generat possible ys
#         ys_ = [['<insertion>', str(idxes_to_remove[i]-i), x[idxes_to_remove[i]]] 
#               for i in range(len(idxes_to_remove))]
#         # pick y randomly
#         y_ = ys_[np.random.choice(range(len(ys_)))]
#         # remove operators
#         x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
#         return x, y_, y

In [27]:
# a function to generate a sequence pair
# given a label sequence
# (archived copy: the commented-out lines are earlier variants kept for reference)
def get_sequence_pair(y: str) -> tuple:
    """Build one (sample, step label, full target) triple from a label string.

    The label is split on white space.  A random number of its trailing
    operator tokens (tokens for which ``str.isdigit`` is false) is removed to
    form the sample, and the step label names the leftmost removed operator.

    Args:
        y: White-space separated label sequence.

    Returns:
        ``(x, y_, y)`` where ``x`` is the sample token list, ``y_`` is
        ``['<insertion>', index, operator]`` or
        ``['<completion>', '<none>', '<none>']`` when nothing was removed,
        and ``y`` is the tokenized label.
    """
    # white space tokenization
    y = y.split()
    x = y.copy()
    # get operator indexes, last operator first
#     operator_idxes = np.arange(1, len(x), 2)[::-1]
    operator_idxes = [i for i, token in enumerate(y) if not token.isdigit()][::-1]
    
    # decide how many operators to remove
    num_idxes = np.random.choice(range(len(operator_idxes)+1))
    if num_idxes == 0:
        # nothing removed: the sample already equals the full label
        return x, ['<completion>', '<none>', '<none>'], y
    else:
        # decide operators to remove: the num_idxes operators closest to the end
        idxes_to_remove = operator_idxes[:num_idxes]
#         idxes_to_remove = sorted(np.random.choice(operator_idxes, num_idxes, replace=False))
        # generate possible ys (disabled variant)
#         ys_ = [['<insertion>', str(idxes_to_remove[i]+1-i), x[idxes_to_remove[i]]] 
#               for i in np.arange(len(idxes_to_remove))[::-1]]
#         print(['<insertion>', str(idxes_to_remove[-1]), x[idxes_to_remove[-1]]])
        # label: index and token of the leftmost removed operator
        y_ = ['<insertion>', str(idxes_to_remove[-1]), x[idxes_to_remove[-1]]]
        # pick y randomly (disabled variants)
#         y_ = ys_[0]
#         y_ = ys_[np.random.choice(range(len(ys_)))]
#         remove operators
        x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
#         print(idxes_to_remove)
        return x, y_, y