# Data Preprocessing

## Recursive Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ There is no validation set 
+ Encoder and Decoder have separate embedding layers
+ Train and Test have their own vocab space, although there may be an overlap
+ There are two training methods, namely, online and offline

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

## Offline

In [3]:
# define parameters
# these values are only used below to build the input and output folder paths
method = 'recursion'  # first component of the output path
vocab_size = 10  # fills the 'vocab_size_{}' path component
seq_len = 5  # fills the 'seq_len_{}' path component
data_size = 30000  # fills the 'data_size_{}' path component

In [4]:
# load path
# the raw data lives under raw/vocab_size_*/seq_len_*/data_size_*
indir = os.path.join(
    'raw',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir

'raw/vocab_size_10/seq_len_5/data_size_30000'

In [5]:
# save path
# the processed data goes under <method>/vocab_size_*/seq_len_*/data_size_*
outdir = os.path.join(method, 'vocab_size_{}'.format(vocab_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True creates the folder only when it is missing, without a
# separate existence check that could race with another process
os.makedirs(outdir, exist_ok=True)
outdir

'recursion/vocab_size_10/seq_len_5/data_size_30000'

In [6]:
# load raw dataset
# files are read in the order train_x, train_y, test_x, test_y
raw_train_xs, raw_train_ys, raw_test_xs, raw_test_ys = [
    load_txt(os.path.join(indir, file_name))
    for file_name in ('train_x.txt', 'train_y.txt', 'test_x.txt', 'test_y.txt')
]

In [7]:
# check data size
# print the number of entries in each raw split, one line per split
for label, split in (('train sample size', raw_train_xs),
                     ('train label size', raw_train_ys),
                     ('test sample size', raw_test_xs),
                     ('test label size', raw_test_ys)):
    print(label, len(split))

train sample size 24000
train label size 24000
test sample size 6000
test label size 6000


### Train

In [42]:
# a function to generate a sequence pair
# given a label sequence
def get_sequence_pair(y: str) -> tuple:
    """Build one training example from a whitespace-separated label sequence.

    Tokens at odd positions (1, 3, 5, ...) are treated as operators. A random
    number of them, always counted from the end of the sequence, is removed
    to form the source sequence. The action describes the insertion that puts
    the left-most removed operator back.

    Args:
        y: label sequence as one string, tokens separated by whitespace.

    Returns:
        A tuple ``(x, y_, y)`` where
        ``x`` is the source token list (label tokens minus removed operators),
        ``y_`` is the action ``['<insertion>', position, operator]`` with the
        position given as a string, or ``['<completion>', '<none>', '<none>']``
        when no operator was removed, and
        ``y`` is the full label token list.
    """
    # white space tokenization
    y = y.split()
    x = y.copy()
    # operator indexes, right-most operator first
    operator_idxes = np.arange(1, len(x), 2)[::-1]
    # decide how many operators to remove: anything from none to all of them
    num_idxes = np.random.choice(len(operator_idxes) + 1)
    if num_idxes == 0:
        # nothing removed: the source already equals the label
        return x, ['<completion>', '<none>', '<none>'], y
    # remove the last `num_idxes` operators; a set keeps the membership
    # test below cheap and yields plain Python ints
    idxes_to_remove = set(operator_idxes[:num_idxes].tolist())
    # every token left of the smallest removed index is untouched, so this
    # index is also the insertion position inside the source sequence
    insert_idx = min(idxes_to_remove)
    y_ = ['<insertion>', str(insert_idx), y[insert_idx]]
    # remove operators
    x = [token for i, token in enumerate(y) if i not in idxes_to_remove]
    return x, y_, y

In [43]:
# one (source, action, label) triple per raw label, transposed into three tuples
train_xs, train_ys_, train_ys = zip(*map(get_sequence_pair, raw_train_ys))

In [54]:
# take a look
# show the last ten training examples, a blank line after each
for i in range(-10, 0):
    for tag, sequences in (('src:', train_xs),
                           ('tgt:', train_ys),
                           ('pred:', train_ys_)):
        print(tag, sequences[i])
    print()

src: ['6', '/', '5', '+', '4', '/', '5', '==', '2']
tgt: ['6', '/', '5', '+', '4', '/', '5', '==', '2']
pred: ['<completion>', '<none>', '<none>']

src: ['7', '+', '10', '/', '2', '-', '3', '==', '9']
tgt: ['7', '+', '10', '/', '2', '-', '3', '==', '9']
pred: ['<completion>', '<none>', '<none>']

src: ['8', '-', '7', '+', '10', '-', '6', '==', '5']
tgt: ['8', '-', '7', '+', '10', '-', '6', '==', '5']
pred: ['<completion>', '<none>', '<none>']

src: ['10', '/', '10', '+', '4', '/', '2', '==', '3']
tgt: ['10', '/', '10', '+', '4', '/', '2', '==', '3']
pred: ['<completion>', '<none>', '<none>']

src: ['5', '-', '5', '*', '2', '7', '2']
tgt: ['5', '-', '5', '*', '2', '+', '7', '==', '2']
pred: ['<insertion>', '5', '+']

src: ['4', '8', '10', '2', '7']
tgt: ['4', '/', '8', '*', '10', '+', '2', '==', '7']
pred: ['<insertion>', '1', '/']

src: ['3', '*', '7', '/', '7', '2', '5']
tgt: ['3', '*', '7', '/', '7', '+', '2', '==', '5']
pred: ['<insertion>', '5', '+']

src: ['10', '-', '3', '-', '3'

In [44]:
# source vocabulary frequency distribution
# count every token over all source sequences
counter = Counter(token for tokens in train_xs for token in tokens)

print(counter.most_common())

[('2', 16999), ('-', 15516), ('3', 14744), ('+', 14655), ('4', 13648), ('5', 12187), ('6', 12185), ('8', 11031), ('10', 10233), ('7', 10114), ('9', 10100), ('11', 8759), ('*', 6642), ('/', 6238), ('==', 4806)]


In [45]:
# distinct source tokens in sorted (string) order
src_vocab_list = sorted(counter)
print(src_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [46]:
# source vocabulary dictionary
# reserved tokens take the first indexes:
#   <pad> to pad sequence length
#   <s>   to mark the start of a sequence
#   </s>  to mark the end of a sequence
# the sorted vocabulary tokens follow with consecutive indexes
src_vocab2idx_dict = {
    token: idx
    for idx, token in enumerate(['<pad>', '<s>', '</s>'] + src_vocab_list)
}

print(src_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '11': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '==': 17}


In [47]:
# target vocabulary frequency distribution
# count every token over all action sequences
counter = Counter(token for action in train_ys_ for token in action)

print(counter.most_common())

[('<insertion>', 19194), ('<none>', 9612), ('-', 5321), ('+', 4957), ('3', 4890), ('<completion>', 4806), ('1', 4793), ('5', 4790), ('7', 4721), ('==', 4721), ('/', 2132), ('*', 2063)]


In [48]:
# distinct target tokens in sorted (string) order
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '1', '3', '5', '7', '<completion>', '<insertion>', '<none>', '==']


In [49]:
# target vocabulary dictionary
# reserved tokens take the first indexes:
#   <pad> to pad sequence length
#   <s>   to mark the start of a sequence
#   </s>  to mark the end of a sequence
# the sorted vocabulary tokens follow with consecutive indexes
tgt_vocab2idx_dict = {
    token: idx
    for idx, token in enumerate(['<pad>', '<s>', '</s>'] + tgt_vocab_list)
}

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '1': 7, '3': 8, '5': 9, '7': 10, '<completion>': 11, '<insertion>': 12, '<none>': 13, '==': 14}


### Test

In [50]:
# white space tokenization
# the same tokenizer is applied to the test sources, then to the test labels
test_xs, test_ys = [
    white_space_tokenizer(raw_split)
    for raw_split in (raw_test_xs, raw_test_ys)
]

In [51]:
# take a look
# show the last ten test examples, source line followed by target line
for i in range(-10, 0):
    for tag, sequences in (('src:', test_xs), ('tgt:', test_ys)):
        print(tag, sequences[i])

src: ['5', '6', '6', '6', '11']
tgt: ['5', '*', '6', '/', '6', '+', '6', '==', '11']
src: ['11', '2', '2', '3', '10']
tgt: ['11', '-', '2', '*', '2', '+', '3', '==', '10']
src: ['8', '8', '10', '5', '5']
tgt: ['8', '-', '8', '+', '10', '-', '5', '==', '5']
src: ['6', '2', '6', '4', '5']
tgt: ['6', '/', '2', '+', '6', '-', '4', '==', '5']
src: ['4', '10', '4', '4', '2']
tgt: ['4', '-', '10', '+', '4', '+', '4', '==', '2']
src: ['8', '4', '4', '2', '6']
tgt: ['8', '+', '4', '-', '4', '-', '2', '==', '6']
src: ['3', '2', '6', '3', '8']
tgt: ['3', '*', '2', '+', '6', '/', '3', '==', '8']
src: ['4', '11', '7', '10', '10']
tgt: ['4', '-', '11', '+', '7', '+', '10', '==', '10']
src: ['2', '3', '3', '6', '8']
tgt: ['2', '/', '3', '*', '3', '+', '6', '==', '8']
src: ['5', '4', '8', '4', '3']
tgt: ['5', '-', '4', '/', '8', '*', '4', '==', '3']


In [26]:
# combine data sets to a dict
# train keeps sources, actions and full labels; test keeps sources and labels
train_dict = {'xs': train_xs, 'ys_': train_ys_, 'ys': train_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}
data_dict = {'train': train_dict, 'test': test_dict}

# source and target vocabularies are stored side by side
vocab_dict = {'src': src_vocab2idx_dict, 'tgt': tgt_vocab2idx_dict}

In [27]:
# save output as json
# the data set goes to data.json inside the output folder
data_path = os.path.join(outdir, 'data.json')
save_json(data_path, data_dict)

# the vocabularies go to vocab.json next to it
vocab_path = os.path.join(outdir, 'vocab.json')
save_json(vocab_path, vocab_dict)