# Data Preprocessing

## End2End Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ A validation set is loaded alongside the train and test sets
+ Encoder and Decoder have separate embedding layers
+ Train and Test have their own vocab space, although there may be an overlap

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
# These three values select which dataset variant is processed: they are
# interpolated into both the input and the output directory paths below.
vocab_size = 10  # presumably the number of distinct operand tokens -- TODO confirm
seq_len = 5  # presumably the number of operands per source sequence -- TODO confirm
data_size = 10000  # presumably the total sample count across all splits -- TODO confirm

In [4]:
# load path: raw/vocab_size_<V>/seq_len_<L>/data_size_<N>
indir = os.path.join(
    'raw',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir

'raw/vocab_size_10/seq_len_5/data_size_10000'

In [5]:
# save path: end2end/vocab_size_<V>/seq_len_<L>/data_size_<N>
outdir = os.path.join(
    'end2end',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True creates the directory tree only when it is missing, which
# avoids the separate existence check (that check can race with another
# process creating the same directory).
os.makedirs(outdir, exist_ok=True)
outdir

'end2end/vocab_size_10/seq_len_5/data_size_10000'

In [7]:
# load raw dataset: one source (x) file and one target (y) file per split,
# read from indir in train / val / test order
raw_train_xs, raw_train_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('train_x.txt', 'train_y.txt')
)
raw_val_xs, raw_val_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('val_x.txt', 'val_y.txt')
)
raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('test_x.txt', 'test_y.txt')
)

In [8]:
# check data size: samples and labels of each split should match in length
for size_label, split_rows in (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
):
    print(size_label, len(split_rows))

train sample size 8000
train label size 8000
val sample size 1000
val label size 1000
test sample size 1000
test label size 1000


### Train

In [10]:
# take a look at the last (up to) 10 training pairs
# Slicing instead of indexing with range(-10, 0) means a split with fewer
# than 10 samples prints what it has rather than raising IndexError.
for src, tgt in zip(raw_train_xs[-10:], raw_train_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 9 10 11 3 11
tgt: 9 + 10 - 11 + 3 == 11

src: 11 9 8 3 7
tgt: 11 - 9 + 8 - 3 == 7

src: 5 5 2 2 9
tgt: 5 + 5 - 2 / 2 == 9

src: 7 7 2 8 11
tgt: 7 / 7 + 2 + 8 == 11

src: 10 10 7 10 3
tgt: 10 + 10 - 7 - 10 == 3

src: 6 10 2 10 8
tgt: 6 - 10 + 2 + 10 == 8

src: 11 4 4 4 7
tgt: 11 + 4 - 4 - 4 == 7

src: 8 5 5 3 11
tgt: 8 / 5 * 5 + 3 == 11

src: 6 2 4 2 10
tgt: 6 * 2 - 4 / 2 == 10

src: 2 7 8 8 9
tgt: 2 + 7 * 8 / 8 == 9



In [11]:
# white space tokenization of the training sources and targets
train_xs, train_ys = (
    white_space_tokenizer(raw_train_xs),
    white_space_tokenizer(raw_train_ys),
)

In [12]:
# source vocabulary frequency distribution, counted over every token of
# every tokenized training source sequence
counter = Counter(token for tokens in train_xs for token in tokens)

print(counter.most_common())

[('2', 5775), ('3', 4883), ('4', 4602), ('5', 4043), ('6', 4042), ('8', 3668), ('7', 3362), ('10', 3362), ('9', 3303), ('11', 2960)]


In [13]:
# Tokens are strings, so this is a lexicographic sort: multi-digit tokens
# such as '10' and '11' come before '2'.
src_vocab_list = sorted(counter)
print(src_vocab_list)

['10', '11', '2', '3', '4', '5', '6', '7', '8', '9']


In [14]:
# source vocabulary dictionary: special tokens take the first indices
src_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '</s>': 1,  # to mark the end of a sequence
}

# regular tokens continue numbering right after the special tokens
for idx, token in enumerate(src_vocab_list, start=len(src_vocab2idx_dict)):
    src_vocab2idx_dict[token] = idx

print(src_vocab2idx_dict)

{'<pad>': 0, '</s>': 1, '10': 2, '11': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11}


In [15]:
# target vocabulary frequency distribution, counted over every token of
# every tokenized training target sequence
counter = Counter(token for tokens in train_ys for token in tokens)

print(counter.most_common())

[('-', 8807), ('+', 8099), ('==', 8000), ('2', 5775), ('3', 4883), ('4', 4602), ('5', 4043), ('6', 4042), ('8', 3668), ('*', 3582), ('/', 3512), ('7', 3362), ('10', 3362), ('9', 3303), ('11', 2960)]


In [16]:
# Tokens are strings, so this is a lexicographic sort: operator symbols and
# multi-digit numbers are ordered by character, not by numeric value.
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [17]:
# target vocabulary dictionary: special tokens take the first indices
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}

# regular tokens continue numbering right after the special tokens
for idx, token in enumerate(tgt_vocab_list, start=len(tgt_vocab2idx_dict)):
    tgt_vocab2idx_dict[token] = idx

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '11': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '==': 17}


### Val

In [18]:
# take a look at the last (up to) 10 validation pairs
# Slicing instead of indexing with range(-10, 0) means a split with fewer
# than 10 samples prints what it has rather than raising IndexError.
for src, tgt in zip(raw_val_xs[-10:], raw_val_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 7 7 11 4 7
tgt: 7 / 7 * 11 - 4 == 7

src: 3 7 5 2 11
tgt: 3 * 7 - 5 * 2 == 11

src: 6 2 8 8 8
tgt: 6 + 2 * 8 / 8 == 8

src: 11 7 6 8 4
tgt: 11 + 7 - 6 - 8 == 4

src: 2 7 7 8 10
tgt: 2 - 7 + 7 + 8 == 10

src: 8 3 4 2 7
tgt: 8 - 3 + 4 - 2 == 7

src: 3 11 3 4 2
tgt: 3 + 11 - 3 * 4 == 2

src: 7 2 4 9 4
tgt: 7 + 2 + 4 - 9 == 4

src: 4 2 6 3 5
tgt: 4 - 2 + 6 - 3 == 5

src: 2 9 2 2 7
tgt: 2 * 9 / 2 - 2 == 7



In [19]:
# white space tokenization of the validation sources and targets
val_xs, val_ys = (
    white_space_tokenizer(raw_val_xs),
    white_space_tokenizer(raw_val_ys),
)

### Test

In [21]:
# take a look at the last (up to) 10 test pairs
# Slicing instead of indexing with range(-10, 0) means a split with fewer
# than 10 samples prints what it has rather than raising IndexError.
for src, tgt in zip(raw_test_xs[-10:], raw_test_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 6 2 2 4 2
tgt: 6 - 2 / 2 * 4 == 2

src: 2 6 4 2 5
tgt: 2 * 6 / 4 + 2 == 5

src: 8 2 4 3 4
tgt: 8 * 2 - 4 * 3 == 4

src: 4 2 3 4 3
tgt: 4 - 2 - 3 + 4 == 3

src: 7 5 2 7 11
tgt: 7 - 5 + 2 + 7 == 11

src: 4 2 6 2 6
tgt: 4 / 2 + 6 - 2 == 6

src: 11 4 11 11 8
tgt: 11 - 4 + 11 / 11 == 8

src: 9 2 7 7 11
tgt: 9 + 2 + 7 - 7 == 11

src: 4 4 9 11 2
tgt: 4 - 4 - 9 + 11 == 2

src: 6 2 6 11 9
tgt: 6 - 2 - 6 + 11 == 9



In [22]:
# white space tokenization of the test sources and targets
test_xs, test_ys = (
    white_space_tokenizer(raw_test_xs),
    white_space_tokenizer(raw_test_ys),
)

In [23]:
# combine data sets to a dict: split name -> {'xs': sources, 'ys': targets}
train_dict = {'xs': train_xs, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# source and target sides keep separate token -> index mappings
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [24]:
# save output as json: the tokenized splits and the vocabularies go to two
# separate files under outdir
data_path, vocab_path = (
    os.path.join(outdir, fname) for fname in ('data.json', 'vocab.json')
)

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)