# Data Preprocessing

## End2End Models
+ Naive GRU RNN

## Notes:
+ There is no validation set 
+ Encoder and Decoder have separate embedding layers
+ Train and Test have their own vocab spaces, although there may be an overlap

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
import os
import numpy as np
from collections import Counter
from utils import load_txt, save_json, white_space_tokenizer, vocab_to_index

In [3]:
# define parameters
# these three values pick the raw/<...> input folder and the end2end/<...> output folder
vocab_size, seq_len, data_size = 10, 5, 10000

In [4]:
# load path: raw/vocab_size_<V>/seq_len_<L>/data_size_<N>
indir = os.path.join(
    'raw',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# bare expression so the notebook displays the resolved path
indir

'raw/vocab_size_10/seq_len_5/data_size_10000'

In [5]:
# save path: end2end/vocab_size_<V>/seq_len_<L>/data_size_<N>
outdir = os.path.join(
    'end2end',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True creates the folder (and any missing parents) only when needed,
# without the check-then-create race of a separate os.path.exists() test
os.makedirs(outdir, exist_ok=True)
# bare expression so the notebook displays the resolved path
outdir

'end2end/vocab_size_10/seq_len_5/data_size_10000'

In [6]:
# load raw dataset
# the four files are read from indir in this order: train inputs, train labels,
# test inputs, test labels
raw_train_xs, raw_train_ys, raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, file_name))
    for file_name in ('train_x.txt', 'train_y.txt', 'test_x.txt', 'test_y.txt')
)

In [7]:
# check data size: samples and labels of each split should have matching counts
for label, rows in (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
):
    print(label, len(rows))

train sample size 8000
train label size 8000
test sample size 2000
test label size 2000


### Train

In [8]:
# take a look at the last 10 raw training pairs
# slices are used instead of fixed negative indices so that a split with fewer
# than 10 samples is simply shown in full rather than raising IndexError
# NOTE(review): assumes load_txt returns a sliceable sequence (e.g. a list) - confirm
for src, tgt in zip(raw_train_xs[-10:], raw_train_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 5 7 4 7 7
tgt: 5 * 7 - 4 * 7 == 7

src: 10 5 10 10 5
tgt: 10 - 5 / 10 * 10 == 5

src: 8 4 8 3 8
tgt: 8 * 4 - 8 * 3 == 8

src: 3 2 3 4 2
tgt: 3 - 2 - 3 + 4 == 2

src: 8 5 7 3 7
tgt: 8 - 5 + 7 - 3 == 7

src: 10 4 8 3 3
tgt: 10 + 4 - 8 - 3 == 3

src: 2 3 6 7 6
tgt: 2 + 3 - 6 + 7 == 6

src: 6 6 2 10 4
tgt: 6 - 6 * 2 + 10 == 4

src: 4 5 10 7 2
tgt: 4 - 5 + 10 - 7 == 2

src: 9 3 3 5 10
tgt: 9 + 3 + 3 - 5 == 10



In [9]:
# white space tokenization of the training inputs and labels
train_xs, train_ys = (
    white_space_tokenizer(raw_train_xs),
    white_space_tokenizer(raw_train_ys),
)

In [10]:
# vocabulary frequency distribution
# count every token of every training input sequence
counter = Counter(token for tokens in train_xs for token in tokens)

print(counter.most_common())

[('2', 5724), ('3', 4817), ('4', 4579), ('6', 4093), ('5', 4029), ('8', 3697), ('10', 3451), ('9', 3370), ('7', 3350), ('11', 2890)]


In [11]:
# source vocabulary; tokens are strings, so the order is lexicographic ('10' sorts before '2')
src_vocab_list = sorted(counter)
print(src_vocab_list)

['10', '11', '2', '3', '4', '5', '6', '7', '8', '9']


In [12]:
# source vocabulary dictionary
# reserved ids: '<pad>' pads sequence length, '<s>' marks the start of a
# sequence, '</s>' marks the end of a sequence
src_vocab2idx_dict = {'<pad>': 0, '<s>': 1, '</s>': 2}

# regular tokens get consecutive ids right after the reserved ones
first_token_id = len(src_vocab2idx_dict)
for token_id, token in enumerate(src_vocab_list, start=first_token_id):
    src_vocab2idx_dict[token] = token_id

print(src_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '10': 3, '11': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12}


In [13]:
# target vocabulary frequency distribution
# count every token of every training label sequence (rebinds `counter`)
counter = Counter(token for tokens in train_ys for token in tokens)

print(counter.most_common())

[('-', 8810), ('+', 8164), ('==', 8000), ('2', 5724), ('3', 4817), ('4', 4579), ('6', 4093), ('5', 4029), ('8', 3697), ('/', 3527), ('*', 3499), ('10', 3451), ('9', 3370), ('7', 3350), ('11', 2890)]


In [14]:
# target vocabulary; tokens are strings, so the order is lexicographic
# (operators first, then '10', '11', '2', ...)
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [15]:
# target vocabulary dictionary
# reserved ids: '<pad>' pads sequence length, '<s>' marks the start of a
# sequence, '</s>' marks the end of a sequence
tgt_vocab2idx_dict = {'<pad>': 0, '<s>': 1, '</s>': 2}

# regular tokens get consecutive ids right after the reserved ones
first_token_id = len(tgt_vocab2idx_dict)
for token_id, token in enumerate(tgt_vocab_list, start=first_token_id):
    tgt_vocab2idx_dict[token] = token_id

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '11': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '==': 17}


In [16]:
# convert vocabulary to index
# inputs use the source vocabulary, labels use the target vocabulary
train_xs, train_ys = (
    vocab_to_index(train_xs, src_vocab2idx_dict),
    vocab_to_index(train_ys, tgt_vocab2idx_dict),
)

### Test

In [17]:
# white space tokenization of the test inputs and labels
test_xs, test_ys = (
    white_space_tokenizer(raw_test_xs),
    white_space_tokenizer(raw_test_ys),
)

In [18]:
# take a look at the last 10 tokenized test pairs
# slices are used instead of fixed negative indices so that a split with fewer
# than 10 samples is simply shown in full rather than raising IndexError
for src, tgt in zip(test_xs[-10:], test_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)

src: ['4', '5', '2', '5', '6']
tgt: ['4', '+', '5', '+', '2', '-', '5', '==', '6']
src: ['11', '11', '11', '3', '8']
tgt: ['11', '+', '11', '-', '11', '-', '3', '==', '8']
src: ['6', '2', '4', '2', '2']
tgt: ['6', '-', '2', '*', '4', '/', '2', '==', '2']
src: ['9', '8', '7', '7', '2']
tgt: ['9', '-', '8', '+', '7', '/', '7', '==', '2']
src: ['8', '9', '2', '2', '3']
tgt: ['8', '-', '9', '+', '2', '*', '2', '==', '3']
src: ['6', '2', '7', '10', '5']
tgt: ['6', '+', '2', '+', '7', '-', '10', '==', '5']
src: ['8', '10', '7', '9', '2']
tgt: ['8', '+', '10', '-', '7', '-', '9', '==', '2']
src: ['10', '6', '11', '2', '3']
tgt: ['10', '+', '6', '-', '11', '-', '2', '==', '3']
src: ['10', '4', '11', '3', '7']
tgt: ['10', '*', '4', '-', '11', '*', '3', '==', '7']
src: ['9', '3', '4', '5', '7']
tgt: ['9', '*', '3', '-', '4', '*', '5', '==', '7']


In [19]:
# convert vocabulary to index
# NOTE(review): both vocabularies were built from the training split only; how
# vocab_to_index treats a test token missing from them is not visible here - confirm
test_xs, test_ys = (
    vocab_to_index(test_xs, src_vocab2idx_dict),
    vocab_to_index(test_ys, tgt_vocab2idx_dict),
)

In [20]:
# combine data sets to a dict
# each split keeps its indexed inputs under 'xs' and its indexed labels under 'ys'
train_dict = {'xs': train_xs, 'ys': train_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# data_dict groups the two splits; vocab_dict groups the two token-to-index maps
data_dict = {'train': train_dict, 'test': test_dict}
vocab_dict = {'src': src_vocab2idx_dict, 'tgt': tgt_vocab2idx_dict}

In [21]:
# save output as json
# indexed train/test data goes to <outdir>/data.json
data_path = os.path.join(outdir, 'data.json')
save_json(data_path, data_dict)

# source/target vocabularies go to <outdir>/vocab.json
vocab_path = os.path.join(outdir, 'vocab.json')
save_json(vocab_path, vocab_dict)