# Data Preprocessing

## End2End Models
+ Naive RNN

## Notes:
+ Both the encoder and decoder share the same embedding layer
+ Both the train_valid and test share the same vocab

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
import os
import numpy as np
from collections import Counter
from utils import load_txt, save_json, white_space_tokenizer, vocab_to_index

In [3]:
# dataset parameters; they are interpolated into the directory names
# used for the raw input and the preprocessed output
data_size = 10 ** 5
seq_len = 10
vocab_size = 10

In [4]:
# load path: raw/vocab_size_<vocab_size>/seq_len_<seq_len>/data_size_<data_size>
indir = os.path.join(
    'raw',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir

'raw/vocab_size_10/seq_len_10/data_size_100000'

In [5]:
# save path: end2end/vocab_size_<vocab_size>/seq_len_<seq_len>/data_size_<data_size>
outdir = os.path.join(
    'end2end',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True removes the check-then-create race of a separate
# os.path.exists() test and makes re-running this cell a no-op
os.makedirs(outdir, exist_ok=True)
outdir

'end2end/vocab_size_10/seq_len_10/data_size_100000'

In [6]:
# read the raw inputs (x.txt) and outputs (y.txt) from the input directory
x_path = os.path.join(indir, 'x.txt')
y_path = os.path.join(indir, 'y.txt')
raw_xs = load_txt(x_path)
raw_ys = load_txt(y_path)

In [7]:
# report how many inputs and outputs were loaded; the two should match
for label, raw in (('sample size', raw_xs), ('label size', raw_ys)):
    print(label, len(raw))

sample size 100000
label size 100000


In [8]:
# drop duplicate (input, output) pairs
# The first occurrence of each pair is kept in its original order, so the
# resulting array is identical from run to run. Going through set() instead
# yields an order that depends on string hash randomization.
seen_pairs = set()
unique_pairs = []
for pair in zip(raw_xs, raw_ys):
    if pair not in seen_pairs:
        seen_pairs.add(pair)
        unique_pairs.append(pair)
dataset = np.array(unique_pairs)
print(dataset.shape)

(100000, 2)


In [9]:
# take a look at the last (up to) 10 pairs
# slicing the tail, rather than indexing -10..-1, avoids an IndexError
# when the dataset holds fewer than 10 rows
for src, tgt in dataset[-10:]:
    print('input:', src)
    print('output:', tgt)
    print()

input: 1 8 5 3 3 1 3 9 6 7
output: 1 + 8 - 5 / 3 * 3 - 1 / 3 * 9 + 6 == 7

input: 4 6 6 9 3 1 4 0 6 3
output: 4 - 6 / 6 - 9 * 3 * 1 * 4 * 0 * 6 == 3

input: 4 1 6 0 8 0 1 4 8 4
output: 4 / 1 / 6 * 0 / 8 + 0 * 1 - 4 + 8 == 4

input: 4 9 0 9 2 2 3 9 3 8
output: 4 + 9 + 0 - 9 + 2 + 2 - 3 + 9 / 3 == 8

input: 5 6 3 9 3 7 0 8 0 4
output: 5 + 6 - 3 + 9 / 3 - 7 - 0 * 8 * 0 == 4

input: 9 0 4 1 4 2 6 9 2 7
output: 9 + 0 + 4 + 1 * 4 / 2 * 6 - 9 * 2 == 7

input: 1 4 6 3 4 1 6 3 1 0
output: 1 * 4 * 6 * 3 - 4 / 1 * 6 * 3 * 1 == 0

input: 2 7 0 5 6 2 0 3 1 4
output: 2 + 7 - 0 * 5 - 6 / 2 + 0 - 3 + 1 == 4

input: 3 9 9 1 4 3 4 4 5 8
output: 3 / 9 * 9 - 1 + 4 - 3 - 4 + 4 + 5 == 8

input: 3 3 5 8 7 4 8 0 7 4
output: 3 * 3 - 5 - 8 / 7 / 4 * 8 * 0 * 7 == 4



In [10]:
# white space tokenization of the inputs (column 0) and outputs (column 1)
xs, ys = dataset[:, 0], dataset[:, 1]
tk_xs, tk_ys = white_space_tokenizer(xs), white_space_tokenizer(ys)

In [11]:
# vocabulary frequency distribution, counted over the tokenized inputs
# first and the tokenized outputs second
counter = Counter()
for tokenized in (tk_xs, tk_ys):
    for tokens in tokenized:
        counter.update(tokens)

print(counter.most_common())

[('0', 295062), ('1', 230154), ('*', 229541), ('2', 213860), ('+', 208073), ('-', 203841), ('3', 201328), ('4', 194710), ('6', 184504), ('5', 176640), ('8', 173162), ('7', 166410), ('9', 164170), ('/', 158545), ('==', 100000)]


In [12]:
# distinct tokens in sorted order; this order fixes the index each token
# receives when the vocabulary dictionary is built
vocab_list = sorted(counter)
print(vocab_list)

['*', '+', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [13]:
# vocabulary dictionary
# special tokens take the first four indices, so regular tokens start at 4
vocab2idx_dict = dict()
vocab2idx_dict['<pad>'] = 0 # to pad sequence length
vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence
vocab2idx_dict['</s>'] = 2 # to mark the end of a sequence
# placeholder for unknown tokens; it was missing from the code although the
# recorded output of this cell lists it at index 3, and without it every
# regular token index is shifted down by one
vocab2idx_dict['<unk>'] = 3

# regular tokens are numbered consecutively after the special tokens
for i, token in enumerate(vocab_list, start=len(vocab2idx_dict)):
    vocab2idx_dict[token] = i

print(vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '*': 4, '+': 5, '-': 6, '/': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, '==': 18}


In [14]:
# convert the tokenized inputs and outputs to vocabulary indices
xs, ys = [vocab_to_index(tokenized, vocab2idx_dict)
          for tokenized in (tk_xs, tk_ys)]

In [15]:
# print the last 10 samples, each index sequence next to its tokens
for offset in range(10, 0, -1):
    print('input:', xs[-offset], tk_xs[-offset])
    print('output:', ys[-offset], tk_ys[-offset])
    print()

input: [9, 16, 13, 11, 11, 9, 11, 17, 14, 15] ['1', '8', '5', '3', '3', '1', '3', '9', '6', '7']
output: [9, 5, 16, 6, 13, 7, 11, 4, 11, 6, 9, 7, 11, 4, 17, 5, 14, 18, 15] ['1', '+', '8', '-', '5', '/', '3', '*', '3', '-', '1', '/', '3', '*', '9', '+', '6', '==', '7']

input: [12, 14, 14, 17, 11, 9, 12, 8, 14, 11] ['4', '6', '6', '9', '3', '1', '4', '0', '6', '3']
output: [12, 6, 14, 7, 14, 6, 17, 4, 11, 4, 9, 4, 12, 4, 8, 4, 14, 18, 11] ['4', '-', '6', '/', '6', '-', '9', '*', '3', '*', '1', '*', '4', '*', '0', '*', '6', '==', '3']

input: [12, 9, 14, 8, 16, 8, 9, 12, 16, 12] ['4', '1', '6', '0', '8', '0', '1', '4', '8', '4']
output: [12, 7, 9, 7, 14, 4, 8, 7, 16, 5, 8, 4, 9, 6, 12, 5, 16, 18, 12] ['4', '/', '1', '/', '6', '*', '0', '/', '8', '+', '0', '*', '1', '-', '4', '+', '8', '==', '4']

input: [12, 17, 8, 17, 10, 10, 11, 17, 11, 16] ['4', '9', '0', '9', '2', '2', '3', '9', '3', '8']
output: [12, 5, 17, 5, 8, 6, 17, 5, 10, 5, 10, 6, 11, 5, 17, 7, 11, 18, 16] ['4', '+', '9', '+',

In [16]:
# train test split (80% / 20%)
# dtype=object stores one (x, y) pair of index sequences per row. The recorded
# samples have x and y of different lengths, and NumPy >= 1.24 raises
# ValueError when asked to infer an array from such ragged rows.
dataset = np.array([(x, y) for x, y in zip(xs, ys)], dtype=object)
data_size = dataset.shape[0]
# fixed seed so that the same samples land in the same split on every run
split_seed = 0
indices = np.random.RandomState(split_seed).permutation(data_size)
train_size = int(0.8 * data_size)
# the test set is everything not used for training, so its size is derived
# from train_size instead of being rounded separately
test_size = data_size - train_size
train_idx = indices[:train_size]
test_idx = indices[train_size:]
train_set = dataset[train_idx, :]
test_set = dataset[test_idx, :]
print('train size', train_size, train_set.shape[0])
print('test size', test_size, test_set.shape[0])

train size 80000 80000
test size 20000 20000


In [17]:
# gather both splits into one nested dict:
# data_dict[split]['xs' | 'ys'], with split being 'train' or 'test'
train_dict = {
    'xs': train_set[:, 0].tolist(),
    'ys': train_set[:, 1].tolist(),
}
test_dict = {
    'xs': test_set[:, 0].tolist(),
    'ys': test_set[:, 1].tolist(),
}
data_dict = {'train': train_dict, 'test': test_dict}

In [18]:
# write the dataset and the vocabulary as json files into the output directory
data_path, vocab_path = [os.path.join(outdir, name)
                         for name in ('data.json', 'vocab.json')]

for path, payload in ((data_path, data_dict), (vocab_path, vocab2idx_dict)):
    save_json(path, payload)