# Data Preprocessing

## End2End Moldes
+ Naive RNN

## Notes:
+ Both the encoder and decoder share the same embedding layer
+ Both the train_valid and test share the same vocab

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
import os
import numpy as np
from collections import Counter
from utils import load_txt, save_json, white_space_tokenizer, vocab_to_index

In [3]:
# define parameters
vocab_size = 10
seq_len = 5
data_size = 10000

In [4]:
# load path
indir = 'raw'
indir = os.path.join(indir, 'vocab_size_{}'.format(vocab_size), 
                     'seq_len_{}'.format(seq_len), 
                     'data_size_{}'.format(data_size))
indir

'raw/vocab_size_10/seq_len_5/data_size_10000'

In [5]:
# save path
outdir = 'end2end'

outdir = os.path.join(outdir, 'vocab_size_{}'.format(vocab_size), 
                      'seq_len_{}'.format(seq_len), 
                      'data_size_{}'.format(data_size))
if not os.path.exists(outdir): 
    os.makedirs(outdir)
outdir

'end2end/vocab_size_10/seq_len_5/data_size_10000'

In [6]:
# load raw dataset
raw_xs = load_txt(os.path.join(indir, 'x.txt'))
raw_ys = load_txt(os.path.join(indir, 'y.txt'))

In [7]:
# check data size
print('sample size', len(raw_xs))
print('label size', len(raw_ys))

sample size 10000
label size 10000


In [8]:
# check duplicates
dataset = [(src, tgt) for src, tgt in zip(raw_xs, raw_ys)]
dataset = np.array(list(set(dataset)))
print(dataset.shape)

(10000, 2)


In [9]:
# take a look
for i in range(-10, 0, 1):
    print('input:', dataset[i, 0])
    print('output:', dataset[i, 1])
    print()

input: 8 1 3 0 5
output: 8 / 1 - 3 - 0 == 5

input: 2 7 2 6 2
output: 2 * 7 - 2 * 6 == 2

input: 7 9 1 9 6
output: 7 - 9 - 1 + 9 == 6

input: 7 9 6 8 2
output: 7 + 9 - 6 - 8 == 2

input: 0 8 1 6 7
output: 0 / 8 + 1 + 6 == 7

input: 4 7 7 0 0
output: 4 / 7 * 7 * 0 == 0

input: 7 1 1 1 7
output: 7 / 1 * 1 * 1 == 7

input: 0 7 6 1 1
output: 0 * 7 / 6 + 1 == 1

input: 8 1 0 9 0
output: 8 + 1 - 0 - 9 == 0

input: 0 5 1 1 1
output: 0 * 5 / 1 + 1 == 1



In [10]:
# white space tokenization
dataset = np.array(dataset)
xs = dataset[:, 0]
ys = dataset[:, 1]
tk_xs = white_space_tokenizer(xs)
tk_ys = white_space_tokenizer(ys)

In [11]:
# vocabulary frequency distribution
counter = Counter()
for x in tk_xs:
    counter.update(x)

for y in tk_ys:
    counter.update(y)

print(counter.most_common())

[('0', 17688), ('1', 11854), ('2', 10686), ('==', 10000), ('3', 9684), ('4', 9552), ('6', 8860), ('+', 8290), ('5', 8270), ('*', 8057), ('7', 7994), ('8', 7866), ('-', 7808), ('9', 7546), ('/', 5845)]


In [12]:
vocab_list = sorted(counter.keys())
print(vocab_list)

['*', '+', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [13]:
# vocabulary dictionary
vocab2idx_dict = dict()
vocab2idx_dict['<pad>'] = 0 # to pad sequence length
vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence
vocab2idx_dict['</s>'] = 2 # to mark the end of a sequence
vocab2idx_dict['<unk>'] = 3 # to represent the unknow word

i = len(vocab2idx_dict)
for token in vocab_list:
    vocab2idx_dict[token] = i
    i += 1

print(vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '*': 4, '+': 5, '-': 6, '/': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, '==': 18}


In [14]:
# convert vocabulary to index
xs = vocab_to_index(tk_xs, vocab2idx_dict)
ys = vocab_to_index(tk_ys, vocab2idx_dict)

In [15]:
# take a look
for i in range(-10, 0, 1):
    print('input:', xs[i], tk_xs[i])
    print('output:', ys[i], tk_ys[i])
    print()

input: [16, 9, 11, 8, 13] ['8', '1', '3', '0', '5']
output: [16, 7, 9, 6, 11, 6, 8, 18, 13] ['8', '/', '1', '-', '3', '-', '0', '==', '5']

input: [10, 15, 10, 14, 10] ['2', '7', '2', '6', '2']
output: [10, 4, 15, 6, 10, 4, 14, 18, 10] ['2', '*', '7', '-', '2', '*', '6', '==', '2']

input: [15, 17, 9, 17, 14] ['7', '9', '1', '9', '6']
output: [15, 6, 17, 6, 9, 5, 17, 18, 14] ['7', '-', '9', '-', '1', '+', '9', '==', '6']

input: [15, 17, 14, 16, 10] ['7', '9', '6', '8', '2']
output: [15, 5, 17, 6, 14, 6, 16, 18, 10] ['7', '+', '9', '-', '6', '-', '8', '==', '2']

input: [8, 16, 9, 14, 15] ['0', '8', '1', '6', '7']
output: [8, 7, 16, 5, 9, 5, 14, 18, 15] ['0', '/', '8', '+', '1', '+', '6', '==', '7']

input: [12, 15, 15, 8, 8] ['4', '7', '7', '0', '0']
output: [12, 7, 15, 4, 15, 4, 8, 18, 8] ['4', '/', '7', '*', '7', '*', '0', '==', '0']

input: [15, 9, 9, 9, 15] ['7', '1', '1', '1', '7']
output: [15, 7, 9, 4, 9, 4, 9, 18, 15] ['7', '/', '1', '*', '1', '*', '1', '==', '7']

input: [8, 1

In [16]:
# train test split
dataset = np.array([(x, y) for x, y in zip(xs, ys)])
data_size = dataset.shape[0]
indices = np.random.permutation(data_size)
train_size = int(0.7*data_size)
valid_size = int(0.15*data_size)
test_size = int(0.15*data_size)
train_idx = indices[:train_size]
valid_idx = indices[train_size:train_size+valid_size]
test_idx = indices[train_size+valid_size:]
train_set = dataset[train_idx, :]
valid_set = dataset[valid_idx, :]
test_set = dataset[test_idx, :]
print('train size', train_size, train_set.shape[0])
print('valid size', valid_size, valid_set.shape[0])
print('test size', test_size, test_set.shape[0])

train size 7000 7000
valid size 1500 1500
test size 1500 1500


In [17]:
# combine data sets to a dict
train_dict = {}
train_dict['xs'] = train_set[:, 0].tolist()
train_dict['ys'] = train_set[:, 1].tolist()

valid_dict = {}
valid_dict['xs'] = valid_set[:, 0].tolist()
valid_dict['ys'] = valid_set[:, 1].tolist()

test_dict = {}
test_dict['xs'] = test_set[:, 0].tolist()
test_dict['ys'] = test_set[:, 1].tolist()


data_dict = dict()
data_dict['train'] = train_dict
data_dict['valid'] = valid_dict
data_dict['test'] = test_dict

In [18]:
# save output as json
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

save_json(data_path, data_dict)
save_json(vocab_path, vocab2idx_dict)