# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [3]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
method = 'e2e'
num_size = 10
seq_len = 5
data_size = 10000

In [4]:
# load path
data_src = 'aor'
indir = os.path.join(data_src, 
                     'num_size_{}'.format(num_size), 
                     'seq_len_{}'.format(seq_len), 
                     'data_size_{}'.format(data_size))
indir

'aor/num_size_10/seq_len_5/data_size_10000'

In [5]:
# save path
outdir = os.path.join(method, 
                      'num_size_{}'.format(num_size), 
                      'seq_len_{}'.format(seq_len), 
                      'data_size_{}'.format(data_size))
if not os.path.exists(outdir): 
    os.makedirs(outdir)
outdir

'e2e/num_size_10/seq_len_5/data_size_10000'

In [6]:
# load raw dataset
raw_train_xs = load_txt(os.path.join(indir, 'train_x.txt'))
raw_train_ys = load_txt(os.path.join(indir, 'train_y.txt'))
raw_val_xs = load_txt(os.path.join(indir, 'val_x.txt'))
raw_val_ys = load_txt(os.path.join(indir, 'val_y.txt'))
raw_test_xs = load_txt(os.path.join(indir, 'test_x.txt'))
raw_test_ys = load_txt(os.path.join(indir, 'test_y.txt'))

In [7]:
# check data size
print('train sample size', len(raw_train_xs))
print('train label size', len(raw_train_ys))
print('val sample size', len(raw_val_xs))
print('val label size', len(raw_val_ys))
print('test sample size', len(raw_test_xs))
print('test label size', len(raw_test_ys))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [8]:
# take a look
for i in range(-10, 0, 1):
    print('src:', raw_train_xs[i])
    print('tgt:', raw_train_ys[i])
    print()

src: 7 2 3 3 3
tgt: 7 + 2 - 3 - 3 == 3

src: 2 2 3 2 10
tgt: 2 + 2 * 3 + 2 == 10

src: 9 3 8 6 10
tgt: 9 + 3 - 8 + 6 == 10

src: 6 2 8 5 5
tgt: - 6 - 2 + 8 + 5 == 5

src: 4 2 9 6 7
tgt: 4 + 2 * 9 / 6 == 7

src: 11 8 9 5 5
tgt: 11 + 8 - 9 - 5 == 5

src: 2 2 8 2 9
tgt: - 2 / 2 + 8 + 2 == 9

src: 9 10 5 10 3
tgt: - 9 + 10 / 5 + 10 == 3

src: 9 4 6 7 4
tgt: 9 - 4 + 6 - 7 == 4

src: 4 4 9 6 2
tgt: - 4 / 4 + 9 - 6 == 2



In [9]:
# white space tokenization
train_xs = white_space_tokenizer(raw_train_xs)
train_ys = white_space_tokenizer(raw_train_ys)

In [10]:
# vocabulary frequency distribution
counter = Counter()
for x in train_xs:
    counter.update(x)
    
print(counter.most_common())

[('2', 4932), ('3', 4266), ('4', 4046), ('6', 3582), ('5', 3516), ('8', 3283), ('7', 3046), ('10', 3024), ('9', 2922), ('11', 2383)]


In [11]:
src_vocab_list = sorted(counter.keys())
print(src_vocab_list)

['10', '11', '2', '3', '4', '5', '6', '7', '8', '9']


In [12]:
# soruce vocabulary dictionary
src_vocab2idx_dict = dict()
src_vocab2idx_dict['<pad>'] = 0 # to pad sequence length
# for online training
src_vocab2idx_dict['+'] = 1
src_vocab2idx_dict['-'] = 2
src_vocab2idx_dict['*'] = 3
src_vocab2idx_dict['/'] = 4
src_vocab2idx_dict['=='] = 5

i = len(src_vocab2idx_dict)
for token in src_vocab_list:
    src_vocab2idx_dict[token] = i
    i += 1

print(src_vocab2idx_dict)

{'<pad>': 0, '+': 1, '-': 2, '*': 3, '/': 4, '==': 5, '10': 6, '11': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15}


In [13]:
# target vocabulary frequency distribution
counter = Counter()
for y in train_ys:
    counter.update(y)

print(counter.most_common())

[('+', 8967), ('-', 8782), ('==', 7000), ('2', 4932), ('3', 4266), ('4', 4046), ('6', 3582), ('5', 3516), ('8', 3283), ('*', 3062), ('7', 3046), ('10', 3024), ('9', 2922), ('/', 2709), ('11', 2383)]


In [14]:
tgt_vocab_list = sorted(counter.keys())
print(tgt_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [15]:
# target vocabulary dictionary
tgt_vocab2idx_dict = dict()
tgt_vocab2idx_dict['<pad>'] = 0 # to pad sequence length
tgt_vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence
tgt_vocab2idx_dict['</s>'] = 2 # to mark the end of a sequence

i = len(tgt_vocab2idx_dict)
for token in tgt_vocab_list:
    tgt_vocab2idx_dict[token] = i
    i += 1

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '11': 8, '2': 9, '3': 10, '4': 11, '5': 12, '6': 13, '7': 14, '8': 15, '9': 16, '==': 17}


### Val

In [16]:
# take a look
for i in range(-10, 0, 1):
    print('src:', raw_val_xs[i])
    print('tgt:', raw_val_ys[i])
    print()

src: 5 5 6 6 11
tgt: - 5 / 5 + 6 + 6 == 11

src: 9 5 5 2 11
tgt: 9 + 5 / 5 * 2 == 11

src: 9 4 3 6 11
tgt: 9 + 4 * 3 / 6 == 11

src: 6 8 4 7 5
tgt: - 6 + 8 - 4 + 7 == 5

src: 8 2 4 10 3
tgt: 8 - 2 / 4 * 10 == 3

src: 5 4 8 3 4
tgt: - 5 + 4 + 8 - 3 == 4

src: 4 4 10 11 7
tgt: 4 + 4 + 10 - 11 == 7

src: 9 8 2 9 6
tgt: 9 + 8 - 2 - 9 == 6

src: 4 6 7 8 9
tgt: 4 + 6 + 7 - 8 == 9

src: 5 5 10 6 9
tgt: 5 * 5 - 10 - 6 == 9



In [17]:
# white space tokenization
val_xs = white_space_tokenizer(raw_val_xs)
val_ys = white_space_tokenizer(raw_val_ys)

### Test

In [18]:
# take a look
for i in range(-10, 0, 1):
    print('src:', raw_test_xs[i])
    print('tgt:', raw_test_ys[i])
    print()

src: 5 10 5 3 10
tgt: 5 + 10 / 5 + 3 == 10

src: 7 6 9 6 11
tgt: 7 + 6 / 9 * 6 == 11

src: 10 10 10 4 7
tgt: 10 + 10 / 10 - 4 == 7

src: 4 2 2 4 4
tgt: 4 + 2 + 2 - 4 == 4

src: 3 10 2 5 3
tgt: 3 - 10 / 2 + 5 == 3

src: 4 2 7 4 5
tgt: 4 / 2 + 7 - 4 == 5

src: 8 11 2 8 9
tgt: - 8 + 11 - 2 + 8 == 9

src: 7 5 3 6 2
tgt: - 7 + 5 * 3 - 6 == 2

src: 8 11 2 6 11
tgt: - 8 + 11 + 2 + 6 == 11

src: 6 10 2 8 10
tgt: - 6 + 10 - 2 + 8 == 10



In [19]:
# white space tokenization
test_xs = white_space_tokenizer(raw_test_xs)
test_ys = white_space_tokenizer(raw_test_ys)

In [20]:
# combine data sets to a dict
train_dict = {}
train_dict['xs'] = train_xs
train_dict['ys'] = train_ys

val_dict = {}
val_dict['xs'] = val_xs
val_dict['ys'] = val_ys

test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

data_dict = dict()
data_dict['train'] = train_dict
data_dict['val'] = val_dict
data_dict['test'] = test_dict

vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

In [21]:
# save output as json
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

In [12]:
x = ['8', '11', '2', '6', '11'] 
y = ['-', '8', '+', '11', '+', '2', '+', '6', '==', '11']

In [13]:
# get operator indexes
operator_idxes = [i for i, token in enumerate(y) if not token.isdigit()][::-1] 
print(operator_idxes)
# decide how many operators to remove
num_idxes = np.random.choice(range(len(operator_idxes)+1))
print(num_idxes)
# decide operators to remove
idxes_to_remove = operator_idxes[:num_idxes]
print(idxes_to_remove)
x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
print(x)

[8, 6, 4, 2, 0]
4
[8, 6, 4, 2]
['8', '11', '6']


In [5]:
x

['8', '11', '2', '6']