# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# dataset configuration
# each value becomes one component of the input/output directory paths
# that the following cells build (num_size_*/seq_len_*/data_size_*)
data_size = 10000
seq_len = 5
num_size = 100

In [4]:
# input directory: root folder plus one sub-folder per dataset parameter
path_parts = ['aes',
              'num_size_{}'.format(num_size),
              'seq_len_{}'.format(seq_len),
              'data_size_{}'.format(data_size)]
indir = os.path.join(*path_parts)
# bare expression so the notebook displays the resolved path
indir

'aes/num_size_100/seq_len_5/data_size_10000'

In [5]:
# save path
# The output directory uses the same parameter-derived sub-folders as the
# input directory, rooted at 'e2e'.
outdir = os.path.join('e2e',
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True makes the creation idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test. If the path
# exists but is not a directory this raises FileExistsError right here
# instead of failing later when the outputs are written.
os.makedirs(outdir, exist_ok=True)
# bare expression so the notebook displays the resolved path
outdir

'e2e/num_size_100/seq_len_5/data_size_10000'

In [6]:
# read the raw sample (x) and label (y) files of every split from indir;
# the file order below matches the unpacking order of the variables
raw_files = ('train_x.txt', 'train_y.txt',
             'val_x.txt', 'val_y.txt',
             'test_x.txt', 'test_y.txt')
(raw_train_xs, raw_train_ys,
 raw_val_xs, raw_val_ys,
 raw_test_xs, raw_test_ys) = [load_txt(os.path.join(indir, name))
                              for name in raw_files]

In [7]:
# report how many samples and labels each split contains;
# sample and label counts of a split are expected to match
size_report = (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
)
for label, rows in size_report:
    print(label, len(rows))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [8]:
# tokenize the raw training targets and sources
# (white_space_tokenizer comes from utils; judging by the printed samples it
# turns each raw line into a list of string tokens -- confirm in utils)
train_ys, train_xs = map(white_space_tokenizer, (raw_train_ys, raw_train_xs))

In [9]:
# print the last ten (source, target) training pairs, oldest first
for back in range(10, 0, -1):
    print('src:', train_xs[-back])
    print('tgt:', train_ys[-back])
    print()

src: ['88', '-', '(', '40', '-', '37', ')', '+', '(', '21', '+', '71', ')', '-', '99', '==', '(', '32', '+', '46', ')']
tgt: ['88', '-', '3', '+', '92', '-', '99', '==', '78']

src: ['-', '(', '41', '+', '17', ')', '+', '(', '99', '-', '32', ')', '+', '(', '-', '76', '+', '81', ')', '+', '(', '21', '+', '53', ')', '==', '(', '35', '+', '53', ')']
tgt: ['-', '58', '+', '67', '+', '5', '+', '74', '==', '88']

src: ['10', '+', '(', '97', '-', '63', ')', '-', '(', '94', '-', '52', ')', '+', '26', '==', '28']
tgt: ['10', '+', '34', '-', '42', '+', '26', '==', '28']

src: ['(', '43', '+', '32', ')', '-', '(', '17', '+', '4', ')', '-', '(', '30', '+', '68', ')', '+', '(', '97', '-', '18', ')', '==', '(', '83', '-', '48', ')']
tgt: ['75', '-', '21', '-', '98', '+', '79', '==', '35']

src: ['19', '-', '(', '-', '46', '+', '48', ')', '-', '60', '+', '(', '18', '+', '43', ')', '==', '(', '38', '-', '20', ')']
tgt: ['19', '-', '2', '-', '60', '+', '61', '==', '18']

src: ['(', '75', '-', '32', ')'

In [10]:
# source vocabulary frequency distribution
# count every token occurrence across all training source sequences
counter = Counter(token for tokens in train_xs for token in tokens)

print(len(counter))           # number of distinct source tokens
print(counter.most_common())  # (token, count) pairs, most frequent first

107
[('+', 22882), ('-', 21737), ('(', 17495), (')', 17495), ('==', 7000), ('*', 1356), ('/', 879), ('2', 845), ('3', 762), ('4', 761), ('6', 751), ('9', 715), ('7', 706), ('5', 693), ('8', 681), ('10', 680), ('18', 666), ('20', 661), ('14', 659), ('12', 655), ('23', 651), ('15', 649), ('19', 639), ('11', 632), ('13', 631), ('16', 628), ('17', 623), ('30', 623), ('35', 619), ('22', 618), ('24', 615), ('31', 599), ('32', 598), ('26', 595), ('25', 593), ('28', 593), ('29', 592), ('37', 588), ('21', 583), ('40', 581), ('46', 574), ('39', 563), ('27', 557), ('33', 556), ('38', 552), ('36', 552), ('44', 549), ('45', 545), ('43', 545), ('47', 534), ('34', 531), ('42', 529), ('48', 527), ('53', 526), ('41', 522), ('52', 514), ('56', 506), ('65', 504), ('50', 504), ('49', 501), ('51', 499), ('72', 497), ('54', 497), ('64', 496), ('66', 494), ('55', 494), ('59', 488), ('58', 488), ('57', 486), ('60', 484), ('63', 475), ('61', 474), ('69', 466), ('67', 461), ('76', 459), ('68', 456), ('71', 451)

In [11]:
# distinct source tokens in sorted (string) order; iterating a Counter yields its keys
src_vocab_list = sorted(counter)
print(src_vocab_list)

['(', ')', '*', '+', '-', '/', '10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '==']


In [12]:
# source vocabulary dictionary: token -> integer index
src_vocab2idx_dict = {'<pad>': 0}  # to pad sequence length

# regular tokens get consecutive indices right after the reserved entry
for idx, token in enumerate(src_vocab_list, start=len(src_vocab2idx_dict)):
    src_vocab2idx_dict[token] = idx

print(src_vocab2idx_dict)

{'<pad>': 0, '(': 1, ')': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '100': 8, '101': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '2': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '3': 30, '30': 31, '31': 32, '32': 33, '33': 34, '34': 35, '35': 36, '36': 37, '37': 38, '38': 39, '39': 40, '4': 41, '40': 42, '41': 43, '42': 44, '43': 45, '44': 46, '45': 47, '46': 48, '47': 49, '48': 50, '49': 51, '5': 52, '50': 53, '51': 54, '52': 55, '53': 56, '54': 57, '55': 58, '56': 59, '57': 60, '58': 61, '59': 62, '6': 63, '60': 64, '61': 65, '62': 66, '63': 67, '64': 68, '65': 69, '66': 70, '67': 71, '68': 72, '69': 73, '7': 74, '70': 75, '71': 76, '72': 77, '73': 78, '74': 79, '75': 80, '76': 81, '77': 82, '78': 83, '79': 84, '8': 85, '80': 86, '81': 87, '82': 88, '83': 89, '84': 90, '85': 91, '86': 92, '87': 93, '88': 94, '89': 95, '9': 96, '90': 97, '91': 98, '92': 99, '93': 100, '94': 1

In [13]:
# target vocabulary frequency distribution
# count every token occurrence across all training target sequences
counter = Counter(token for tokens in train_ys for token in tokens)

print(len(counter))           # number of distinct target tokens
print(counter.most_common())  # (token, count) pairs, most frequent first

105
[('+', 11294), ('-', 10605), ('==', 7000), ('*', 1007), ('/', 879), ('2', 588), ('3', 499), ('6', 495), ('4', 463), ('12', 444), ('8', 444), ('24', 438), ('16', 425), ('18', 421), ('14', 417), ('9', 417), ('7', 415), ('10', 413), ('22', 408), ('30', 406), ('13', 405), ('5', 400), ('38', 397), ('23', 392), ('15', 391), ('48', 388), ('37', 387), ('11', 385), ('28', 382), ('17', 381), ('36', 381), ('20', 374), ('32', 374), ('21', 373), ('25', 371), ('29', 369), ('33', 369), ('19', 367), ('42', 366), ('40', 364), ('27', 363), ('45', 360), ('44', 359), ('26', 357), ('60', 356), ('46', 355), ('56', 354), ('54', 352), ('47', 348), ('55', 348), ('59', 348), ('35', 346), ('39', 345), ('34', 344), ('50', 343), ('74', 340), ('43', 337), ('66', 337), ('57', 336), ('49', 335), ('78', 334), ('41', 333), ('51', 332), ('31', 331), ('52', 330), ('58', 330), ('65', 329), ('70', 327), ('69', 327), ('64', 327), ('75', 326), ('61', 325), ('63', 323), ('72', 320), ('77', 318), ('71', 314), ('83', 312), 

In [14]:
# distinct target tokens in sorted (string) order; iterating a Counter yields its keys
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '==']


In [15]:
# target vocabulary dictionary: token -> integer index
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,    # to mark the start of a sequence
    '</s>': 2,   # to mark the end of a sequence
}

# regular tokens get consecutive indices right after the reserved entries
for idx, token in enumerate(tgt_vocab_list, start=len(tgt_vocab2idx_dict)):
    tgt_vocab2idx_dict[token] = idx

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '100': 8, '101': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '2': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '3': 30, '30': 31, '31': 32, '32': 33, '33': 34, '34': 35, '35': 36, '36': 37, '37': 38, '38': 39, '39': 40, '4': 41, '40': 42, '41': 43, '42': 44, '43': 45, '44': 46, '45': 47, '46': 48, '47': 49, '48': 50, '49': 51, '5': 52, '50': 53, '51': 54, '52': 55, '53': 56, '54': 57, '55': 58, '56': 59, '57': 60, '58': 61, '59': 62, '6': 63, '60': 64, '61': 65, '62': 66, '63': 67, '64': 68, '65': 69, '66': 70, '67': 71, '68': 72, '69': 73, '7': 74, '70': 75, '71': 76, '72': 77, '73': 78, '74': 79, '75': 80, '76': 81, '77': 82, '78': 83, '79': 84, '8': 85, '80': 86, '81': 87, '82': 88, '83': 89, '84': 90, '85': 91, '86': 92, '87': 93, '88': 94, '89': 95, '9': 96, '90': 97, '91': 98, '92': 99, '93': 100, '9

### Val

In [16]:
# tokenize the raw validation sources and targets
# (white_space_tokenizer comes from utils; judging by the printed samples it
# turns each raw line into a list of string tokens -- confirm in utils)
val_xs, val_ys = map(white_space_tokenizer, (raw_val_xs, raw_val_ys))

In [17]:
# print the last ten (source, target) validation pairs, oldest first
for back in range(10, 0, -1):
    print('src:', val_xs[-back])
    print('tgt:', val_ys[-back])
    print()

src: ['18', '-', '45', '+', '44', '-', '(', '91', '-', '79', ')', '==', '5']
tgt: ['18', '-', '45', '+', '44', '-', '12', '==', '5']

src: ['-', '(', '-', '61', '+', '82', ')', '+', '77', '-', '(', '2', '*', '29', ')', '+', '(', '10', '+', '63', ')', '==', '(', '18', '+', '53', ')']
tgt: ['-', '21', '+', '77', '-', '58', '+', '73', '==', '71']

src: ['(', '19', '+', '49', ')', '-', '(', '82', '-', '56', ')', '+', '84', '-', '(', '35', '+', '14', ')', '==', '(', '50', '+', '27', ')']
tgt: ['68', '-', '26', '+', '84', '-', '49', '==', '77']

src: ['100', '+', '(', '98', '-', '56', ')', '+', '19', '-', '(', '58', '+', '25', ')', '==', '(', '68', '+', '10', ')']
tgt: ['100', '+', '42', '+', '19', '-', '83', '==', '78']

src: ['54', '+', '(', '-', '2', '+', '56', ')', '+', '(', '76', '-', '25', ')', '-', '86', '==', '(', '70', '+', '3', ')']
tgt: ['54', '+', '54', '+', '51', '-', '86', '==', '73']

src: ['53', '-', '(', '-', '12', '+', '63', ')', '-', '95', '+', '98', '==', '5']
tgt: ['53',

### Test

In [18]:
# tokenize the raw test sources and targets
# (white_space_tokenizer comes from utils; judging by the printed samples it
# turns each raw line into a list of string tokens -- confirm in utils)
test_xs, test_ys = map(white_space_tokenizer, (raw_test_xs, raw_test_ys))

In [19]:
# print the last ten (source, target) test pairs, oldest first
for back in range(10, 0, -1):
    print('src:', test_xs[-back])
    print('tgt:', test_ys[-back])
    print()

src: ['(', '-', '25', '+', '79', ')', '-', '(', '72', '+', '26', ')', '-', '36', '+', '83', '==', '(', '-', '20', '+', '23', ')']
tgt: ['54', '-', '98', '-', '36', '+', '83', '==', '3']

src: ['25', '+', '59', '-', '(', '60', '-', '17', ')', '+', '11', '==', '52']
tgt: ['25', '+', '59', '-', '43', '+', '11', '==', '52']

src: ['20', '-', '(', '-', '11', '+', '38', ')', '+', '(', '-', '63', '+', '75', ')', '+', '84', '==', '89']
tgt: ['20', '-', '27', '+', '12', '+', '84', '==', '89']

src: ['(', '-', '5', '+', '87', ')', '-', '(', '3', '+', '97', ')', '+', '(', '38', '+', '57', ')', '+', '(', '-', '16', '+', '24', ')', '==', '(', '71', '+', '14', ')']
tgt: ['82', '-', '100', '+', '95', '+', '8', '==', '85']

src: ['-', '(', '3', '*', '19', ')', '+', '(', '21', '+', '69', ')', '-', '(', '10', '*', '3', ')', '+', '(', '91', '-', '72', ')', '==', '(', '80', '-', '58', ')']
tgt: ['-', '57', '+', '90', '-', '30', '+', '19', '==', '22']

src: ['-', '(', '23', '+', '50', ')', '+', '33', '*', 

In [20]:
# combine data sets to a dict
# Every split stores its tokenized source sequences under 'xs' and its
# tokenized target sequences under 'ys'. The train split needs both keys,
# exactly like val and test; without 'xs' the saved training data would
# contain targets only.
train_dict = {'xs': train_xs, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# split name -> {'xs': ..., 'ys': ...}
data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token -> index lookups for the source and target sides
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [21]:
# write the combined splits and the vocabularies into outdir as JSON files
data_path, vocab_path = (os.path.join(outdir, name)
                         for name in ('data.json', 'vocab.json'))

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

# Archive Code

In [9]:
# reload the raw training files and pick one (source, target) pair
raw_train_xs, raw_train_ys = (load_txt(os.path.join(indir, name))
                              for name in ('train_x.txt', 'train_y.txt'))
# str.split() without arguments splits on runs of whitespace
x, y = raw_train_xs[4].split(), raw_train_ys[4].split()
for tokens in (x, y):
    print(tokens)
print()

['(', '44', '+', '36', ')', '+', '(', '10', '+', '6', ')', '+', '(', '20', '+', '9', ')', '-', '(', '65', '+', '35', ')', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '16', '+', '29', '-', '100', '==', '25']



In [10]:
# for online end2end
# Build the chain of intermediate sequences between source x and target y:
# each step replaces the left-most parenthesised group in x by the single
# token of y found at the position of that group's opening parenthesis.
# Everything left of that parenthesis has already been reduced, so x and y
# are index-aligned up to that point.
xs = [list(x)]  # snapshot of the untouched source sequence
for _ in range(x.count('(')):
    open_at = x.index('(')   # first opening parenthesis still present
    close_at = x.index(')')  # first closing parenthesis (groups are assumed not to nest)
    x = x[:open_at] + [y[open_at]] + x[close_at + 1:]
    xs.append(x)

for step in xs:
    print(step)

['(', '44', '+', '36', ')', '+', '(', '10', '+', '6', ')', '+', '(', '20', '+', '9', ')', '-', '(', '65', '+', '35', ')', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '(', '10', '+', '6', ')', '+', '(', '20', '+', '9', ')', '-', '(', '65', '+', '35', ')', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '16', '+', '(', '20', '+', '9', ')', '-', '(', '65', '+', '35', ')', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '16', '+', '29', '-', '(', '65', '+', '35', ')', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '16', '+', '29', '-', '100', '==', '(', '-', '58', '+', '83', ')']
['80', '+', '16', '+', '29', '-', '100', '==', '25']
