# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# notebook authorship metadata
__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
method = 'rec'      # root folder of the output directory built further down
num_size = 100      # selects the 'num_size_{}' sub-directory
seq_len = 5         # selects the 'seq_len_{}' sub-directory
data_size = 10000   # selects the 'data_size_{}' sub-directory

# Seed NumPy's global generator so that any np.random-based sampling performed
# later in this notebook yields the same data on every Restart & Run All.
# NOTE(review): this only covers code drawing from np.random -- confirm that the
# helpers imported from utils do so.
random_seed = 0
np.random.seed(random_seed)

In [4]:
# input directory: raw 'aes' data for the configured num_size / seq_len / data_size
indir = os.path.join(
    'aes',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir

'aes/num_size_100/seq_len_5/data_size_10000'

In [5]:
# save path: <method>/num_size_<n>/seq_len_<l>/data_size_<d>
outdir = os.path.join(method,
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True creates the directory tree only when it is missing, so no
# separate (race-prone) os.path.exists check is needed
os.makedirs(outdir, exist_ok=True)
outdir

'rec/num_size_100/seq_len_5/data_size_10000'

In [6]:
# load raw dataset: one (samples, labels) pair of text files per split, read from indir
raw_train_xs, raw_train_ys = [
    load_txt(os.path.join(indir, fname)) for fname in ('train_x.txt', 'train_y.txt')
]
raw_val_xs, raw_val_ys = [
    load_txt(os.path.join(indir, fname)) for fname in ('val_x.txt', 'val_y.txt')
]
raw_test_xs, raw_test_ys = [
    load_txt(os.path.join(indir, fname)) for fname in ('test_x.txt', 'test_y.txt')
]

In [7]:
# check data size
# Samples and labels are paired with zip() further down, which silently drops
# surplus items, so a size mismatch is turned into an explicit failure here.
for split_name, split_xs, split_ys in (('train', raw_train_xs, raw_train_ys),
                                       ('val', raw_val_xs, raw_val_ys),
                                       ('test', raw_test_xs, raw_test_ys)):
    print('{} sample size'.format(split_name), len(split_xs))
    print('{} label size'.format(split_name), len(split_ys))
    assert len(split_xs) == len(split_ys), \
        '{} samples and labels differ in size'.format(split_name)

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Helper Functions

In [8]:
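# NOTE: reference copy only -- gen_rec_pair is still called further down, so it is
# presumably provided by `from utils import *` (TODO confirm).
# def gen_rec_pair(x, y): 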
#     x = x.split()
#     y = y.split()
#     xs = [x.copy()]
#     ys_ = []
#     num_left = len([i for i in x if i == '('])
#     for i in range(num_left):
#         left_idx = x.index('(') 
#         right_idx = x.index(')') 
#         v = y[left_idx] 
#         ys_.append(['<pos_{}>'.format(left_idx), '<pos_{}>'.format(right_idx), v])
#         x = x[:left_idx] + [v] + x[right_idx+1:]
#         xs.append(x)
#     ys_.append(['<done>']*3)
#     index = np.random.choice(range(len(xs)))
#     x = xs[index]
#     y_ = ys_[index]
#     return x, y_, y

In [9]:
# share of raw training pairs whose source line equals its label line
sum(src == tgt for src, tgt in zip(raw_train_xs, raw_train_ys)) / len(raw_train_xs)

0.168

In [10]:
# reference value to compare with the ratio computed in the previous cell
# NOTE(review): presumably the expected share of equations that contain no
# parenthesised operand -- confirm against the data generator
1/6

0.16666666666666666

### Train

In [11]:
# run gen_rec_pair on every raw (sample, label) pair and regroup the returned
# triples into three parallel tuples (previewed below as src / pred / tgt)
train_xs, train_ys_, train_ys = zip(*(
    gen_rec_pair(raw_x, raw_y)
    for raw_x, raw_y in zip(raw_train_xs, raw_train_ys)
))

In [12]:
# take a look at the last ten training examples
for i in range(-10, 0):
    for tag, seqs in (('src:', train_xs), ('tgt:', train_ys), ('pred:', train_ys_)):
        print(tag, seqs[i])
    print()

src: ['88', '-', '3', '+', '92', '-', '99', '==', '78']
tgt: ['88', '-', '3', '+', '92', '-', '99', '==', '78']
pred: ['<done>', '<done>', '<done>']

src: ['-', '58', '+', '67', '+', '5', '+', '74', '==', '(', '35', '+', '53', ')']
tgt: ['-', '58', '+', '67', '+', '5', '+', '74', '==', '88']
pred: ['<pos_9>', '<pos_13>', '88']

src: ['10', '+', '(', '97', '-', '63', ')', '-', '(', '94', '-', '52', ')', '+', '26', '==', '28']
tgt: ['10', '+', '34', '-', '42', '+', '26', '==', '28']
pred: ['<pos_2>', '<pos_6>', '34']

src: ['75', '-', '21', '-', '98', '+', '79', '==', '35']
tgt: ['75', '-', '21', '-', '98', '+', '79', '==', '35']
pred: ['<done>', '<done>', '<done>']

src: ['19', '-', '2', '-', '60', '+', '61', '==', '18']
tgt: ['19', '-', '2', '-', '60', '+', '61', '==', '18']
pred: ['<done>', '<done>', '<done>']

src: ['43', '+', '65', '-', '12', '-', '(', '-', '9', '+', '75', ')', '==', '(', '-', '16', '+', '46', ')']
tgt: ['43', '+', '65', '-', '12', '-', '66', '==', '30']
pred: ['<po

In [13]:
# closed-form estimate, to be compared with the measured '<done>' share in the next cell
sum(5/6 * 1/5 * 1/k for k in range(2, 7)) + 1/6

0.4083333333333333

In [15]:
# measured share of training examples whose prediction target is three '<done>' tokens
sum(step == ['<done>'] * 3 for step in train_ys_) / len(train_ys_)

0.41014285714285714

In [19]:
# source vocabulary frequency distribution: count every token of every training source
counter = Counter(token for tokens in train_xs for token in tokens)

print(len(counter))
print(counter.most_common())

107
[('+', 17212), ('-', 16250), ('(', 8853), (')', 8853), ('==', 7000), ('*', 1223), ('/', 860), ('2', 755), ('4', 655), ('3', 651), ('5', 624), ('8', 597), ('6', 594), ('7', 575), ('9', 574), ('10', 546), ('14', 539), ('21', 539), ('33', 531), ('12', 531), ('13', 529), ('11', 515), ('30', 515), ('25', 508), ('17', 505), ('22', 502), ('19', 498), ('27', 490), ('32', 488), ('18', 485), ('31', 483), ('43', 482), ('16', 482), ('15', 480), ('48', 477), ('23', 471), ('28', 469), ('36', 469), ('39', 469), ('24', 468), ('40', 467), ('35', 464), ('29', 463), ('41', 461), ('26', 460), ('47', 457), ('34', 455), ('38', 453), ('52', 452), ('20', 451), ('37', 449), ('63', 436), ('60', 435), ('42', 435), ('44', 434), ('66', 432), ('51', 426), ('59', 426), ('68', 424), ('69', 421), ('46', 421), ('65', 420), ('50', 419), ('56', 416), ('45', 415), ('78', 410), ('58', 403), ('55', 403), ('54', 397), ('64', 395), ('81', 395), ('49', 394), ('76', 393), ('61', 392), ('70', 392), ('90', 389), ('53', 387), 

In [20]:
# distinct source tokens, sorted as strings (so '10' comes before '2')
src_vocab_list = sorted(counter)
print(src_vocab_list)

['(', ')', '*', '+', '-', '/', '10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '==']


In [21]:
# source vocabulary dictionary: token -> integer index
src_vocab2idx_dict = {'<pad>': 0}  # index 0 is reserved to pad sequence length

# remaining tokens receive consecutive indices right after the reserved entry
for idx, token in enumerate(src_vocab_list, start=len(src_vocab2idx_dict)):
    src_vocab2idx_dict[token] = idx

print(src_vocab2idx_dict)

{'<pad>': 0, '(': 1, ')': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '100': 8, '101': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '2': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '3': 30, '30': 31, '31': 32, '32': 33, '33': 34, '34': 35, '35': 36, '36': 37, '37': 38, '38': 39, '39': 40, '4': 41, '40': 42, '41': 43, '42': 44, '43': 45, '44': 46, '45': 47, '46': 48, '47': 49, '48': 50, '49': 51, '5': 52, '50': 53, '51': 54, '52': 55, '53': 56, '54': 57, '55': 58, '56': 59, '57': 60, '58': 61, '59': 62, '6': 63, '60': 64, '61': 65, '62': 66, '63': 67, '64': 68, '65': 69, '66': 70, '67': 71, '68': 72, '69': 73, '7': 74, '70': 75, '71': 76, '72': 77, '73': 78, '74': 79, '75': 80, '76': 81, '77': 82, '78': 83, '79': 84, '8': 85, '80': 86, '81': 87, '82': 88, '83': 89, '84': 90, '85': 91, '86': 92, '87': 93, '88': 94, '89': 95, '9': 96, '90': 97, '91': 98, '92': 99, '93': 100, '94': 1

In [22]:
# target vocabulary frequency distribution: count every token of every training prediction target
counter = Counter(token for step in train_ys_ for token in step)

print(len(counter))
print(counter.most_common())

116
[('<done>', 8454), ('<pos_6>', 951), ('<pos_8>', 945), ('<pos_4>', 850), ('<pos_7>', 740), ('<pos_5>', 721), ('<pos_9>', 704), ('<pos_2>', 524), ('<pos_0>', 515), ('<pos_12>', 450), ('<pos_11>', 427), ('<pos_10>', 427), ('<pos_13>', 367), ('<pos_1>', 312), ('<pos_3>', 310), ('<pos_14>', 121), ('4', 77), ('6', 70), ('2', 62), ('5', 59), ('3', 59), ('7', 58), ('32', 56), ('36', 56), ('8', 55), ('25', 55), ('29', 55), ('21', 53), ('42', 53), ('28', 53), ('12', 53), ('16', 53), ('17', 53), ('34', 51), ('23', 51), ('58', 50), ('30', 50), ('24', 50), ('52', 50), ('68', 49), ('26', 48), ('18', 48), ('57', 48), ('9', 48), ('47', 47), ('15', 46), ('19', 46), ('43', 45), ('14', 45), ('48', 45), ('31', 44), ('13', 44), ('61', 44), ('66', 44), ('72', 44), ('20', 44), ('22', 43), ('27', 43), ('93', 42), ('10', 42), ('11', 42), ('45', 42), ('67', 41), ('33', 41), ('39', 41), ('46', 40), ('62', 40), ('41', 39), ('84', 39), ('49', 39), ('64', 39), ('89', 39), ('51', 39), ('78', 38), ('101', 38), (

In [23]:
# distinct target tokens, sorted as strings
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '<done>', '<pos_0>', '<pos_10>', '<pos_11>', '<pos_12>', '<pos_13>', '<pos_14>', '<pos_1>', '<pos_2>', '<pos_3>', '<pos_4>', '<pos_5>', '<pos_6>', '<pos_7>', '<pos_8>', '<pos_9>']


In [27]:
# target vocabulary dictionary: token -> integer index
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,    # to mark the start of a sequence
}

# remaining tokens receive consecutive indices right after the reserved entries
for idx, token in enumerate(tgt_vocab_list, start=len(tgt_vocab2idx_dict)):
    tgt_vocab2idx_dict[token] = idx

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '10': 2, '100': 3, '101': 4, '11': 5, '12': 6, '13': 7, '14': 8, '15': 9, '16': 10, '17': 11, '18': 12, '19': 13, '2': 14, '20': 15, '21': 16, '22': 17, '23': 18, '24': 19, '25': 20, '26': 21, '27': 22, '28': 23, '29': 24, '3': 25, '30': 26, '31': 27, '32': 28, '33': 29, '34': 30, '35': 31, '36': 32, '37': 33, '38': 34, '39': 35, '4': 36, '40': 37, '41': 38, '42': 39, '43': 40, '44': 41, '45': 42, '46': 43, '47': 44, '48': 45, '49': 46, '5': 47, '50': 48, '51': 49, '52': 50, '53': 51, '54': 52, '55': 53, '56': 54, '57': 55, '58': 56, '59': 57, '6': 58, '60': 59, '61': 60, '62': 61, '63': 62, '64': 63, '65': 64, '66': 65, '67': 66, '68': 67, '69': 68, '7': 69, '70': 70, '71': 71, '72': 72, '73': 73, '74': 74, '75': 75, '76': 76, '77': 77, '78': 78, '79': 79, '8': 80, '80': 81, '81': 82, '82': 83, '83': 84, '84': 85, '85': 86, '86': 87, '87': 88, '88': 89, '89': 90, '9': 91, '90': 92, '91': 93, '92': 94, '93': 95, '94': 96, '95': 97, '96': 98, '97': 99, '98': 100, 

### Val

In [15]:
# white space tokenization of the validation samples and labels
val_xs, val_ys = (white_space_tokenizer(raw) for raw in (raw_val_xs, raw_val_ys))

In [16]:
# take a look at the last ten validation examples
for i in range(-10, 0):
    for tag, seqs in (('src:', val_xs), ('tgt:', val_ys)):
        print(tag, seqs[i])
    print()

src: ['18', '-', '45', '+', '44', '-', '(', '91', '-', '79', ')', '==', '5']
tgt: ['18', '-', '45', '+', '44', '-', '12', '==', '5']

src: ['-', '(', '-', '61', '+', '82', ')', '+', '77', '-', '(', '2', '*', '29', ')', '+', '(', '10', '+', '63', ')', '==', '(', '18', '+', '53', ')']
tgt: ['-', '21', '+', '77', '-', '58', '+', '73', '==', '71']

src: ['(', '19', '+', '49', ')', '-', '(', '82', '-', '56', ')', '+', '84', '-', '(', '35', '+', '14', ')', '==', '(', '50', '+', '27', ')']
tgt: ['68', '-', '26', '+', '84', '-', '49', '==', '77']

src: ['100', '+', '(', '98', '-', '56', ')', '+', '19', '-', '(', '58', '+', '25', ')', '==', '(', '68', '+', '10', ')']
tgt: ['100', '+', '42', '+', '19', '-', '83', '==', '78']

src: ['54', '+', '(', '-', '2', '+', '56', ')', '+', '(', '76', '-', '25', ')', '-', '86', '==', '(', '70', '+', '3', ')']
tgt: ['54', '+', '54', '+', '51', '-', '86', '==', '73']

src: ['53', '-', '(', '-', '12', '+', '63', ')', '-', '95', '+', '98', '==', '5']
tgt: ['53',

### Test

In [17]:
# white space tokenization of the test samples and labels
test_xs, test_ys = (white_space_tokenizer(raw) for raw in (raw_test_xs, raw_test_ys))

In [18]:
# take a look at the last ten test examples
for i in range(-10, 0):
    for tag, seqs in (('src:', test_xs), ('tgt:', test_ys)):
        print(tag, seqs[i])
    print()

src: ['(', '-', '25', '+', '79', ')', '-', '(', '72', '+', '26', ')', '-', '36', '+', '83', '==', '(', '-', '20', '+', '23', ')']
tgt: ['54', '-', '98', '-', '36', '+', '83', '==', '3']

src: ['25', '+', '59', '-', '(', '60', '-', '17', ')', '+', '11', '==', '52']
tgt: ['25', '+', '59', '-', '43', '+', '11', '==', '52']

src: ['20', '-', '(', '-', '11', '+', '38', ')', '+', '(', '-', '63', '+', '75', ')', '+', '84', '==', '89']
tgt: ['20', '-', '27', '+', '12', '+', '84', '==', '89']

src: ['(', '-', '5', '+', '87', ')', '-', '(', '3', '+', '97', ')', '+', '(', '38', '+', '57', ')', '+', '(', '-', '16', '+', '24', ')', '==', '(', '71', '+', '14', ')']
tgt: ['82', '-', '100', '+', '95', '+', '8', '==', '85']

src: ['-', '(', '3', '*', '19', ')', '+', '(', '21', '+', '69', ')', '-', '(', '10', '*', '3', ')', '+', '(', '91', '-', '72', ')', '==', '(', '80', '-', '58', ')']
tgt: ['-', '57', '+', '90', '-', '30', '+', '19', '==', '22']

src: ['-', '(', '23', '+', '50', ')', '+', '33', '*', 

In [19]:
# combine data sets to a dict
# The train split now also stores its sources and per-example prediction
# targets: both vocabularies are built from train_xs / train_ys_, so saving
# only 'ys' would leave the exported data unusable with them. The existing
# 'ys' entry is unchanged, so readers of the previous layout keep working.
# NOTE(review): confirm that no consumer relies on 'xs' being absent for train.
train_dict = {}
train_dict['xs'] = train_xs    # sources (previewed as 'src')
train_dict['ys_'] = train_ys_  # prediction targets (previewed as 'pred')
train_dict['ys'] = train_ys    # targets (previewed as 'tgt')

val_dict = {}
val_dict['xs'] = val_xs
val_dict['ys'] = val_ys

test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

# all splits under a single object
data_dict = dict()
data_dict['train'] = train_dict
data_dict['val'] = val_dict
data_dict['test'] = test_dict

# source and target token -> index mappings
vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

In [20]:
# save output as json: data splits and vocabularies go to separate files in outdir
data_path, vocab_path = (
    os.path.join(outdir, fname) for fname in ('data.json', 'vocab.json')
)

for target_path, payload in ((data_path, data_dict), (vocab_path, vocab_dict)):
    save_json(target_path, payload)

# Archive Code

In [37]:
# raw_train_xs = load_txt(os.path.join(indir, 'train_x.txt'))
# x = raw_train_xs[0]
# y = raw_train_ys[0]
# x = x.split()
# y = y.split()
# print(x)
# print(y)
# print()

# xs = [x.copy()]
# ys_ = []

# num_left = len([i for i in x if i == '('])
# for i in range(num_left):
#     left_idx = x.index('(') 
#     right_idx = x.index(')') 
#     v = y[left_idx] 
#     ys_.append(['<pos_{}>'.format(left_idx), '<pos_{}>'.format(right_idx), v])
#     x = x[:left_idx] + [v] + x[right_idx+1:]
#     xs.append(x)

# ys_.append(['<done>']*3)

# for x, y_ in zip(xs, ys_):
#     print(x)
#     print(y_)

In [38]:
# x = xs[0]
# y_ = ys_[0]
# print(x)
# print(y_)

In [39]:
# np_xs = [x for i in range(10)]
# np_ys_ = [y_ for i in range(10)]
# np_xs.append(xs[0])
# np_ys_.append(ys_[-1])

# np_xs = np.array(np_xs)
# np_ys_ = np.array(np_ys_)

# print(np_xs)
# print()
# print(np_ys_)

In [40]:
# mask = (np_ys_ != '<done>').all(axis=-1)
# mask

In [41]:
# np_ys_[mask]

In [42]:
# def parse_pos(pos):
#     return int(''.join([i for i in pos if i.isdigit()]))

In [43]:
# get_pos = np.vectorize(parse_pos, otypes=[int])

In [44]:
# left = get_pos(np_ys_[mask, :2])[:, 0]
# left

In [45]:
# right = get_pos(np_ys_[mask, :2])[:, 1]
# right

In [46]:
# np_xs[mask, left]

In [47]:
# y_[0].startswith('<pos_')

In [48]:
# left_idx = parse_pos(y_[0])

In [49]:
# right_idx = parse_pos(y_[1])

In [50]:
# x

In [51]:
# x[:left_idx] + [y_[2]] + x[right_idx+1:]