# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# authorship metadata for this notebook
__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
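# private (star import: load_txt, white_space_tokenizer, save_json and gen_tag_pair are not defined in this notebook, so they presumably come from here -- confirm)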
from utils import *

In [3]:
# define parameters
# the three values below are interpolated into the input and output directory
# names (num_size_*/seq_len_*/data_size_*) built in the following cells
num_size, seq_len, data_size = 100, 5, 10000

In [4]:
# load path
# the raw dataset lives under <data_src>/num_size_*/seq_len_*/data_size_*
data_src = 'aes'
indir_parts = [
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
]
indir = os.path.join(data_src, *indir_parts)
indir

'aes/num_size_100/seq_len_5/data_size_10000'

In [5]:
# save path
# output directory mirrors the input layout: tag/num_size_*/seq_len_*/data_size_*
outdir = os.path.join(
    'tag',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True creates the directory tree when missing and is a no-op when it
# already exists, replacing the racy "check, then create" pair
os.makedirs(outdir, exist_ok=True)
outdir

'tag/num_size_100/seq_len_5/data_size_10000'

In [6]:
# load raw dataset
# each split is stored as a pair of text files: <split>_x.txt and <split>_y.txt
raw_train_xs, raw_train_ys = (
    load_txt(os.path.join(indir, fname)) for fname in ('train_x.txt', 'train_y.txt')
)
raw_val_xs, raw_val_ys = (
    load_txt(os.path.join(indir, fname)) for fname in ('val_x.txt', 'val_y.txt')
)
raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, fname)) for fname in ('test_x.txt', 'test_y.txt')
)

In [7]:
# check data size
print('train sample size', len(raw_train_xs))
print('train label size', len(raw_train_ys))
print('val sample size', len(raw_val_xs))
print('val label size', len(raw_val_ys))
print('test sample size', len(raw_test_xs))
print('test label size', len(raw_test_ys))

# fail loudly on misaligned files: the training pairs are later combined with
# zip(), which would silently drop the surplus entries of the longer list
assert len(raw_train_xs) == len(raw_train_ys), 'train samples and labels differ in size'
assert len(raw_val_xs) == len(raw_val_ys), 'val samples and labels differ in size'
assert len(raw_test_xs) == len(raw_test_ys), 'test samples and labels differ in size'

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Helper Functions

In [8]:
# NOTE(review): this cell is entirely commented out, so running it defines nothing.
# The `gen_tag_pair` called in the Train section must therefore be supplied from
# elsewhere (presumably by `from utils import *` -- confirm).
# The code is kept as a reference for the tagging scheme it implements:
#   - x and y are whitespace-separated token strings (source and target)
#   - a source token equal to the current target token is tagged '<keep>'
#   - otherwise the source token is tagged '<sub_TARGET>' and every following source
#     token up to and including the next ')' is tagged '<delete>'
#   - returns the untouched source string, the re-joined target and the joined tags
# def gen_tag_pair(x, y):
#     x_ = x.split()
#     y = y.split()
#     y_ = []
#     x_token = x_.pop(0)
#     for i in range(len(y)):
#         y_token = y[i]
#         if x_token == y_token:
#             y_.append('<keep>')
#             if len(x_) == 0:
#                 break
#             x_token = x_.pop(0)
#         else:
#             y_.append('<sub_{}>'.format(y_token))
#             x_token = x_.pop(0)
#             while True:
#                 y_.append('<delete>')
#                 if x_token == ')':
#                     if len(x_) != 0:
#                         x_token = x_.pop(0)
#                     break
#                 x_token = x_.pop(0)
#     return x, ' '.join(y), ' '.join(y_)

### Train

In [9]:
# tag every (source, target) training pair; each call is unpacked as a triple and
# zip(*...) transposes the triples into three parallel tuples
train_xs, train_ys, train_ys_ = zip(*map(gen_tag_pair, raw_train_xs, raw_train_ys))

In [10]:
# white space tokenization
# NOTE(review): the three names are rebound to their tokenized versions, so this
# cell consumes its own inputs -- re-run the tagging cell before re-running this one
train_xs, train_ys, train_ys_ = (
    white_space_tokenizer(seqs) for seqs in (train_xs, train_ys, train_ys_)
)

In [11]:
# take a look at the last ten training examples
# ('pred:' labels the tag sequence produced by gen_tag_pair)
train_views = (('src:', train_xs), ('tgt:', train_ys), ('pred:', train_ys_))
for offset in range(-10, 0):
    for label, seqs in train_views:
        print(label, seqs[offset])
    print()

src: ['(', '55', '-', '45', ')', '+', '(', '14', '+', '12', ')', '+', '(', '94', '-', '53', ')', '-', '(', '35', '-', '2', ')', '==', '44']
tgt: ['10', '+', '26', '+', '41', '-', '33', '==', '44']
pred: ['<sub_10>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<sub_26>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<sub_41>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<sub_33>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<keep>']

src: ['46', '+', '(', '20', '+', '6', ')', '/', '25', '*', '(', '15', '+', '35', ')', '==', '98']
tgt: ['46', '+', '26', '/', '25', '*', '50', '==', '98']
pred: ['<keep>', '<keep>', '<sub_26>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<keep>', '<keep>', '<sub_50>', '<delete>', '<delete>', '<delete>', '<delete>', '<keep>', '<keep>']

src: ['(', '45', '+', '32', ')', '+', '19', '-', '3', '*', '11', '==', '(', '4', '+', '59', ')']
tgt: ['77', '+', '19', '-', '3', '*', '11', '==', '63']

In [12]:
# source vocabulary frequency distribution
# count every token across all tokenized training sources in a single pass
counter = Counter(token for seq in train_xs for token in seq)

print(len(counter))
print(counter.most_common())

107
[('+', 22978), ('-', 21816), ('(', 17645), (')', 17645), ('==', 7000), ('*', 1417), ('2', 918), ('/', 860), ('3', 795), ('4', 776), ('5', 746), ('6', 746), ('9', 722), ('7', 719), ('8', 715), ('13', 708), ('10', 664), ('21', 650), ('33', 645), ('22', 645), ('11', 643), ('14', 641), ('19', 626), ('17', 625), ('16', 622), ('12', 622), ('25', 620), ('24', 615), ('15', 612), ('23', 608), ('28', 599), ('18', 594), ('35', 589), ('30', 589), ('32', 587), ('26', 586), ('20', 583), ('27', 583), ('29', 580), ('31', 580), ('43', 570), ('48', 567), ('47', 564), ('40', 563), ('36', 563), ('39', 558), ('41', 556), ('34', 552), ('37', 545), ('38', 537), ('60', 535), ('42', 522), ('44', 517), ('52', 514), ('59', 509), ('51', 508), ('45', 505), ('66', 501), ('46', 501), ('55', 498), ('56', 497), ('58', 493), ('53', 489), ('49', 487), ('50', 484), ('63', 480), ('54', 479), ('65', 477), ('62', 475), ('69', 474), ('81', 467), ('73', 467), ('70', 465), ('57', 464), ('61', 462), ('68', 460), ('72', 459)

In [13]:
# iterating a Counter yields its keys; tokens are strings, so the order is
# lexicographic ('10' sorts before '2')
src_vocab_list = sorted(counter)
print(src_vocab_list)

['(', ')', '*', '+', '-', '/', '10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '==']


In [14]:
# source vocabulary dictionary
src_vocab2idx_dict = {'<pad>': 0}  # to pad sequence length

# real tokens receive consecutive indices right after the reserved entries
src_first_idx = len(src_vocab2idx_dict)
src_vocab2idx_dict.update(
    (token, idx) for idx, token in enumerate(src_vocab_list, start=src_first_idx)
)

print(src_vocab2idx_dict)

{'<pad>': 0, '(': 1, ')': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '100': 8, '101': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '2': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '3': 30, '30': 31, '31': 32, '32': 33, '33': 34, '34': 35, '35': 36, '36': 37, '37': 38, '38': 39, '39': 40, '4': 41, '40': 42, '41': 43, '42': 44, '43': 45, '44': 46, '45': 47, '46': 48, '47': 49, '48': 50, '49': 51, '5': 52, '50': 53, '51': 54, '52': 55, '53': 56, '54': 57, '55': 58, '56': 59, '57': 60, '58': 61, '59': 62, '6': 63, '60': 64, '61': 65, '62': 66, '63': 67, '64': 68, '65': 69, '66': 70, '67': 71, '68': 72, '69': 73, '7': 74, '70': 75, '71': 76, '72': 77, '73': 78, '74': 79, '75': 80, '76': 81, '77': 82, '78': 83, '79': 84, '8': 85, '80': 86, '81': 87, '82': 88, '83': 89, '84': 90, '85': 91, '86': 92, '87': 93, '88': 94, '89': 95, '9': 96, '90': 97, '91': 98, '92': 99, '93': 100, '94': 1

In [15]:
# target vocabulary frequency distribution
# NOTE: rebinds `counter`, replacing the source-side counts from the earlier cell
counter = Counter(tag for tags in train_ys_ for tag in tags)

print(len(counter))
print(counter.most_common())

102
[('<delete>', 76231), ('<keep>', 48130), ('<sub_4>', 295), ('<sub_2>', 281), ('<sub_3>', 254), ('<sub_6>', 246), ('<sub_5>', 243), ('<sub_9>', 235), ('<sub_12>', 230), ('<sub_21>', 225), ('<sub_30>', 223), ('<sub_8>', 221), ('<sub_36>', 215), ('<sub_7>', 210), ('<sub_14>', 210), ('<sub_32>', 207), ('<sub_18>', 201), ('<sub_13>', 201), ('<sub_68>', 200), ('<sub_25>', 200), ('<sub_26>', 199), ('<sub_17>', 198), ('<sub_29>', 196), ('<sub_15>', 196), ('<sub_20>', 194), ('<sub_69>', 194), ('<sub_34>', 194), ('<sub_16>', 191), ('<sub_40>', 191), ('<sub_39>', 191), ('<sub_19>', 190), ('<sub_43>', 189), ('<sub_42>', 188), ('<sub_11>', 187), ('<sub_56>', 186), ('<sub_24>', 186), ('<sub_45>', 186), ('<sub_27>', 185), ('<sub_33>', 185), ('<sub_66>', 185), ('<sub_31>', 181), ('<sub_22>', 181), ('<sub_50>', 181), ('<sub_48>', 181), ('<sub_52>', 181), ('<sub_28>', 178), ('<sub_10>', 176), ('<sub_38>', 176), ('<sub_46>', 175), ('<sub_41>', 174), ('<sub_23>', 173), ('<sub_57>', 172), ('<sub_63>', 

In [16]:
# tgt_vocab_list = ['<delete>', '<keep>']
# tgt_vocab_list += ['<add_{}>'.format(i) for i in range(2, num_size+2)]
# the vocabulary is taken from the keys currently held in `counter`, sorted
# lexicographically, instead of the fixed enumeration sketched above
tgt_vocab_list = sorted(counter)
print(len(tgt_vocab_list))
print(tgt_vocab_list)

102
['<delete>', '<keep>', '<sub_100>', '<sub_101>', '<sub_10>', '<sub_11>', '<sub_12>', '<sub_13>', '<sub_14>', '<sub_15>', '<sub_16>', '<sub_17>', '<sub_18>', '<sub_19>', '<sub_20>', '<sub_21>', '<sub_22>', '<sub_23>', '<sub_24>', '<sub_25>', '<sub_26>', '<sub_27>', '<sub_28>', '<sub_29>', '<sub_2>', '<sub_30>', '<sub_31>', '<sub_32>', '<sub_33>', '<sub_34>', '<sub_35>', '<sub_36>', '<sub_37>', '<sub_38>', '<sub_39>', '<sub_3>', '<sub_40>', '<sub_41>', '<sub_42>', '<sub_43>', '<sub_44>', '<sub_45>', '<sub_46>', '<sub_47>', '<sub_48>', '<sub_49>', '<sub_4>', '<sub_50>', '<sub_51>', '<sub_52>', '<sub_53>', '<sub_54>', '<sub_55>', '<sub_56>', '<sub_57>', '<sub_58>', '<sub_59>', '<sub_5>', '<sub_60>', '<sub_61>', '<sub_62>', '<sub_63>', '<sub_64>', '<sub_65>', '<sub_66>', '<sub_67>', '<sub_68>', '<sub_69>', '<sub_6>', '<sub_70>', '<sub_71>', '<sub_72>', '<sub_73>', '<sub_74>', '<sub_75>', '<sub_76>', '<sub_77>', '<sub_78>', '<sub_79>', '<sub_7>', '<sub_80>', '<sub_81>', '<sub_82>', '<sub

In [17]:
# target vocabulary dictionary
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}

# tags receive consecutive indices right after the reserved special symbols
tgt_first_idx = len(tgt_vocab2idx_dict)
tgt_vocab2idx_dict.update(
    (token, idx) for idx, token in enumerate(tgt_vocab_list, start=tgt_first_idx)
)

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<delete>': 3, '<keep>': 4, '<sub_100>': 5, '<sub_101>': 6, '<sub_10>': 7, '<sub_11>': 8, '<sub_12>': 9, '<sub_13>': 10, '<sub_14>': 11, '<sub_15>': 12, '<sub_16>': 13, '<sub_17>': 14, '<sub_18>': 15, '<sub_19>': 16, '<sub_20>': 17, '<sub_21>': 18, '<sub_22>': 19, '<sub_23>': 20, '<sub_24>': 21, '<sub_25>': 22, '<sub_26>': 23, '<sub_27>': 24, '<sub_28>': 25, '<sub_29>': 26, '<sub_2>': 27, '<sub_30>': 28, '<sub_31>': 29, '<sub_32>': 30, '<sub_33>': 31, '<sub_34>': 32, '<sub_35>': 33, '<sub_36>': 34, '<sub_37>': 35, '<sub_38>': 36, '<sub_39>': 37, '<sub_3>': 38, '<sub_40>': 39, '<sub_41>': 40, '<sub_42>': 41, '<sub_43>': 42, '<sub_44>': 43, '<sub_45>': 44, '<sub_46>': 45, '<sub_47>': 46, '<sub_48>': 47, '<sub_49>': 48, '<sub_4>': 49, '<sub_50>': 50, '<sub_51>': 51, '<sub_52>': 52, '<sub_53>': 53, '<sub_54>': 54, '<sub_55>': 55, '<sub_56>': 56, '<sub_57>': 57, '<sub_58>': 58, '<sub_59>': 59, '<sub_5>': 60, '<sub_60>': 61, '<sub_61>': 62, '<sub_62>': 63, '

### Val

In [18]:
# white space tokenization
# validation pairs are tokenized directly from the raw lines (no tag generation here)
val_xs, val_ys = (white_space_tokenizer(raw) for raw in (raw_val_xs, raw_val_ys))

In [19]:
# take a look at the last ten validation examples
val_views = (('src:', val_xs), ('tgt:', val_ys))
for offset in range(-10, 0):
    for label, seqs in val_views:
        print(label, seqs[offset])
    print()

src: ['18', '-', '45', '+', '44', '-', '(', '91', '-', '79', ')', '==', '5']
tgt: ['18', '-', '45', '+', '44', '-', '12', '==', '5']

src: ['-', '(', '-', '61', '+', '82', ')', '+', '77', '-', '(', '2', '*', '29', ')', '+', '(', '10', '+', '63', ')', '==', '(', '18', '+', '53', ')']
tgt: ['-', '21', '+', '77', '-', '58', '+', '73', '==', '71']

src: ['(', '19', '+', '49', ')', '-', '(', '82', '-', '56', ')', '+', '84', '-', '(', '35', '+', '14', ')', '==', '(', '50', '+', '27', ')']
tgt: ['68', '-', '26', '+', '84', '-', '49', '==', '77']

src: ['100', '+', '(', '98', '-', '56', ')', '+', '19', '-', '(', '58', '+', '25', ')', '==', '(', '68', '+', '10', ')']
tgt: ['100', '+', '42', '+', '19', '-', '83', '==', '78']

src: ['54', '+', '(', '-', '2', '+', '56', ')', '+', '(', '76', '-', '25', ')', '-', '86', '==', '(', '70', '+', '3', ')']
tgt: ['54', '+', '54', '+', '51', '-', '86', '==', '73']

src: ['53', '-', '(', '-', '12', '+', '63', ')', '-', '95', '+', '98', '==', '5']
tgt: ['53',

### Test

In [20]:
# white space tokenization
# test pairs are tokenized directly from the raw lines (no tag generation here)
test_xs, test_ys = (white_space_tokenizer(raw) for raw in (raw_test_xs, raw_test_ys))

In [21]:
# take a look at the last ten test examples
test_views = (('src:', test_xs), ('tgt:', test_ys))
for offset in range(-10, 0):
    for label, seqs in test_views:
        print(label, seqs[offset])
    print()

src: ['(', '-', '25', '+', '79', ')', '-', '(', '72', '+', '26', ')', '-', '36', '+', '83', '==', '(', '-', '20', '+', '23', ')']
tgt: ['54', '-', '98', '-', '36', '+', '83', '==', '3']

src: ['25', '+', '59', '-', '(', '60', '-', '17', ')', '+', '11', '==', '52']
tgt: ['25', '+', '59', '-', '43', '+', '11', '==', '52']

src: ['20', '-', '(', '-', '11', '+', '38', ')', '+', '(', '-', '63', '+', '75', ')', '+', '84', '==', '89']
tgt: ['20', '-', '27', '+', '12', '+', '84', '==', '89']

src: ['(', '-', '5', '+', '87', ')', '-', '(', '3', '+', '97', ')', '+', '(', '38', '+', '57', ')', '+', '(', '-', '16', '+', '24', ')', '==', '(', '71', '+', '14', ')']
tgt: ['82', '-', '100', '+', '95', '+', '8', '==', '85']

src: ['-', '(', '3', '*', '19', ')', '+', '(', '21', '+', '69', ')', '-', '(', '10', '*', '3', ')', '+', '(', '91', '-', '72', ')', '==', '(', '80', '-', '58', ')']
tgt: ['-', '57', '+', '90', '-', '30', '+', '19', '==', '22']

src: ['-', '(', '23', '+', '50', ')', '+', '33', '*', 

In [22]:
# combine data sets to a dict
# every split carries its tokenized sources ('xs') and targets ('ys'); the training
# split previously lacked 'xs', leaving the saved training data without inputs
# NOTE(review): the training tag sequences (train_ys_) are not persisted here --
# confirm whether downstream code regenerates them or expects them in data.json
train_dict = {
    'xs': train_xs,
    'ys': train_ys,
}

val_dict = {
    'xs': val_xs,
    'ys': val_ys,
}

test_dict = {
    'xs': test_xs,
    'ys': test_ys,
}

data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token -> index lookup tables for the source and target sides
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [23]:
# save output as json
# dataset splits go to data.json
data_path = os.path.join(outdir, 'data.json')
save_json(data_path, data_dict)

# vocabulary lookup tables go to vocab.json
vocab_path = os.path.join(outdir, 'vocab.json')
save_json(vocab_path, vocab_dict)

## Archive Code

In [128]:
# Archived, inactive copy of gen_tag_pair (same text as the commented-out cell in
# the Helper Functions section). Nothing is defined when this cell runs.
# def gen_tag_pair(x, y):
#     x_ = x.split()
#     y = y.split()
#     y_ = []
#     x_token = x_.pop(0)
#     for i in range(len(y)):
#         y_token = y[i]
#         if x_token == y_token:
#             y_.append('<keep>')
#             if len(x_) == 0:
#                 break
#             x_token = x_.pop(0)
#         else:
#             y_.append('<sub_{}>'.format(y_token))
#             x_token = x_.pop(0)
#             while True:
#                 y_.append('<delete>')
#                 if x_token == ')':
#                     if len(x_) != 0:
#                         x_token = x_.pop(0)
#                     break
#                 x_token = x_.pop(0)
#     return x, ' '.join(y), ' '.join(y_)

In [75]:
# Archived, inactive copy of the tag-generation call from the Train section.
# train_xs, train_ys, train_ys_ = zip(*[gen_tag_pair(x, y) for x, y in zip(raw_train_xs, raw_train_ys)])

In [171]:
# Archived, inactive helper: applies a tag sequence y_ to a source token list x and
# returns the resulting token list p.
#   '<keep>'   -> emit the current source token, then advance
#   '<delete>' -> advance without emitting
#   'insert'   -> emit the tag's payload (y_token[8:-1]) without consuming a source token
#   'sub'      -> emit the tag's payload (y_token[5:-1]), then advance
#   otherwise  -> stop (end or pad symbol)
# Processing also stops as soon as the source tokens are exhausted.
# def tagging_execution(x, y_):
#     p = []
#     x_ = x.copy()
#     x_token = x_.pop(0)
#     for y_token in y_:
#         if y_token == '<keep>':
#             # keep token
#             p.append(x_token)
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif y_token == '<delete>':
#             # delete token
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif 'insert' in y_token:
#             # insert token
#             p.append(y_token[8:-1])
#         elif 'sub' in y_token:
#             # substitute token
#             p.append(y_token[5:-1])
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         else:
#             # end symbol or pad symbol
#             break
#     # return prediction
#     return p