# Data Preprocessing

## Tagging Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [11]:
# define parameters
# These three values select the dataset variant: they are interpolated into
# the input and output directory names built in the following cells.
num_size = 100  # presumably the number of distinct operand values -- TODO confirm against the generator
seq_len = 5  # presumably the number of operands per expression -- TODO confirm
data_size = 10000  # total sample count; the split sizes printed further down (7000/1500/1500) sum to this

In [12]:
# load path
# The raw dataset is laid out as <data_src>/num_size_*/seq_len_*/data_size_*,
# one sub-directory level per generation parameter.
data_src = 'aes'
indir = os.path.join(
    data_src,
    *(template.format(value)
      for template, value in (('num_size_{}', num_size),
                              ('seq_len_{}', seq_len),
                              ('data_size_{}', data_size))))
indir

'aes/num_size_100/seq_len_5/data_size_10000'

In [13]:
# save path
# The output directory mirrors the input layout under 'tagging', so every
# parameter combination gets its own folder.
outdir = 'tagging'

outdir = os.path.join(outdir,
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True replaces the separate exists() check: there is no window
# between checking and creating, and re-running the cell is a no-op.
# If the path exists but is not a directory this now raises instead of
# silently continuing and failing later at save time.
os.makedirs(outdir, exist_ok=True)
outdir

'tagging/num_size_100/seq_len_5/data_size_10000'

In [14]:
# load raw dataset
# Each split is stored as a pair of text files, <split>_x.txt (sources) and
# <split>_y.txt (targets), inside indir. Files are read in the order
# train, val, test with x before y.
raw_train_xs, raw_train_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('train_x.txt', 'train_y.txt'))
raw_val_xs, raw_val_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('val_x.txt', 'val_y.txt'))
raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in ('test_x.txt', 'test_y.txt'))

In [15]:
# check data size
# Sample and label counts should match within every split.
for label, rows in (('train sample size', raw_train_xs),
                    ('train label size', raw_train_ys),
                    ('val sample size', raw_val_xs),
                    ('val label size', raw_val_ys),
                    ('test sample size', raw_test_xs),
                    ('test label size', raw_test_ys)):
    print(label, len(rows))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [16]:
# white space tokenization
# Targets are tokenized before sources, matching the original call order.
train_ys, train_xs = map(white_space_tokenizer, (raw_train_ys, raw_train_xs))

In [17]:
# take a look
# Print the last ten training pairs, source first, separated by a blank line.
for i in range(-10, 0):
    for tag, split in (('src:', train_xs), ('tgt:', train_ys)):
        print(tag, split[i])
    print()

src: ['-', '(', '77', '-', '71', ')', '*', '(', '-', '44', '+', '67', ')', '+', '(', '-', '6', '+', '76', ')', '+', '(', '46', '+', '46', ')', '==', '24']
tgt: ['-', '6', '*', '23', '+', '70', '+', '92', '==', '24']

src: ['-', '(', '5', '+', '89', ')', '+', '36', '+', '(', '34', '+', '19', ')', '+', '(', '89', '+', '7', ')', '==', '91']
tgt: ['-', '94', '+', '36', '+', '53', '+', '96', '==', '91']

src: ['(', '-', '4', '+', '95', ')', '+', '35', '-', '(', '23', '*', '3', ')', '-', '21', '==', '36']
tgt: ['91', '+', '35', '-', '69', '-', '21', '==', '36']

src: ['-', '(', '37', '+', '31', ')', '-', '3', '+', '(', '34', '+', '55', ')', '+', '37', '==', '55']
tgt: ['-', '68', '-', '3', '+', '89', '+', '37', '==', '55']

src: ['-', '23', '-', '(', '-', '25', '+', '48', ')', '-', '34', '+', '93', '==', '13']
tgt: ['-', '23', '-', '23', '-', '34', '+', '93', '==', '13']

src: ['-', '21', '+', '20', '+', '60', '+', '10', '==', '69']
tgt: ['-', '21', '+', '20', '+', '60', '+', '10', '==', '69

In [18]:
# source vocabulary frequency distribution
# Count every token occurrence across the tokenized training sources.
counter = Counter(token for tokens in train_xs for token in tokens)

print(len(counter))
print(counter.most_common())

107
[('+', 20542), ('-', 19642), ('(', 14028), (')', 14028), ('==', 7000), ('*', 1287), ('/', 918), ('2', 797), ('4', 738), ('3', 728), ('7', 696), ('6', 677), ('8', 650), ('10', 643), ('5', 642), ('13', 611), ('12', 602), ('16', 599), ('11', 596), ('23', 595), ('26', 590), ('9', 586), ('15', 582), ('25', 580), ('21', 580), ('22', 580), ('17', 579), ('19', 572), ('31', 565), ('20', 564), ('30', 559), ('29', 558), ('27', 556), ('24', 556), ('14', 553), ('34', 551), ('42', 545), ('36', 532), ('33', 530), ('28', 527), ('32', 526), ('41', 523), ('18', 519), ('40', 513), ('56', 510), ('46', 508), ('44', 503), ('39', 502), ('50', 499), ('52', 499), ('38', 494), ('60', 493), ('59', 488), ('58', 485), ('43', 484), ('35', 482), ('57', 480), ('49', 477), ('37', 477), ('51', 476), ('48', 476), ('63', 474), ('69', 465), ('54', 463), ('45', 461), ('65', 459), ('70', 458), ('47', 457), ('61', 456), ('64', 454), ('53', 453), ('55', 442), ('77', 440), ('66', 437), ('75', 437), ('72', 435), ('76', 428)

In [19]:
# Tokens are strings, so the sort is lexicographic ('10' comes before '2').
src_vocab_list = sorted(counter)
print(src_vocab_list)

['(', ')', '*', '+', '-', '/', '10', '100', '101', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '==']


In [20]:
# source vocabulary dictionary
# Index 0 is reserved for padding; the sorted source tokens are numbered
# consecutively after the reserved entries.
src_vocab2idx_dict = {'<pad>': 0}  # to pad sequence length
src_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(src_vocab_list,
                                start=len(src_vocab2idx_dict)))

print(src_vocab2idx_dict)

{'<pad>': 0, '(': 1, ')': 2, '*': 3, '+': 4, '-': 5, '/': 6, '10': 7, '100': 8, '101': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '2': 19, '20': 20, '21': 21, '22': 22, '23': 23, '24': 24, '25': 25, '26': 26, '27': 27, '28': 28, '29': 29, '3': 30, '30': 31, '31': 32, '32': 33, '33': 34, '34': 35, '35': 36, '36': 37, '37': 38, '38': 39, '39': 40, '4': 41, '40': 42, '41': 43, '42': 44, '43': 45, '44': 46, '45': 47, '46': 48, '47': 49, '48': 50, '49': 51, '5': 52, '50': 53, '51': 54, '52': 55, '53': 56, '54': 57, '55': 58, '56': 59, '57': 60, '58': 61, '59': 62, '6': 63, '60': 64, '61': 65, '62': 66, '63': 67, '64': 68, '65': 69, '66': 70, '67': 71, '68': 72, '69': 73, '7': 74, '70': 75, '71': 76, '72': 77, '73': 78, '74': 79, '75': 80, '76': 81, '77': 82, '78': 83, '79': 84, '8': 85, '80': 86, '81': 87, '82': 88, '83': 89, '84': 90, '85': 91, '86': 92, '87': 93, '88': 94, '89': 95, '9': 96, '90': 97, '91': 98, '92': 99, '93': 100, '94': 1

In [21]:
# Target tag vocabulary: the two fixed edit tags followed by one '<add_n>'
# tag for every n from 2 to num_size + 1 inclusive.
tgt_vocab_list = ['<delete>', '<keep>']
tgt_vocab_list.extend('<add_{}>'.format(n) for n in range(2, num_size + 2))
print(len(tgt_vocab_list))
print(tgt_vocab_list)

102
['<delete>', '<keep>', '<add_2>', '<add_3>', '<add_4>', '<add_5>', '<add_6>', '<add_7>', '<add_8>', '<add_9>', '<add_10>', '<add_11>', '<add_12>', '<add_13>', '<add_14>', '<add_15>', '<add_16>', '<add_17>', '<add_18>', '<add_19>', '<add_20>', '<add_21>', '<add_22>', '<add_23>', '<add_24>', '<add_25>', '<add_26>', '<add_27>', '<add_28>', '<add_29>', '<add_30>', '<add_31>', '<add_32>', '<add_33>', '<add_34>', '<add_35>', '<add_36>', '<add_37>', '<add_38>', '<add_39>', '<add_40>', '<add_41>', '<add_42>', '<add_43>', '<add_44>', '<add_45>', '<add_46>', '<add_47>', '<add_48>', '<add_49>', '<add_50>', '<add_51>', '<add_52>', '<add_53>', '<add_54>', '<add_55>', '<add_56>', '<add_57>', '<add_58>', '<add_59>', '<add_60>', '<add_61>', '<add_62>', '<add_63>', '<add_64>', '<add_65>', '<add_66>', '<add_67>', '<add_68>', '<add_69>', '<add_70>', '<add_71>', '<add_72>', '<add_73>', '<add_74>', '<add_75>', '<add_76>', '<add_77>', '<add_78>', '<add_79>', '<add_80>', '<add_81>', '<add_82>', '<add_83>

In [22]:
# target vocabulary dictionary
# The first three indices are reserved for special symbols; the tag
# vocabulary is numbered consecutively after them.
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}
tgt_vocab2idx_dict.update(
    (tag, idx)
    for idx, tag in enumerate(tgt_vocab_list,
                              start=len(tgt_vocab2idx_dict)))

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<delete>': 3, '<keep>': 4, '<add_2>': 5, '<add_3>': 6, '<add_4>': 7, '<add_5>': 8, '<add_6>': 9, '<add_7>': 10, '<add_8>': 11, '<add_9>': 12, '<add_10>': 13, '<add_11>': 14, '<add_12>': 15, '<add_13>': 16, '<add_14>': 17, '<add_15>': 18, '<add_16>': 19, '<add_17>': 20, '<add_18>': 21, '<add_19>': 22, '<add_20>': 23, '<add_21>': 24, '<add_22>': 25, '<add_23>': 26, '<add_24>': 27, '<add_25>': 28, '<add_26>': 29, '<add_27>': 30, '<add_28>': 31, '<add_29>': 32, '<add_30>': 33, '<add_31>': 34, '<add_32>': 35, '<add_33>': 36, '<add_34>': 37, '<add_35>': 38, '<add_36>': 39, '<add_37>': 40, '<add_38>': 41, '<add_39>': 42, '<add_40>': 43, '<add_41>': 44, '<add_42>': 45, '<add_43>': 46, '<add_44>': 47, '<add_45>': 48, '<add_46>': 49, '<add_47>': 50, '<add_48>': 51, '<add_49>': 52, '<add_50>': 53, '<add_51>': 54, '<add_52>': 55, '<add_53>': 56, '<add_54>': 57, '<add_55>': 58, '<add_56>': 59, '<add_57>': 60, '<add_58>': 61, '<add_59>': 62, '<add_60>': 63, '<add_6

### Val

In [23]:
# white space tokenization
# Sources are tokenized before targets, matching the original call order.
val_xs, val_ys = map(white_space_tokenizer, (raw_val_xs, raw_val_ys))

In [24]:
# take a look
# Print the last ten validation pairs, source first, separated by a blank line.
for i in range(-10, 0):
    for tag, split in (('src:', val_xs), ('tgt:', val_ys)):
        print(tag, split[i])
    print()

src: ['-', '(', '4', '+', '52', ')', '/', '(', '-', '26', '+', '34', ')', '+', '46', '+', '(', '-', '8', '+', '62', ')', '==', '(', '91', '+', '2', ')']
tgt: ['-', '56', '/', '8', '+', '46', '+', '54', '==', '93']

src: ['-', '(', '84', '+', '6', ')', '/', '(', '92', '-', '87', ')', '+', '88', '+', '30', '==', '100']
tgt: ['-', '90', '/', '5', '+', '88', '+', '30', '==', '100']

src: ['9', '+', '(', '-', '4', '+', '10', ')', '+', '21', '+', '2', '==', '38']
tgt: ['9', '+', '6', '+', '21', '+', '2', '==', '38']

src: ['-', '33', '+', '94', '-', '76', '+', '23', '==', '8']
tgt: ['-', '33', '+', '94', '-', '76', '+', '23', '==', '8']

src: ['51', '-', '(', '-', '50', '+', '66', ')', '*', '(', '37', '-', '34', ')', '+', '(', '78', '-', '7', ')', '==', '74']
tgt: ['51', '-', '16', '*', '3', '+', '71', '==', '74']

src: ['-', '85', '+', '71', '+', '88', '+', '25', '==', '99']
tgt: ['-', '85', '+', '71', '+', '88', '+', '25', '==', '99']

src: ['-', '79', '-', '22', '+', '(', '-', '4', '+', '

### Test

In [25]:
# white space tokenization
# Sources are tokenized before targets, matching the original call order.
test_xs, test_ys = map(white_space_tokenizer, (raw_test_xs, raw_test_ys))

In [26]:
# take a look
# Print the last ten test pairs, source first, separated by a blank line.
for i in range(-10, 0):
    for tag, split in (('src:', test_xs), ('tgt:', test_ys)):
        print(tag, split[i])
    print()

src: ['-', '18', '+', '(', '-', '41', '+', '85', ')', '+', '(', '38', '+', '3', ')', '-', '(', '19', '+', '33', ')', '==', '(', '-', '78', '+', '93', ')']
tgt: ['-', '18', '+', '44', '+', '41', '-', '52', '==', '15']

src: ['-', '95', '+', '(', '75', '+', '21', ')', '+', '(', '10', '+', '79', ')', '-', '12', '==', '78']
tgt: ['-', '95', '+', '96', '+', '89', '-', '12', '==', '78']

src: ['37', '+', '2', '+', '(', '50', '+', '33', ')', '-', '(', '44', '+', '9', ')', '==', '(', '72', '-', '3', ')']
tgt: ['37', '+', '2', '+', '83', '-', '53', '==', '69']

src: ['(', '-', '34', '+', '76', ')', '+', '(', '76', '+', '13', ')', '-', '(', '100', '-', '4', ')', '+', '36', '==', '71']
tgt: ['42', '+', '89', '-', '96', '+', '36', '==', '71']

src: ['88', '-', '2', '+', '66', '-', '81', '==', '(', '85', '-', '14', ')']
tgt: ['88', '-', '2', '+', '66', '-', '81', '==', '71']

src: ['-', '38', '-', '32', '+', '7', '*', '17', '==', '49']
tgt: ['-', '38', '-', '32', '+', '7', '*', '17', '==', '49']

s

In [27]:
# combine data sets to a dict
# Every split carries both its tokenized sources ('xs') and targets ('ys').
# The training split previously stored only 'ys', so the saved data file had
# no training inputs; 'xs' is now included like it is for val and test.
train_dict = {'xs': train_xs, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# splits keyed by name
data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token-to-index mappings for the source and target sides
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [28]:
# save output as json
# The dataset and the vocabularies go to separate files inside outdir.
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

for path, payload in ((data_path, data_dict), (vocab_path, vocab_dict)):
    save_json(path, payload)

## Archive Code

In [74]:
# def gen_tag_pair(x, y):
#     x_ = x.split()
#     y = y.split()
#     y_ = []
#     x_token = x_.pop(0)
#     for i in range(len(y)):
#         y_token = y[i]
#         if x_token == y_token:
#             y_.append('<keep>')
#             if len(x_) == 0:
#                 break
#             x_token = x_.pop(0)
#         else:
#             y_.append('<add_{}>'.format(y_token))
#             while True:
#                 y_.append('<delete>')
#                 if x_token == ')':
#                     if len(x_) != 0:
#                         x_token = x_.pop(0)
#                     break
#                 x_token = x_.pop(0)
#     return x, ' '.join(y), ' '.join(y_)

In [75]:
# train_xs, train_ys, train_ys_ = zip(*[gen_tag_pair(x, y) for x, y in zip(raw_train_xs, raw_train_ys)])