# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
# These three values are embedded in the directory names built for both the
# input (indir) and output (outdir) paths.
num_size, seq_len, data_size = 10, 5, 10000

In [4]:
# load path
data_src = 'aec'
# One sub-directory level per configuration parameter.
indir_parts = [
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
]
indir = os.path.join(data_src, *indir_parts)
indir

'aec/num_size_10/seq_len_5/data_size_10000'

In [5]:
# save path
# Mirrors the input layout under the 'tag' root: one sub-directory level per
# configuration parameter.
outdir = os.path.join('tag',
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)
outdir

'tag/num_size_10/seq_len_5/data_size_10000'

In [6]:
# load raw dataset
# Files are read in the same order as the names on the left-hand side:
# train, val, test -- each as a (source, target) pair of text files.
(raw_train_xs, raw_train_ys,
 raw_val_xs, raw_val_ys,
 raw_test_xs, raw_test_ys) = [
    load_txt(os.path.join(indir, file_name))
    for file_name in ('train_x.txt', 'train_y.txt',
                      'val_x.txt', 'val_y.txt',
                      'test_x.txt', 'test_y.txt')
]

In [7]:
# check data size
# Sample and label counts of each split are printed next to each other so a
# mismatch is easy to spot.
size_report = (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
)
for caption, split in size_report:
    print(caption, len(split))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Helper Functions

In [27]:
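# NOTE: reference copy only -- every line in this cell is commented out, so
# running it defines nothing. The gen_tag_pair called in the Train section is
# presumably supplied by `from utils import *` (TODO confirm).
# def levenshtein_editops_list(source, target):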
#     unique_elements = sorted(set(source + target)) 
#     char_list = [chr(i) for i in range(len(unique_elements))]
#     if len(unique_elements) > len(char_list):
#         raise Exception("too many elements")
#     else:
#         unique_element_map = {ele:char_list[i]  for i, ele in enumerate(unique_elements)}
#     source_str = ''.join([unique_element_map[ele] for ele in source])
#     target_str = ''.join([unique_element_map[ele] for ele in target])
#     transform_list = Levenshtein.editops(source_str, target_str)
#     return transform_list

# def gen_tag_pair(x, y):
#     x = x.split()
#     y = y.split()
#     editops = levenshtein_editops_list(x, y)
#     y_ = ['<keep>'] * len(x)
#     c = 0
#     for tag, i, j in editops:
#         i += c
#         if tag == 'replace': 
#             if y_[i] != '<keep>':
#                 y_.insert(i+1, '<sub_{}>'.format(y[j]))
#                 c += 1
#             else:
#                 y_[i] = '<sub_{}>'.format(y[j])
#         elif tag == 'delete':
#             y_[i] = '<delete>'
#         elif tag == 'insert': 
#             y_.insert(i, '<insert_{}>'.format(y[j]))
#             c += 1
#     return x, y, y_

# def tag_execute(x, y_):
#     p = []
#     x_ = x.copy()
#     x_token = x_.pop(0)
#     for y_token in y_:
#         if y_token == '<keep>':
#             # keep token
#             p.append(x_token)
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif y_token == '<delete>':
#             # delete token
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif 'insert' in y_token:
#             # insert token
#             p.append(y_token[8:-1])
#         elif 'sub' in y_token:
#             # substitute token
#             p.append(y_token[5:-1])
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         else:
#             # end symbol or pad symbol
#             break
#     # return prediction
#     return p

### Train

In [9]:
# Build one (source, target, tag-sequence) triple per raw training pair, then
# transpose the triples into three parallel tuples.
train_triples = [
    gen_tag_pair(raw_x, raw_y)
    for raw_x, raw_y in zip(raw_train_xs, raw_train_ys)
]
train_xs, train_ys, train_ys_ = zip(*train_triples)

In [10]:
# take a look
# Negative indices -10 .. -1 walk the last ten training examples in order.
for idx in range(-10, 0):
    src_tokens, tgt_tokens, tag_tokens = train_xs[idx], train_ys[idx], train_ys_[idx]
    print('src:', src_tokens)
    print('tgt:', tgt_tokens)
    print('pred:', tag_tokens)
    print()

src: ['5', '*', '7', '+', '5', '*', '4', '==', '8']
tgt: ['5', '-', '7', '+', '5', '*', '2', '==', '8']
pred: ['<keep>', '<sub_->', '<keep>', '<keep>', '<keep>', '<keep>', '<sub_2>', '<keep>', '<keep>']

src: ['+', '+', '5', '+', '11', '+', '+', '2', '8']
tgt: ['-', '10', '+', '5', '+', '11', '+', '2', '==', '8']
pred: ['<insert_->', '<sub_10>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<sub_2>', '<sub_==>', '<keep>']

src: ['-', '10', '+', '6', '*', '4', '-', '3', '==', '11']
tgt: ['-', '10', '+', '6', '*', '4', '-', '3', '==', '11']
pred: ['<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>']

src: ['8', '+', '11', '-', '2', '-', '8', '==', '9']
tgt: ['8', '+', '11', '-', '2', '-', '8', '==', '9']
pred: ['<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>', '<keep>']

src: ['-', '+', '9', '+', '4', '==', '9']
tgt: ['-', '2', '*', '2', '+', '9', '+', '4', '==', '9']
pred: ['<keep>', '<insert_2>', '<insert

In [11]:
# source vocabulary frequency distribution
# NOTE: the name `counter` is rebound later for the target vocabulary, so the
# cells that read it must be run in notebook order.
counter = Counter(token for tokens in train_xs for token in tokens)

print(len(counter))
print(counter.most_common())

15
[('+', 8232), ('-', 8171), ('==', 5913), ('2', 4877), ('3', 4331), ('4', 4105), ('6', 3925), ('5', 3813), ('8', 3616), ('10', 3265), ('7', 3263), ('9', 3199), ('*', 3186), ('/', 2878), ('11', 2852)]


In [12]:
# Iterating a Counter yields its keys, so this is the sorted list of tokens.
src_vocab_list = sorted(counter)
print(src_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [13]:
# source vocabulary dictionary
src_vocab2idx_dict = {'<pad>': 0}  # to pad sequence length

# Real tokens are numbered consecutively right after the reserved entries.
src_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(src_vocab_list, start=len(src_vocab2idx_dict))
)

print(src_vocab2idx_dict)

{'<pad>': 0, '*': 1, '+': 2, '-': 3, '/': 4, '10': 5, '11': 6, '2': 7, '3': 8, '4': 9, '5': 10, '6': 11, '7': 12, '8': 13, '9': 14, '==': 15}


In [14]:
# target vocabulary frequency distribution
# Rebinds `counter` (previously holding the source-token counts) to the counts
# of the edit tags found in the training tag sequences.
counter = Counter(tag for tags in train_ys_ for tag in tags)

print(len(counter))
print(counter.most_common())

32
[('<keep>', 56045), ('<delete>', 3702), ('<sub_+>', 936), ('<sub_->', 882), ('<sub_==>', 628), ('<insert_+>', 530), ('<insert_->', 512), ('<insert_==>', 459), ('<sub_2>', 408), ('<sub_3>', 347), ('<sub_4>', 344), ('<sub_*>', 316), ('<sub_/>', 283), ('<sub_6>', 282), ('<sub_5>', 262), ('<sub_8>', 262), ('<sub_9>', 258), ('<sub_10>', 249), ('<sub_7>', 231), ('<insert_2>', 225), ('<insert_3>', 208), ('<insert_5>', 204), ('<insert_/>', 200), ('<insert_8>', 194), ('<sub_11>', 191), ('<insert_*>', 189), ('<insert_4>', 189), ('<insert_6>', 183), ('<insert_9>', 157), ('<insert_7>', 153), ('<insert_10>', 138), ('<insert_11>', 105)]


In [16]:
# Iterating a Counter yields its keys, so this is the sorted list of tags.
tgt_vocab_list = sorted(counter)
print(len(tgt_vocab_list))
print(tgt_vocab_list)

32
['<delete>', '<insert_*>', '<insert_+>', '<insert_->', '<insert_/>', '<insert_10>', '<insert_11>', '<insert_2>', '<insert_3>', '<insert_4>', '<insert_5>', '<insert_6>', '<insert_7>', '<insert_8>', '<insert_9>', '<insert_==>', '<keep>', '<sub_*>', '<sub_+>', '<sub_->', '<sub_/>', '<sub_10>', '<sub_11>', '<sub_2>', '<sub_3>', '<sub_4>', '<sub_5>', '<sub_6>', '<sub_7>', '<sub_8>', '<sub_9>', '<sub_==>']


In [17]:
# target vocabulary dictionary
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}

# Edit tags are numbered consecutively right after the reserved entries.
tgt_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(tgt_vocab_list, start=len(tgt_vocab2idx_dict))
)

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<delete>': 3, '<insert_*>': 4, '<insert_+>': 5, '<insert_->': 6, '<insert_/>': 7, '<insert_10>': 8, '<insert_11>': 9, '<insert_2>': 10, '<insert_3>': 11, '<insert_4>': 12, '<insert_5>': 13, '<insert_6>': 14, '<insert_7>': 15, '<insert_8>': 16, '<insert_9>': 17, '<insert_==>': 18, '<keep>': 19, '<sub_*>': 20, '<sub_+>': 21, '<sub_->': 22, '<sub_/>': 23, '<sub_10>': 24, '<sub_11>': 25, '<sub_2>': 26, '<sub_3>': 27, '<sub_4>': 28, '<sub_5>': 29, '<sub_6>': 30, '<sub_7>': 31, '<sub_8>': 32, '<sub_9>': 33, '<sub_==>': 34}


### Val

In [18]:
# white space tokenization
# Sources first, then targets -- same call order as two separate statements.
val_xs, val_ys = [white_space_tokenizer(raw) for raw in (raw_val_xs, raw_val_ys)]

In [19]:
# take a look
# Negative indices -10 .. -1 walk the last ten validation examples in order.
for idx in range(-10, 0):
    src_tokens, tgt_tokens = val_xs[idx], val_ys[idx]
    print('src:', src_tokens)
    print('tgt:', tgt_tokens)
    print()

src: ['5', '/', '9', '3', '2', '/', '3']
tgt: ['5', '/', '9', '*', '9', '-', '2', '==', '3']

src: ['-', '5', '-', '4', '+', '2', '*', '10', '==', '6']
tgt: ['-', '7', '-', '7', '+', '2', '*', '10', '==', '6']

src: ['-', '+', '-', '9', '+', '6', '6', '9', '==', '2']
tgt: ['10', '-', '9', '+', '6', '/', '6', '==', '2']

src: ['8', '-', '7', '-', '6', '9', '+', '10', '==', '2']
tgt: ['8', '-', '7', '-', '9', '+', '10', '==', '2']

src: ['-', '4', '-', '*', '2', '9', '+', '8', '==', '11']
tgt: ['-', '4', '-', '2', '+', '9', '+', '8', '==', '11']

src: ['-', '8', '*', '+', '2', '+', '7', '==', '4']
tgt: ['-', '8', '+', '3', '+', '2', '+', '7', '==', '4']

src: ['8', '4', '*', '2', '-', '10', '-', '==', '11']
tgt: ['4', '*', '6', '-', '2', '-', '11', '==', '11']

src: ['3', '+', '-', '2', '-', '9', '==', '5']
tgt: ['3', '+', '7', '-', '2', '-', '3', '==', '5']

src: ['10', '8', '2', '*', '7', '-', '6', '==', '10']
tgt: ['10', '-', '2', '+', '7', '-', '5', '==', '10']

src: ['*', '9', '8', 

### Test

In [20]:
# white space tokenization
# Sources first, then targets -- same call order as two separate statements.
test_xs, test_ys = [white_space_tokenizer(raw) for raw in (raw_test_xs, raw_test_ys)]

In [21]:
# take a look
# Negative indices -10 .. -1 walk the last ten test examples in order.
for idx in range(-10, 0):
    src_tokens, tgt_tokens = test_xs[idx], test_ys[idx]
    print('src:', src_tokens)
    print('tgt:', tgt_tokens)
    print()

src: ['8', '-', '11', '*', '==', '2']
tgt: ['8', '-', '6', '/', '4', '*', '4', '==', '2']

src: ['4', '-', '7', '+', '8', '8', '==', '10']
tgt: ['4', '+', '5', '-', '7', '+', '8', '==', '10']

src: ['-', '-', '8', '3', '+', '6', '4', '10', '==', '8']
tgt: ['-', '5', '-', '3', '+', '6', '+', '10', '==', '8']

src: ['8', '8', '/', '4', '-', '4', '7', '7']
tgt: ['8', '*', '8', '/', '4', '-', '9', '==', '7']

src: ['7', '/', '2', '*', '8', '2', '/', '9', '4']
tgt: ['9', '/', '2', '*', '8', '/', '9', '==', '4']

src: ['3', '*', '5', '10', '-', '9', '==', '7']
tgt: ['3', '*', '2', '+', '10', '-', '9', '==', '7']

src: ['2', '*', '4', '+', '9', '6', '-', '7', '7', '==', '3']
tgt: ['2', '*', '2', '+', '6', '-', '7', '==', '3']

src: ['-', '+', '+', '-', '3', '-', '3', '==', '7']
tgt: ['11', '+', '2', '-', '3', '-', '3', '==', '7']

src: ['4', '-', '3', '+', '+', '2', '9', '2', '+', '2', '3']
tgt: ['-', '3', '+', '2', '+', '2', '+', '2', '==', '3']

src: ['2', '/', '5', '*', '*', '5', '+', '2',

In [22]:
# combine data sets to a dict
# Every split stores its tokenized sources under 'xs' and targets under 'ys'.
# The train split previously stored only 'ys', so the training sources were
# never written to data.json; 'xs' is now included like val and test.
# NOTE(review): the tag sequences in train_ys_ are used only to build the
# target vocabulary and are not saved here -- confirm the training code
# regenerates them.
train_dict = {
    'xs': train_xs,
    'ys': train_ys,
}

val_dict = {
    'xs': val_xs,
    'ys': val_ys,
}

test_dict = {
    'xs': test_xs,
    'ys': test_ys,
}

data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token -> index lookup tables for the source tokens and the target edit tags
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [23]:
# save output as json
# The split data and the vocabularies go to two separate files in outdir.
data_path, vocab_path = [
    os.path.join(outdir, file_name)
    for file_name in ('data.json', 'vocab.json')
]

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

## Archive Code

In [9]:
# def levenshtein_editops_list(source, target):
#     unique_elements = sorted(set(source + target)) 
#     char_list = [chr(i) for i in range(len(unique_elements))]
#     if len(unique_elements) > len(char_list):
#         raise Exception("too many elements")
#     else:
#         unique_element_map = {ele:char_list[i]  for i, ele in enumerate(unique_elements)}
#     source_str = ''.join([unique_element_map[ele] for ele in source])
#     target_str = ''.join([unique_element_map[ele] for ele in target])
#     transform_list = Levenshtein.editops(source_str, target_str)
#     return transform_list

# def gen_tag_pair(x, y):
#     x = x.split()
#     y = y.split()
#     editops = levenshtein_editops_list(x, y)
#     y_ = ['<keep>'] * len(x)
#     c = 0
#     for tag, i, j in editops:
#         i += c
#         if tag == 'replace': 
#             if y_[i] != '<keep>':
#                 y_.insert(i+1, '<sub_{}>'.format(y[j]))
#                 c += 1
#             else:
#                 y_[i] = '<sub_{}>'.format(y[j])
#         elif tag == 'delete':
#             y_[i] = '<delete>'
#         elif tag == 'insert': 
#             y_.insert(i, '<insert_{}>'.format(y[j]))
#             c += 1
#     return x, y, y_

# def tag_execute(x, y_):
#     p = []
#     x_ = x.copy()
#     x_token = x_.pop(0)
#     for y_token in y_:
#         if y_token == '<keep>':
#             # keep token
#             p.append(x_token)
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif y_token == '<delete>':
#             # delete token
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         elif 'insert' in y_token:
#             # insert token
#             p.append(y_token[8:-1])
#         elif 'sub' in y_token:
#             # substitute token
#             p.append(y_token[5:-1])
#             if len(x_) == 0:
#                 break
#             else:
#                 x_token = x_.pop(0)
#         else:
#             # end symbol or pad symbol
#             break
#     # return prediction
#     return p