# Data Preprocessing

In [39]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [40]:
# dependency
# public
import os
import numpy as np
import Levenshtein
from collections import Counter
# private
from utils import *

In [41]:
# define parameters
method = 'rec'
num_size = 10
seq_len = 5
data_size = 10000

In [42]:
# load path
indir = 'aec'
indir = os.path.join(indir, 
                     'num_size_{}'.format(num_size), 
                     'seq_len_{}'.format(seq_len), 
                     'data_size_{}'.format(data_size))
indir

'aec/num_size_10/seq_len_5/data_size_10000'

In [43]:
# save path
outdir = os.path.join(method, 
                      'num_size_{}'.format(num_size), 
                      'seq_len_{}'.format(seq_len), 
                      'data_size_{}'.format(data_size))
if not os.path.exists(outdir): 
    os.makedirs(outdir)
outdir

'rec/num_size_10/seq_len_5/data_size_10000'

In [44]:
# load raw dataset
raw_train_xs = load_txt(os.path.join(indir, 'train_x.txt'))
raw_train_ys = load_txt(os.path.join(indir, 'train_y.txt'))
raw_val_xs = load_txt(os.path.join(indir, 'val_x.txt'))
raw_val_ys = load_txt(os.path.join(indir, 'val_y.txt'))
raw_test_xs = load_txt(os.path.join(indir, 'test_x.txt'))
raw_test_ys = load_txt(os.path.join(indir, 'test_y.txt'))

In [45]:
# check data size
print('train sample size', len(raw_train_xs))
print('train label size', len(raw_train_ys))
print('val sample size', len(raw_val_xs))
print('val label size', len(raw_val_ys))
print('test sample size', len(raw_test_xs))
print('test label size', len(raw_test_ys))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


In [46]:
(np.array(raw_train_xs) == np.array(raw_train_ys)).sum()/len(raw_train_xs)

0.26557142857142857

### Helper Functions

In [47]:
# def levenshtein_editops_list(source, target):
#     unique_elements = sorted(set(source + target)) 
#     char_list = [chr(i) for i in range(len(unique_elements))]
#     if len(unique_elements) > len(char_list):
#         raise Exception("too many elements")
#     else:
#         unique_element_map = {ele:char_list[i]  for i, ele in enumerate(unique_elements)}
#     source_str = ''.join([unique_element_map[ele] for ele in source])
#     target_str = ''.join([unique_element_map[ele] for ele in target])
#     transform_list = Levenshtein.editops(source_str, target_str)
#     return transform_list

# def gen_rec_pair(x: list, y: list) -> list:
#     # white space tokenization
#     x = x.split()
#     y = y.split()
#     xs = [x.copy()]
#     ys_ = []
#     editops = levenshtein_editops_list(x, y)
#     if len(editops) == 0: 
#         y_ = ['<done>']*3 
#     else:
#         c = 0 
#         for tag, i, j in editops: 
#             i += c
#             if tag == 'replace':
#                 y_ = ['<sub>', '<pos_{}>'.format(i), y[j]]
#                 x[i] = y[j]
#             elif tag == 'delete': 
#                 y_ = ['<delete>', '<pos_{}>'.format(i), '<done>']
#                 del x[i]
#                 c -= 1
#             elif tag == 'insert': 
#                 y_ = ['<insert>', '<pos_{}>'.format(i), y[j]]
#                 x.insert(i, y[j]) 
#                 c += 1
#             xs.append(x.copy()) 
#             ys_.append(y_)
#         # ys_.append(['<done>']*3)
#         index = np.random.choice(range(len(xs)-1))
#         x = xs[index]
#         y_ = ys_[index]
#     return x, y_, y

### Train

In [48]:
train_xs, train_ys_, train_ys = zip(*[gen_rec_pair(x, y) for x, y in zip(raw_train_xs, raw_train_ys)])

In [49]:
# take a look
for i in range(10, 20):
    print('src:', train_xs[i])
    print('tgt:', train_ys[i])
    print('pred:', train_ys_[i])
    print()

src: ['10', '-', '5', '+', '5', '-', '6', '5', '4']
tgt: ['10', '-', '5', '+', '5', '-', '6', '==', '4']
pred: ['<sub>', '<pos_7>', '==']

src: ['2', '*', '3', '*', '-', '/', '2', '6']
tgt: ['2', '*', '3', '*', '2', '/', '2', '==', '6']
pred: ['<sub>', '<pos_4>', '2']

src: ['6', '*', '3', '/', '6', '+', '7', '==', '10']
tgt: ['6', '*', '3', '/', '6', '+', '7', '==', '10']
pred: ['<done>', '<done>', '<done>']

src: ['6', '/', '2', '*', '3', '-', '4', '==', '5']
tgt: ['6', '/', '2', '*', '3', '-', '4', '==', '5']
pred: ['<done>', '<done>', '<done>']

src: ['7', '-', '3', '*', '3', '+', '4', '==', '2']
tgt: ['7', '-', '3', '*', '3', '+', '4', '==', '2']
pred: ['<done>', '<done>', '<done>']

src: ['3', '/', '3', '/', '7', '-', '5', '==', '3']
tgt: ['3', '/', '3', '+', '7', '-', '5', '==', '3']
pred: ['<sub>', '<pos_3>', '+']

src: ['3', '+', '11', '-', '10', '*', '4', '==', '2']
tgt: ['3', '+', '11', '-', '3', '*', '4', '==', '2']
pred: ['<sub>', '<pos_4>', '3']

src: ['7', '*', '8', '/',

In [50]:
sum([y_ == ['<done>', '<done>', '<done>'] for y_ in train_ys_])/len(train_ys_)

0.26557142857142857

In [51]:
# source vocabulary frequency distribution
counter = Counter()
for x in train_xs:
    counter.update(x)

print(len(counter))
print(counter.most_common())

15
[('+', 8506), ('-', 8414), ('==', 6165), ('2', 5029), ('3', 4232), ('4', 4115), ('6', 3714), ('5', 3676), ('8', 3375), ('*', 3207), ('9', 3207), ('10', 3136), ('7', 3135), ('/', 2929), ('11', 2671)]


In [52]:
src_vocab_list = sorted(counter.keys())
print(src_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [53]:
# soruce vocabulary dictionary
src_vocab2idx_dict = dict()
src_vocab2idx_dict['<pad>'] = 0 # to pad sequence length

i = len(src_vocab2idx_dict)
for token in src_vocab_list:
    src_vocab2idx_dict[token] = i
    i += 1

print(src_vocab2idx_dict)

{'<pad>': 0, '*': 1, '+': 2, '-': 3, '/': 4, '10': 5, '11': 6, '2': 7, '3': 8, '4': 9, '5': 10, '6': 11, '7': 12, '8': 13, '9': 14, '==': 15}


In [54]:
# target vocabulary frequency distribution
counter = Counter()
for y_ in train_ys_:
    counter.update(y_)

print(len(counter))
print(counter.most_common())

29
[('<done>', 5577), ('<sub>', 2019), ('<delete>', 1580), ('<insert>', 1542), ('<pos_3>', 876), ('<pos_4>', 847), ('<pos_0>', 836), ('<pos_5>', 792), ('<pos_2>', 780), ('<pos_6>', 775), ('<pos_7>', 772), ('<pos_1>', 749), ('+', 527), ('-', 509), ('==', 451), ('<pos_8>', 293), ('2', 224), ('*', 206), ('3', 205), ('5', 194), ('/', 192), ('4', 187), ('6', 185), ('8', 167), ('10', 163), ('7', 121), ('9', 119), ('11', 111), ('<pos_9>', 1)]


In [109]:
tgt_vocab_list = list(counter.keys())
for i in range(seq_len*2): 
    if '<pos_{}>'.format(i) not in tgt_vocab_list:
        tgt_vocab_list.append('<pos_{}>'.format(i))
tgt_vocab_list.sort()
print(tgt_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '<done>', '<insert>', '<pos_0>', '<pos_1>', '<pos_2>', '<pos_3>', '<pos_4>', '<pos_5>', '<pos_6>', '<pos_7>', '<pos_8>', '<pos_9>', '==']


In [110]:
# target vocabulary dictionary
tgt_vocab2idx_dict = dict()
tgt_vocab2idx_dict['<pad>'] = 0 # to pad sequence length
tgt_vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence

i = len(tgt_vocab2idx_dict)
for token in tgt_vocab_list:
    tgt_vocab2idx_dict[token] = i
    i += 1

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '*': 2, '+': 3, '-': 4, '/': 5, '10': 6, '11': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, '<done>': 16, '<insert>': 17, '<pos_0>': 18, '<pos_1>': 19, '<pos_2>': 20, '<pos_3>': 21, '<pos_4>': 22, '<pos_5>': 23, '<pos_6>': 24, '<pos_7>': 25, '<pos_8>': 26, '<pos_9>': 27, '==': 28}


### Val

In [111]:
# white space tokenization
val_xs = white_space_tokenizer(raw_val_xs)
val_ys = white_space_tokenizer(raw_val_ys)

In [112]:
# take a look
for i in range(-10, 0, 1):
    print('src:', val_xs[i])
    print('tgt:', val_ys[i])
    print()

src: ['-', '4', '*', '7', '/', '7', '+', '10', '==', '6']
tgt: ['-', '4', '*', '7', '/', '7', '+', '10', '==', '6']

src: ['6', '/', '7', '+', '8', '/', '7', '==', '2']
tgt: ['6', '/', '7', '+', '8', '/', '7', '==', '2']

src: ['3', '+', '4', '/', '4', '+', '3', '7']
tgt: ['3', '+', '4', '/', '4', '+', '3', '==', '7']

src: ['2', '/', '4', '+', '10', '/', '4', '==', '3']
tgt: ['2', '/', '4', '+', '10', '/', '4', '==', '3']

src: ['4', '+', '5', '4', '/', '2', '==', '11']
tgt: ['4', '+', '5', '+', '4', '/', '2', '==', '11']

src: ['3', '*', '4', '-', '6', '/', '2', '==', '9']
tgt: ['3', '*', '4', '-', '6', '/', '2', '==', '9']

src: ['-', '5', '/', '10', '*', '2', '+', '==', '10']
tgt: ['-', '5', '/', '10', '*', '2', '+', '11', '==', '10']

src: ['-', '6', '+', '9', '+', '4', '2', '==', '5']
tgt: ['-', '6', '+', '9', '+', '4', '/', '2', '==', '5']

src: ['4', '+', '6', '*', '8', '/', '8', '==', '10']
tgt: ['4', '+', '6', '*', '8', '/', '8', '==', '10']

src: ['-', '10', '5', '+', '3', '

## Test

In [113]:
# white space tokenization
test_xs = white_space_tokenizer(raw_test_xs)
test_ys = white_space_tokenizer(raw_test_ys)

In [114]:
# take a look
for i in range(-10, 0, 1):
    print('src:', test_xs[i])
    print('tgt:', test_ys[i])
    print()

src: ['10', '/', '3', '*', '6', '/', '10', '2']
tgt: ['10', '/', '3', '*', '6', '/', '10', '==', '2']

src: ['10', '+', '2', '-', '9', '/', '3', '==', '9']
tgt: ['10', '+', '2', '-', '9', '/', '3', '==', '9']

src: ['4', '-', '4', '-', '5', '+', '11', '==', '6']
tgt: ['4', '-', '4', '-', '5', '+', '11', '==', '6']

src: ['-', '2', '*', '5', '/', '5', '+', '7', '==', '5']
tgt: ['-', '2', '*', '5', '/', '5', '+', '7', '==', '5']

src: ['9', '-', '9', '+', '3', '+', '3', '==', '6']
tgt: ['9', '-', '9', '+', '3', '+', '3', '==', '6']

src: ['8', '+', '3', '+', '10', '-', '11', '==', '10']
tgt: ['8', '+', '3', '+', '10', '-', '11', '==', '10']

src: ['-', '2', '+', '+', '9', '-', '4', '==', '9']
tgt: ['-', '2', '+', '6', '+', '9', '-', '4', '==', '9']

src: ['-', '5', '*', '2', '/', '2', '+', '11', '==', '6']
tgt: ['-', '5', '*', '2', '/', '2', '+', '11', '==', '6']

src: ['-', '2', '+', '10', '-', '7', '+', '10', '==', '11']
tgt: ['-', '2', '+', '10', '-', '7', '+', '10', '==', '11']

src:

In [115]:
# combine data sets to a dict
train_dict = {}
train_dict['ys'] = train_ys

val_dict = {}
val_dict['xs'] = val_xs
val_dict['ys'] = val_ys

test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

data_dict = dict()
data_dict['train'] = train_dict
data_dict['val'] = val_dict
data_dict['test'] = test_dict

vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

In [116]:
# save output as json
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

# Archive Code

In [87]:
raw_train_xs = load_txt(os.path.join(indir, 'train_x.txt')) 
raw_train_ys = load_txt(os.path.join(indir, 'train_y.txt'))
x = raw_train_xs[4]
y = raw_train_ys[4]
x = x.split()
y = y.split()
print(x)
print(y)
print()

['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']
['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']



In [88]:
import Levenshtein 

def levenshtein_editops_list(source, target):
    unique_elements = sorted(set(source + target)) 
    char_list = [chr(i) for i in range(len(unique_elements))]
    if len(unique_elements) > len(char_list):
        raise Exception("too many elements")
    else:
        unique_element_map = {ele:char_list[i]  for i, ele in enumerate(unique_elements)}
    source_str = ''.join([unique_element_map[ele] for ele in source])
    target_str = ''.join([unique_element_map[ele] for ele in target])
    transform_list = Levenshtein.editops(source_str, target_str)
    return transform_list

for i in range(7000):

    x = raw_train_xs[i]
    y = raw_train_ys[i]
    x = x.split()
    y = y.split()
#     print('src', x)
#     print('tgt', y)

    editops = levenshtein_editops_list(x, y)
    c = 0
    for tag, i, j in editops: 
        i += c
#         print(tag, i, j)
        if tag == 'replace':
            x[i] = y[j]
        elif tag == 'delete':
            del x[i]
            c -= 1
        elif tag == 'insert':
            x.insert(i, y[j]) 
            c += 1
    if x != y: 
        print(i)
        print('src', x)
        print('tgt', y)
        break

In [89]:
x = raw_train_xs[4]
y = raw_train_ys[4]
x = x.split()
y = y.split()
print(x)
print(y)
print()

['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']
['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']



In [90]:
# for online end2end 
xs = [x.copy()] 
editops = levenshtein_editops_list(x, y) 
if len(editops) == 0: 
    pass
else:
    c = 0 
    for tag, i, j in editops: 
        i += c
        if tag == 'replace':
            x[i] = y[j]
        elif tag == 'delete':
            del x[i]
            c -= 1
        elif tag == 'insert':
            x.insert(i, y[j]) 
            c += 1
        xs.append(x.copy())
    index = np.random.choice(range(len(xs)-1))
    x = xs[index]
print(x)
for x in xs:
    print(x)

['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']
['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']


In [91]:
# for offline recurrent inference
editops = levenshtein_editops_list(x, y)
tag, i, j = editops[0]
if tag == 'replace':
    y_ = ['<sub>', '<pos_{}>'.format(i), y[j]]
elif tag == 'delete': 
    y_ = ['<delete>', '<pos_{}>'.format(i), '<done>'] 
elif tag == 'insert': 
    y_ = ['<insert>', '<pos_{}>'.format(i), y[j]] 
print(y)
print(x)
print(y_)

IndexError: list index out of range

In [92]:
# for online recurrent inference
xs = [x.copy()]
ys_ = []
editops = levenshtein_editops_list(x, y)
c = 0 
for tag, i, j in editops: 
    i += c
    if tag == 'replace':
        y_ = ['<sub>', '<pos_{}>'.format(i), y[j]]
        x[i] = y[j]
    elif tag == 'delete': 
        y_ = ['<delete>', '<pos_{}>'.format(i), '<done>']
        del x[i]
        c -= 1
    elif tag == 'insert': 
        y_ = ['<insert>', '<pos_{}>'.format(i), y[j]]
        x.insert(i, y[j]) 
        c += 1

    xs.append(x.copy()) 
    ys_.append(y_)

ys_.append(['<done>']*3)

for x, y_ in zip(xs, ys_):
    print(x)
    print(y_)

['-', '3', '+', '11', '-', '8', '+', '9', '==', '9']
['<done>', '<done>', '<done>']
