# Data Preprocessing

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# authorship metadata for this notebook
__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
method = 'rec'      # root directory name of the output path
num_size = 10       # fills the 'num_size_{}' path component
seq_len = 5         # fills the 'seq_len_{}' path component
data_size = 10000   # fills the 'data_size_{}' path component

# Seed the global NumPy RNG so a fresh "Restart & Run All" regenerates the
# same randomly built training pairs.
# NOTE(review): assumes utils.gen_rec_pair draws from np.random (as the
# commented-out reference copy in this notebook does) -- confirm.
random_seed = 0
np.random.seed(random_seed)

In [4]:
# load path: <data_src>/num_size_<n>/seq_len_<n>/data_size_<n>
data_src = 'aor'
indir_parts = (
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir = os.path.join(data_src, *indir_parts)
indir

'aor/num_size_10/seq_len_5/data_size_10000'

In [5]:
# save path: <method>/num_size_<n>/seq_len_<n>/data_size_<n>
outdir = os.path.join(method, 
                      'num_size_{}'.format(num_size), 
                      'seq_len_{}'.format(seq_len), 
                      'data_size_{}'.format(data_size))
# exist_ok=True makes this cell idempotent on re-run and removes the
# check-then-create race of a separate os.path.exists() test
os.makedirs(outdir, exist_ok=True)
outdir

'rec/num_size_10/seq_len_5/data_size_10000'

In [6]:
# load raw dataset: one sample (x) file and one label (y) file per split,
# read in the order train, val, test
raw_file_names = ('train_x.txt', 'train_y.txt',
                  'val_x.txt', 'val_y.txt',
                  'test_x.txt', 'test_y.txt')
(raw_train_xs, raw_train_ys,
 raw_val_xs, raw_val_ys,
 raw_test_xs, raw_test_ys) = [load_txt(os.path.join(indir, file_name))
                              for file_name in raw_file_names]

In [7]:
# check data size: samples and labels of each split should line up
size_report = (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
)
for caption, records in size_report:
    print(caption, len(records))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [8]:
# def gen_rec_pair(y: list) -> list:
#     # white space tokenization
#     y = y.split()
#     # make a copy
#     x = y.copy()
#     # get operator indexes
#     operator_idxes = [i for i, token in enumerate(y) if not token.isdigit()][::-1]
#     # decide how many operators to remove
#     num_idxes = np.random.choice(range(len(operator_idxes)+1))
#     if num_idxes == 0:
#         return x, ['<done>', '<done>'], y
#     else:
#         # decide operators to remove
#         idxes_to_remove = operator_idxes[:num_idxes]
#         # generate label
#         y_ = ['pos_{}'.format(idxes_to_remove[-1]), x[idxes_to_remove[-1]]]
#         # generate sample
#         x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
        
#         return x, y_, y

In [9]:
# build one training triple per raw label sequence, then transpose the
# list of triples into three parallel tuples
# (unpack order: source tokens, prediction label, full target tokens)
train_xs, train_ys_, train_ys = zip(*map(gen_rec_pair, raw_train_ys))

In [10]:
# take a look at the last (up to) 10 training samples;
# slicing instead of indexing with range(-10, 0) avoids an IndexError
# when fewer than 10 samples are available
for src_tokens, tgt_tokens, pred_label in zip(train_xs[-10:],
                                              train_ys[-10:],
                                              train_ys_[-10:]):
    print('src:', src_tokens)
    print('tgt:', tgt_tokens)
    print('pred:', pred_label)
    print()

src: ['7', '+', '2', '3', '3', '3']
tgt: ['7', '+', '2', '-', '3', '-', '3', '==', '3']
pred: ['pos_3', '-']

src: ['2', '+', '2', '*', '3', '2', '10']
tgt: ['2', '+', '2', '*', '3', '+', '2', '==', '10']
pred: ['pos_5', '+']

src: ['9', '+', '3', '-', '8', '6', '10']
tgt: ['9', '+', '3', '-', '8', '+', '6', '==', '10']
pred: ['pos_5', '+']

src: ['6', '2', '8', '5', '5']
tgt: ['-', '6', '-', '2', '+', '8', '+', '5', '==', '5']
pred: ['pos_0', '-']

src: ['4', '2', '9', '6', '7']
tgt: ['4', '+', '2', '*', '9', '/', '6', '==', '7']
pred: ['pos_1', '+']

src: ['11', '8', '9', '5', '5']
tgt: ['11', '+', '8', '-', '9', '-', '5', '==', '5']
pred: ['pos_1', '+']

src: ['-', '2', '/', '2', '+', '8', '2', '9']
tgt: ['-', '2', '/', '2', '+', '8', '+', '2', '==', '9']
pred: ['pos_6', '+']

src: ['9', '10', '5', '10', '3']
tgt: ['-', '9', '+', '10', '/', '5', '+', '10', '==', '3']
pred: ['pos_0', '-']

src: ['9', '-', '4', '+', '6', '-', '7', '4']
tgt: ['9', '-', '4', '+', '6', '-', '7', '==', '4']

In [11]:
# source vocabulary frequency distribution:
# count every token across all training source sequences
counter = Counter(token for sample in train_xs for token in sample)

print(counter.most_common())

[('-', 5739), ('+', 5077), ('2', 4932), ('3', 4266), ('4', 4046), ('6', 3582), ('5', 3516), ('8', 3283), ('7', 3046), ('10', 3024), ('9', 2922), ('11', 2383), ('*', 1747), ('/', 1587), ('==', 1303)]


In [12]:
# distinct source tokens in sorted (string) order
src_vocab_list = sorted(counter)
print(src_vocab_list)

['*', '+', '-', '/', '10', '11', '2', '3', '4', '5', '6', '7', '8', '9', '==']


In [13]:
# source vocabulary dictionary (token -> index)
src_vocab2idx_dict = {'<pad>': 0}  # index 0 is reserved to pad sequence length

# regular tokens get consecutive indexes right after the special tokens
first_free_idx = len(src_vocab2idx_dict)
for idx, token in enumerate(src_vocab_list, start=first_free_idx):
    src_vocab2idx_dict[token] = idx

print(src_vocab2idx_dict)

{'<pad>': 0, '*': 1, '+': 2, '-': 3, '/': 4, '10': 5, '11': 6, '2': 7, '3': 8, '4': 9, '5': 10, '6': 11, '7': 12, '8': 13, '9': 14, '==': 15}


In [14]:
# target vocabulary frequency distribution:
# count every token across all training prediction labels
counter = Counter(token for label in train_ys_ for token in label)

print(counter.most_common())

[('<done>', 2606), ('+', 1637), ('-', 1580), ('==', 1384), ('pos_7', 929), ('pos_3', 906), ('pos_5', 882), ('pos_1', 875), ('*', 586), ('/', 510), ('pos_8', 455), ('pos_6', 449), ('pos_2', 406), ('pos_4', 398), ('pos_0', 397)]


In [15]:
# distinct target tokens in sorted (string) order
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['*', '+', '-', '/', '<done>', '==', 'pos_0', 'pos_1', 'pos_2', 'pos_3', 'pos_4', 'pos_5', 'pos_6', 'pos_7', 'pos_8']


In [16]:
# target vocabulary dictionary (token -> index)
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,    # to mark the start of a sequence
}

# regular tokens get consecutive indexes right after the special tokens
first_free_idx = len(tgt_vocab2idx_dict)
for idx, token in enumerate(tgt_vocab_list, start=first_free_idx):
    tgt_vocab2idx_dict[token] = idx

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '*': 2, '+': 3, '-': 4, '/': 5, '<done>': 6, '==': 7, 'pos_0': 8, 'pos_1': 9, 'pos_2': 10, 'pos_3': 11, 'pos_4': 12, 'pos_5': 13, 'pos_6': 14, 'pos_7': 15, 'pos_8': 16}


### Val

In [17]:
# take a look at the last (up to) 10 raw validation samples;
# slicing instead of indexing with range(-10, 0) avoids an IndexError
# when fewer than 10 samples are available
for src_line, tgt_line in zip(raw_val_xs[-10:], raw_val_ys[-10:]):
    print('src:', src_line)
    print('tgt:', tgt_line)
    print()

src: 5 5 6 6 11
tgt: - 5 / 5 + 6 + 6 == 11

src: 9 5 5 2 11
tgt: 9 + 5 / 5 * 2 == 11

src: 9 4 3 6 11
tgt: 9 + 4 * 3 / 6 == 11

src: 6 8 4 7 5
tgt: - 6 + 8 - 4 + 7 == 5

src: 8 2 4 10 3
tgt: 8 - 2 / 4 * 10 == 3

src: 5 4 8 3 4
tgt: - 5 + 4 + 8 - 3 == 4

src: 4 4 10 11 7
tgt: 4 + 4 + 10 - 11 == 7

src: 9 8 2 9 6
tgt: 9 + 8 - 2 - 9 == 6

src: 4 6 7 8 9
tgt: 4 + 6 + 7 - 8 == 9

src: 5 5 10 6 9
tgt: 5 * 5 - 10 - 6 == 9



In [18]:
# white space tokenization of validation samples and labels
val_xs, val_ys = map(white_space_tokenizer, (raw_val_xs, raw_val_ys))

### Test

In [19]:
# take a look at the last (up to) 10 raw test samples;
# slicing instead of indexing with range(-10, 0) avoids an IndexError
# when fewer than 10 samples are available
for src_line, tgt_line in zip(raw_test_xs[-10:], raw_test_ys[-10:]):
    print('src:', src_line)
    print('tgt:', tgt_line)
    print()

src: 5 10 5 3 10
tgt: 5 + 10 / 5 + 3 == 10

src: 7 6 9 6 11
tgt: 7 + 6 / 9 * 6 == 11

src: 10 10 10 4 7
tgt: 10 + 10 / 10 - 4 == 7

src: 4 2 2 4 4
tgt: 4 + 2 + 2 - 4 == 4

src: 3 10 2 5 3
tgt: 3 - 10 / 2 + 5 == 3

src: 4 2 7 4 5
tgt: 4 / 2 + 7 - 4 == 5

src: 8 11 2 8 9
tgt: - 8 + 11 - 2 + 8 == 9

src: 7 5 3 6 2
tgt: - 7 + 5 * 3 - 6 == 2

src: 8 11 2 6 11
tgt: - 8 + 11 + 2 + 6 == 11

src: 6 10 2 8 10
tgt: - 6 + 10 - 2 + 8 == 10



In [20]:
# white space tokenization of test samples and labels
test_xs, test_ys = map(white_space_tokenizer, (raw_test_xs, raw_test_ys))

In [21]:
# combine data sets to a dict
# only the training split carries 'ys_' (the generated prediction labels)
train_dict = {'xs': train_xs, 'ys': train_ys, 'ys_': train_ys_}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# splits keyed by name
data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token -> index mappings for the source and target side
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [22]:
# save output as json: data set first, then vocabulary
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

for out_path, payload in ((data_path, data_dict), (vocab_path, vocab_dict)):
    save_json(out_path, payload)

# Archive Code

In [23]:
# # a function to generate a sequence pair
# # given a label sequence
# def get_sequence_pair(y: list) -> list:
#     # white space tokenization
#     y = y.split()
#     x = y.copy()
#     # get operator indexes
#     operator_idxes = list(range(1, len(x), 2))
#     # decide how many operators to remove
#     num_idxes = np.random.choice(range(len(operator_idxes)+1))
#     if num_idxes == 0:
#         return x, ['<completion>', '<none>', '<none>'], y
#     else:
#         # decide operators to remove
#         idxes_to_remove = sorted(np.random.choice(operator_idxes, num_idxes, replace=False))
#         # generate possible ys
#         ys_ = [['<insertion>', str(idxes_to_remove[i]-i), x[idxes_to_remove[i]]] 
#               for i in range(len(idxes_to_remove))]
#         # pick y randomly
#         y_ = ys_[np.random.choice(range(len(ys_)))]
#         # remove operators
#         x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
#         return x, y_, y

In [24]:
# # a function to generate a sequence pair
# # given a label sequence
# def get_sequence_pair(y: list) -> list:
#     # white space tokenization
#     y = y.split()
#     x = y.copy()
#     # get operator indexes
# #     operator_idxes = np.arange(1, len(x), 2)[::-1]
#     operator_idxes = [i for i, token in enumerate(y) if not token.isdigit()][::-1]
    
#     # decide how many operators to remove
#     num_idxes = np.random.choice(range(len(operator_idxes)+1))
#     if num_idxes == 0:
#         return x, ['<completion>', '<none>', '<none>'], y
#     else:
#         # decide operators to remove
#         idxes_to_remove = operator_idxes[:num_idxes]
# #         idxes_to_remove = sorted(np.random.choice(operator_idxes, num_idxes, replace=False))
#         # generate possible ys
# #         ys_ = [['<insertion>', str(idxes_to_remove[i]+1-i), x[idxes_to_remove[i]]] 
# #               for i in np.arange(len(idxes_to_remove))[::-1]]
# #         print(['<insertion>', str(idxes_to_remove[-1]), x[idxes_to_remove[-1]]])
#         y_ = ['<insertion>', str(idxes_to_remove[-1]), x[idxes_to_remove[-1]]]
#         # pick y randomly
# #         y_ = ys_[0]
# #         y_ = ys_[np.random.choice(range(len(ys_)))]
# #         remove operators
#         x = [x[i] for i in range(len(x)) if i not in idxes_to_remove]
# #         print(idxes_to_remove)
#         return x, y_, y