# Data Preprocessing

## Recursive Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [291]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [292]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [309]:
# experiment parameters
# each value below becomes one component of the input/output directory
# paths built from them (e.g. 'num_size_{}'.format(num_size))
data_size = 10000
seq_len = 5
num_size = 100
# root directory name of the output path
method = 'recursion'

In [310]:
# input directory:
#   nss/num_size_<num_size>/seq_len_<seq_len>/data_size_<data_size>
indir = os.path.join(
    'nss',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
indir

'nss/num_size_100/seq_len_5/data_size_10000'

In [311]:
# save path
# output directory:
#   <method>/num_size_<num_size>/seq_len_<seq_len>/data_size_<data_size>
outdir = os.path.join(method,
                      'num_size_{}'.format(num_size),
                      'seq_len_{}'.format(seq_len),
                      'data_size_{}'.format(data_size))
# exist_ok=True creates the directory tree only when it is missing, without
# the check-then-create gap of a separate os.path.exists() test
os.makedirs(outdir, exist_ok=True)
outdir

'recursion/num_size_100/seq_len_5/data_size_10000'

In [312]:
# read the six raw split files (train/val/test, samples x and labels y)
# from indir, in the same order as the names on the left-hand side
# NOTE(review): load_txt is not defined in this notebook; presumably it is
# provided by `from utils import *` -- confirm what it returns there
raw_train_xs, raw_train_ys, raw_val_xs, raw_val_ys, raw_test_xs, raw_test_ys = [
    load_txt(os.path.join(indir, file_name))
    for file_name in ('train_x.txt', 'train_y.txt',
                      'val_x.txt', 'val_y.txt',
                      'test_x.txt', 'test_y.txt')
]

In [313]:
# report how many entries each split holds; samples and labels of the same
# split are printed next to each other so a mismatch is easy to spot
for _caption, _split in (
        ('train sample size', raw_train_xs),
        ('train label size', raw_train_ys),
        ('val sample size', raw_val_xs),
        ('val label size', raw_val_ys),
        ('test sample size', raw_test_xs),
        ('test label size', raw_test_ys),
):
    print(_caption, len(_split))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Helper Functions

In [314]:
def convert_to_int(seq: list) -> list:
    """Return a new list holding int(item) for every item of seq, in order."""
    return list(map(int, seq))

def convert_to_str(seq: list) -> list:
    """Return a new list holding str(item) for every item of seq, in order.

    The result is a list of strings (one per input element), not a single
    joined string, which is what the return annotation now states.
    """
    return [str(item) for item in seq]

def gen_recursion_pair(x: str, y: str) -> tuple:
    """Sample one intermediate state of the swap sort that turns x into y.

    Args:
        x: whitespace-separated integers, the sequence to sort.
        y: whitespace-separated integers, the target order. It is expected
            to be x sorted ascending: the swap partner is always the
            smallest remaining element, so no other target can be reached.

    Returns:
        A 3-tuple of string lists ``(state, swap, target)``:
        ``state`` is one randomly chosen sequence visited by the sort
        (the initial x included), ``swap`` is the ``[src_idx, tgt_idx]``
        pair to apply to that state (``['-1', '-1']`` when the state already
        equals y), and ``target`` is y.

    Raises:
        ValueError: if a swap step would leave the sequence unchanged, i.e.
            y cannot be reached from x (previously this looped forever).
    """
    # white space tokenization
    x = convert_to_int(x.split())
    y = convert_to_int(y.split())
    # record every visited state and the swap that belongs to it
    xs = [x.copy()]
    ys_ = []
    # process swap sort
    while True:
        src_idx = find_src_index_to_swap(x, y)
        tgt_idx = find_tgt_index_to_swap(x, src_idx)
        if src_idx == tgt_idx == -1:
            # x equals y: record the terminal "no swap" label and stop
            ys_.append([src_idx, tgt_idx])
            break
        if src_idx == tgt_idx:
            # the smallest remaining element already sits at src_idx yet
            # still differs from y[src_idx]; swapping would change nothing
            raise ValueError(
                'cannot reach target {} from {}: '
                'target is not the ascending sort of the input'.format(y, x))
        ys_.append([src_idx, tgt_idx])
        x[src_idx], x[tgt_idx] = x[tgt_idx], x[src_idx]
        xs.append(x.copy())

    # xs and ys_ have the same length here (one label per recorded state),
    # so a single index selects a matching state/label pair
    index = np.random.choice(range(len(xs)))
    return convert_to_str(xs[index]), convert_to_str(ys_[index]), convert_to_str(y)

def find_src_index_to_swap(x: list, y: list) -> int:
    """Return the first position at which x and y differ, or -1 if x == y."""
    if x == y:
        return -1
    # collect every differing position of x, then keep the earliest one
    differing = [pos for pos in range(len(x)) if x[pos] != y[pos]]
    return differing[0]
    
def find_tgt_index_to_swap(x: list, src_idx: int) -> int:
    """Return the position of the smallest element of x at or after src_idx.

    Args:
        x: the sequence being sorted.
        src_idx: position to fill next, or -1 when nothing is left to swap.

    Returns:
        -1 when src_idx is -1; otherwise the index (relative to the whole
        of x) of the minimum of ``x[src_idx:]``. Ties resolve to the first
        occurrence, as np.argmin does. The result is a built-in int rather
        than numpy.int64, matching the annotation.
    """
    if src_idx == -1:
        return -1
    # np.argmin is relative to the slice; shift it back to an index into x
    return int(np.argmin(x[src_idx:]) + src_idx)
        
def swap_sort(x):
    """Sort x in place with the swap procedure and count the swaps.

    Args:
        x: list of mutually comparable items; it is modified in place.

    Returns:
        ``(x, i)``: the same list object, now in ascending order, and the
        number of swaps that were performed.
    """
    # The reference order is derived here instead of being read from a
    # module-level `y`, so the function no longer depends on a global that
    # the caller must have set up beforehand.
    y = sorted(x)
    i = 0
    while True:
        src_idx = find_src_index_to_swap(x, y)
        tgt_idx = find_tgt_index_to_swap(x, src_idx)
        if src_idx == tgt_idx == -1:
            # x already equals its sorted order
            return x, i
        x[src_idx], x[tgt_idx] = x[tgt_idx], x[src_idx]
        i += 1

In [315]:
# first raw training sample, split on whitespace into string tokens
x = raw_train_xs[0].split()
x

['20', '84', '51', '36', '40']

In [316]:
# first raw training label, split on whitespace into string tokens
y = raw_train_ys[0].split()
y

['20', '36', '40', '51', '84']

In [317]:
# first position where x and y disagree (-1 if they are identical)
src_idx = find_src_index_to_swap(x, y)
src_idx

1

In [318]:
# position of the smallest element of x from src_idx onwards
# NOTE(review): x holds string tokens in this demo, so the minimum is
# presumably taken in string order rather than numeric order;
# gen_recursion_pair converts to int before doing the same step
tgt_idx = find_tgt_index_to_swap(x, src_idx)
tgt_idx

3

In [319]:
# show x before the swap, then exchange the two chosen positions in place
print(x)
x[src_idx], x[tgt_idx] = x[tgt_idx], x[src_idx]
x

['20', '84', '51', '36', '40']


['20', '36', '51', '84', '40']

In [320]:
# try the swap sort on 20 random integers drawn from [0, 100)
x = np.random.randint(100, size=20).tolist()
print(x)
# reference sorted order, kept in the module-level name y
y = np.sort(x).tolist()
print(y)
# swap_sort returns the sorted list and the number of swaps; only the
# count is kept
_, i = swap_sort(x)
print()
print(i)

[76, 90, 69, 19, 57, 94, 61, 17, 97, 53, 22, 94, 41, 19, 74, 41, 48, 94, 40, 5]
[5, 17, 19, 19, 22, 40, 41, 41, 48, 53, 57, 61, 69, 74, 76, 90, 94, 94, 94, 97]

16


### Train

In [321]:
# one sampled (state, swap, target) triple per raw training pair,
# regrouped into three parallel tuples
train_xs, train_ys_, train_ys = zip(
    *map(gen_recursion_pair, raw_train_xs, raw_train_ys))

In [322]:
# take a look at the last ten generated training triples
for i in range(-10, 0, 1):
    # sampled intermediate sequence
    print('src:', train_xs[i])
    # fully sorted target sequence
    print('tgt:', train_ys[i])
    # [src_idx, tgt_idx] swap for this state (['-1', '-1'] = already sorted)
    print('pred:', train_ys_[i])
    print()

src: ['5', '10', '23', '26', '79']
tgt: ['5', '10', '23', '26', '79']
pred: ['-1', '-1']

src: ['6', '50', '54', '94', '91']
tgt: ['6', '50', '54', '91', '94']
pred: ['3', '4']

src: ['1', '17', '28', '42', '65']
tgt: ['1', '17', '28', '42', '65']
pred: ['-1', '-1']

src: ['11', '25', '26', '38', '90']
tgt: ['11', '25', '26', '38', '90']
pred: ['-1', '-1']

src: ['30', '12', '25', '9', '37']
tgt: ['9', '12', '25', '30', '37']
pred: ['0', '3']

src: ['6', '34', '70', '80', '99']
tgt: ['6', '34', '70', '80', '99']
pred: ['-1', '-1']

src: ['5', '21', '30', '77', '41']
tgt: ['5', '21', '30', '41', '77']
pred: ['3', '4']

src: ['2', '2', '67', '28', '13']
tgt: ['2', '2', '13', '28', '67']
pred: ['2', '4']

src: ['20', '24', '70', '81', '80']
tgt: ['20', '24', '70', '80', '81']
pred: ['3', '4']

src: ['5', '38', '64', '72', '85']
tgt: ['5', '38', '64', '72', '85']
pred: ['-1', '-1']



In [323]:
# source vocabulary frequency distribution
# count every token of every sampled training sequence
counter = Counter(token for seq in train_xs for token in seq)

print(counter.most_common())

[('29', 391), ('9', 389), ('52', 388), ('25', 386), ('55', 383), ('87', 383), ('60', 383), ('47', 379), ('84', 378), ('83', 378), ('75', 377), ('74', 377), ('59', 375), ('27', 374), ('5', 373), ('67', 371), ('20', 369), ('33', 369), ('58', 368), ('31', 365), ('41', 365), ('45', 365), ('51', 364), ('98', 364), ('37', 364), ('8', 364), ('91', 364), ('46', 363), ('81', 362), ('89', 360), ('65', 359), ('22', 358), ('57', 358), ('79', 358), ('70', 357), ('76', 356), ('71', 354), ('23', 354), ('97', 354), ('88', 354), ('49', 353), ('85', 353), ('32', 352), ('66', 352), ('15', 352), ('44', 351), ('19', 350), ('94', 350), ('28', 349), ('1', 349), ('43', 349), ('30', 349), ('17', 347), ('4', 347), ('40', 345), ('82', 345), ('26', 345), ('95', 345), ('42', 344), ('48', 343), ('56', 343), ('11', 343), ('7', 342), ('35', 342), ('54', 342), ('80', 341), ('99', 341), ('62', 341), ('18', 339), ('14', 339), ('16', 339), ('13', 338), ('96', 337), ('72', 337), ('3', 337), ('69', 336), ('36', 335), ('6',

In [324]:
# iterating a Counter yields its keys; the tokens are strings, so the
# resulting order is lexicographic ('10' sorts before '2')
src_vocab_list = sorted(counter)
print(src_vocab_list)

['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']


In [325]:
# source vocabulary dictionary: token -> integer index
src_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
}
# regular tokens are numbered consecutively after the special entries
src_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(src_vocab_list,
                                start=len(src_vocab2idx_dict))
)

print(src_vocab2idx_dict)

{'<pad>': 0, '0': 1, '1': 2, '10': 3, '11': 4, '12': 5, '13': 6, '14': 7, '15': 8, '16': 9, '17': 10, '18': 11, '19': 12, '2': 13, '20': 14, '21': 15, '22': 16, '23': 17, '24': 18, '25': 19, '26': 20, '27': 21, '28': 22, '29': 23, '3': 24, '30': 25, '31': 26, '32': 27, '33': 28, '34': 29, '35': 30, '36': 31, '37': 32, '38': 33, '39': 34, '4': 35, '40': 36, '41': 37, '42': 38, '43': 39, '44': 40, '45': 41, '46': 42, '47': 43, '48': 44, '49': 45, '5': 46, '50': 47, '51': 48, '52': 49, '53': 50, '54': 51, '55': 52, '56': 53, '57': 54, '58': 55, '59': 56, '6': 57, '60': 58, '61': 59, '62': 60, '63': 61, '64': 62, '65': 63, '66': 64, '67': 65, '68': 66, '69': 67, '7': 68, '70': 69, '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '8': 79, '80': 80, '81': 81, '82': 82, '83': 83, '84': 84, '85': 85, '86': 86, '87': 87, '88': 88, '89': 89, '9': 90, '90': 91, '91': 92, '92': 93, '93': 94, '94': 95, '95': 96, '96': 97, '97': 98, '98': 99, '99': 100}


In [326]:
# target vocabulary frequency distribution
# count every index token of every sampled swap label
counter = Counter(token for swap in train_ys_ for token in swap)

print(counter.most_common())

[('-1', 4158), ('4', 2256), ('3', 2247), ('2', 2026), ('1', 1773), ('0', 1540)]


In [327]:
# iterating a Counter yields its keys; the tokens are strings, so the
# order is lexicographic
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['-1', '0', '1', '2', '3', '4']


In [328]:
# target vocabulary dictionary: token -> integer index
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
}
# regular tokens are numbered consecutively after the special entries
tgt_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(tgt_vocab_list,
                                start=len(tgt_vocab2idx_dict))
)

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '-1': 2, '0': 3, '1': 4, '2': 5, '3': 6, '4': 7}


### Val

In [329]:
# take a look at the last ten raw validation pairs (still untokenized strings)
for i in range(-10, 0, 1):
    print('src:', raw_val_xs[i])
    print('tgt:', raw_val_ys[i])
    print()

src: 84 62 86 38 98
tgt: 38 62 84 86 98

src: 22 47 85 42 25
tgt: 22 25 42 47 85

src: 57 28 49 38 32
tgt: 28 32 38 49 57

src: 2 80 13 42 85
tgt: 2 13 42 80 85

src: 47 32 50 79 51
tgt: 32 47 50 51 79

src: 45 84 34 96 40
tgt: 34 40 45 84 96

src: 16 73 68 33 19
tgt: 16 19 33 68 73

src: 97 62 0 92 69
tgt: 0 62 69 92 97

src: 52 15 70 29 62
tgt: 15 29 52 62 70

src: 17 58 20 19 94
tgt: 17 19 20 58 94



In [330]:
# white space tokenization
# NOTE(review): white_space_tokenizer is not defined in this notebook;
# presumably provided by `from utils import *` and applied to the whole
# list of raw strings -- confirm its return shape there
val_xs = white_space_tokenizer(raw_val_xs)
val_ys = white_space_tokenizer(raw_val_ys)

### Test

In [331]:
# take a look at the last ten raw test pairs (still untokenized strings)
for i in range(-10, 0, 1):
    print('src:', raw_test_xs[i])
    print('tgt:', raw_test_ys[i])
    print()

src: 5 80 0 12 24
tgt: 0 5 12 24 80

src: 34 25 38 19 37
tgt: 19 25 34 37 38

src: 10 70 76 32 26
tgt: 10 26 32 70 76

src: 13 12 44 90 79
tgt: 12 13 44 79 90

src: 81 71 24 45 26
tgt: 24 26 45 71 81

src: 20 23 25 79 76
tgt: 20 23 25 76 79

src: 62 25 24 96 52
tgt: 24 25 52 62 96

src: 74 87 51 84 50
tgt: 50 51 74 84 87

src: 85 60 2 52 24
tgt: 2 24 52 60 85

src: 68 41 64 21 44
tgt: 21 41 44 64 68



In [332]:
# white space tokenization
# NOTE(review): white_space_tokenizer is not defined in this notebook;
# presumably provided by `from utils import *` and applied to the whole
# list of raw strings -- confirm its return shape there
test_xs = white_space_tokenizer(raw_test_xs)
test_ys = white_space_tokenizer(raw_test_ys)

In [333]:
# bundle each split into its own dict; only the training split carries the
# per-state swap labels under 'ys_'
train_dict = {'xs': train_xs, 'ys_': train_ys_, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# all splits keyed by split name
data_dict = {'train': train_dict, 'val': val_dict, 'test': test_dict}

# token-to-index maps for the source and target vocabularies
vocab_dict = {'src': src_vocab2idx_dict, 'tgt': tgt_vocab2idx_dict}

In [334]:
# save output as json
# data.json receives the train/val/test splits, vocab.json the src/tgt
# token-to-index maps; both are written under outdir
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

# NOTE(review): save_json is not defined in this notebook; presumably
# provided by `from utils import *` and called as (path, object) -- confirm
save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)

# Archive Code

In [23]:
# # helper functions for bubble sort 
# def bubble_sort(seq): 
#     n = len(seq) 
#     for i in range(n): 
#         for j in range(0, n-i-1):
#             if seq[j] > seq[j+1]: 
#                 seq[j], seq[j+1] = seq[j+1], seq[j] 
#     return seq

# def find_next_step_in_bubble_sort(seq): 
#     n = len(seq) 
#     for j in range(0, n-1):
#         if seq[j] > seq[j+1]:
#             return j
#     return -1

# def bubble_sort_step(seq, j): 
#     # perform one bubble sort step
#     seq[j], seq[j+1] = seq[j+1], seq[j] 
#     return seq

# # ------------------------------------------
# # test the bubble sort algo
# seq = np.random.randint(10, size=[10,])
# print(seq)
# a = seq
# # bubble_sort(a) 
# print(a)

# j = 0
# while True: 
#     j = find_next_step_in_bubble_sort(seq)
#     print(j)
#     if j == -1: 
#         break
#     seq = bubble_sort_step(seq, j)
#     print(seq)
#     print()
    
# print(seq)     

In [24]:
# # class for data generation of the Number Sequence Sorting (NSS) problem 
# class NumberSequenceSorting(): 
#     def __init__(self, seq_len, data_size, num_size):
#         super().__init__()
#         self.seq_len = seq_len
#         self.data_size = data_size
#         self.num_size = num_size 
        
#     def find_next_step_in_bubble_sort(self, seq): 
#         n = len(seq) 
#         for j in range(0, n-1):
#             if seq[j] > seq[j+1]:
#                 return j
#         return -1
    
#     def bubble_sort_step(self, seq, j): 
#         # perform one bubble sort step
#         seq[j], seq[j+1] = seq[j+1], seq[j] 
#         return seq
    
#     def convert_to_str(self, seq:list) -> str:
#         seq = [str(number) for number in seq]
#         return ' '.join(seq) 
    
#     def generate_end2end(self):
#         # generate random number sequence as sample
#         # with its sorted result as label
#         xs, ys = [], []
#         # to filter out duplicates
#         num_seq_set = set()
#         for i in tqdm(range(self.data_size)):
#             while True:
#                 # get a random number sequence
#                 x = np.random.randint(self.num_size, size=[self.seq_len])
#                 # check duplicates
#                 if str(x) in num_seq_set:
#                     continue
#                 else:
#                     num_seq_set.add(str(x))
#                     y = np.sort(x)
#                     # convert a list of int to string
#                     x = self.convert_to_str(x)
#                     y = self.convert_to_str(y)
#                     # append to dataset
#                     xs.append(x)
#                     ys.append(y)
#                     break
                    
#         return xs, ys
                
#     def generate_recursive(self): 
#         xs, ys = [], []
#         # generate random number sequences
#         x = np.random.randint(self.num_size, size=[self.data_size, self.seq_len])
#         xs.append(x)
#         y = [find_next_step_in_bubble_sort(s) for s in x]
#         ys = y.copy()

#         while not all(np.equal(y, -1)):
#             for i in range(x.shape[0]):
#                 if y[i] != -1:
#                     x[i] = bubble_sort_step(x[i], y[i])
#                     y[i] = find_next_step_in_bubble_sort(x[i])
#                     xs.append(x[i])
#                     ys.append(y[i])

#         xs = np.vstack(xs)
#         ys = np.array(ys)

#         # randomly select indices
#         indices = np.random.choice(xs.shape[0], data_size, replace=False)
#         xs = xs[indices]
#         ys = ys[indices]

#         return xs, ys

In [1]:
# def find_next_step_in_bubble_sort(seq): 
#     n = len(seq) 
#     for j in range(0, n-1):
#         if seq[j] > seq[j+1]:
#             return j
#     return -1

# def bubble_sort_step(seq, j): 
#     # perform one bubble sort step
#     seq[j], seq[j+1] = seq[j+1], seq[j] 
#     return seq
    
# def gen_recursion_pair(x: str, y: str) -> (list, list):
#     # white space tokenization
#     x = convert_to_int(x.split())
#     y = y.split()
#     # record observation
#     xs = [x.copy()]
#     ys_ = []
#     # process bubble sort
#     while True:
#         y_ = find_next_step_in_bubble_sort(x)
#         ys_.append(y_)
#         if y_ == -1:
#             break
#         x = bubble_sort_step(x, y_)
#         xs.append(x.copy())

#     index = np.random.choice(range(len(xs)))
#     return convert_to_str(xs[index]), [str(ys_[index])], y

In [2]:
# def find_src_index_to_swap(x: list, y: list) -> int:
#     if x == y:
#         return -1
#     else:
#         idx_to_swap = [i for i in range(len(x)) if x[i] != y[i]][0]
#         return idx_to_swap
    
# def find_tgt_index_to_swap(x: list, y: list, src_idx: int) -> int:
#     if src_idx == -1:
#         return -1
#     else:
#         tgt_num = y[src_idx]
#         idx_to_swap = [i for i in range(len(x)) if x[i]==tgt_num][-1]
#         return idx_to_swap

# def find_src_index_to_swap(x: list, y: list) -> int:
#     if x == y:
#         return -1
#     else:
#         idx_to_swap = [i for i in range(len(x)) if x[i] != y[i]][0]
#         return idx_to_swap
    
# def find_tgt_index_to_swap(x: list, y: list, src_idx: int) -> int:
#     if src_idx == -1:
#         return -1
#     else:
#         tgt_num = y[src_idx]
#         idx_to_swap = [i for i in range(len(x)) if x[i]==tgt_num][-1]
#         return idx_to_swap