# Data Preprocessing

## End2End Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [43]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [44]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [45]:
# experiment parameters; the same values name the input/output directories
num_size, seq_len, data_size = (
    100,    # num_size: presumably the count of distinct numbers (train tokens are '0'..'99')
    5,      # seq_len: numbers per sequence, as in the printed train samples
    10000,  # data_size: presumably total samples (7000 train + 1500 val + 1500 test)
)

In [46]:
# input directory: 'nss' followed by one sub-directory per parameter value
indir_parts = [
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
]
indir = os.path.join('nss', *indir_parts)
indir  # echo the resolved path in the cell output

'nss/num_size_100/seq_len_5/data_size_10000'

In [47]:
# output directory: 'end2end' followed by one sub-directory per parameter value
outdir = os.path.join(
    'end2end',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True creates the tree when it is missing and is a no-op when it is
# already there, without the check-then-create race of os.path.exists()
os.makedirs(outdir, exist_ok=True)
outdir  # echo the resolved path in the cell output

'end2end/num_size_100/seq_len_5/data_size_10000'

In [48]:
# load raw dataset
def _read_raw(filename):
    """Load one raw text file from ``indir`` through ``load_txt``."""
    return load_txt(os.path.join(indir, filename))


# one x (source) file and one y (target) file per split
raw_train_xs = _read_raw('train_x.txt')
raw_train_ys = _read_raw('train_y.txt')
raw_val_xs = _read_raw('val_x.txt')
raw_val_ys = _read_raw('val_y.txt')
raw_test_xs = _read_raw('test_x.txt')
raw_test_ys = _read_raw('test_y.txt')

In [49]:
# check data size: samples and labels of every split should match in length
size_report = (
    ('train sample size', raw_train_xs),
    ('train label size', raw_train_ys),
    ('val sample size', raw_val_xs),
    ('val label size', raw_val_ys),
    ('test sample size', raw_test_xs),
    ('test label size', raw_test_ys),
)
for label, rows in size_report:
    print(label, len(rows))

train sample size 7000
train label size 7000
val sample size 1500
val label size 1500
test sample size 1500
test label size 1500


### Train

In [50]:
# take a look at the last (up to) 10 training pairs
# slicing, unlike indexing -10..-1, does not raise IndexError when the split
# holds fewer than 10 samples (assumes load_txt returns a sliceable sequence)
for src, tgt in zip(raw_train_xs[-10:], raw_train_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 10 26 5 23 79
tgt: 5 10 23 26 79

src: 54 91 94 6 50
tgt: 6 50 54 91 94

src: 42 65 1 28 17
tgt: 1 17 28 42 65

src: 11 90 26 38 25
tgt: 11 25 26 38 90

src: 30 12 25 9 37
tgt: 9 12 25 30 37

src: 99 34 6 80 70
tgt: 6 34 70 80 99

src: 77 41 30 5 21
tgt: 5 21 30 41 77

src: 2 67 2 28 13
tgt: 2 2 13 28 67

src: 20 80 70 81 24
tgt: 20 24 70 80 81

src: 72 38 64 5 85
tgt: 5 38 64 72 85



In [51]:
# white space tokenization of the training sources, then the training targets
train_xs, train_ys = (
    white_space_tokenizer(raw) for raw in (raw_train_xs, raw_train_ys)
)

In [52]:
# source vocabulary frequency distribution over the training inputs
counter = Counter(token for tokens in train_xs for token in tokens)

print(counter.most_common())

[('29', 391), ('9', 389), ('52', 388), ('25', 386), ('55', 383), ('87', 383), ('60', 383), ('47', 379), ('84', 378), ('83', 378), ('75', 377), ('74', 377), ('59', 375), ('27', 374), ('5', 373), ('67', 371), ('20', 369), ('33', 369), ('58', 368), ('31', 365), ('41', 365), ('45', 365), ('51', 364), ('98', 364), ('37', 364), ('8', 364), ('91', 364), ('46', 363), ('81', 362), ('89', 360), ('65', 359), ('57', 358), ('22', 358), ('79', 358), ('70', 357), ('76', 356), ('71', 354), ('23', 354), ('97', 354), ('88', 354), ('49', 353), ('85', 353), ('32', 352), ('66', 352), ('15', 352), ('44', 351), ('19', 350), ('94', 350), ('28', 349), ('1', 349), ('43', 349), ('30', 349), ('17', 347), ('4', 347), ('40', 345), ('82', 345), ('26', 345), ('95', 345), ('42', 344), ('48', 343), ('56', 343), ('11', 343), ('7', 342), ('35', 342), ('54', 342), ('80', 341), ('99', 341), ('62', 341), ('18', 339), ('14', 339), ('16', 339), ('13', 338), ('96', 337), ('72', 337), ('3', 337), ('69', 336), ('36', 335), ('6',

In [53]:
# tokens are strings, so this ordering is lexicographic ('10' sorts before '2')
src_vocab_list = sorted(counter)
print(src_vocab_list)

['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']


In [54]:
# source vocabulary dictionary: token -> integer index
src_vocab2idx_dict = {'<pad>': 0}  # index 0 is reserved to pad sequence length

# regular tokens are numbered right after the special tokens, in list order
first_idx = len(src_vocab2idx_dict)
src_vocab2idx_dict.update(
    (token, idx) for idx, token in enumerate(src_vocab_list, start=first_idx)
)

print(src_vocab2idx_dict)

{'<pad>': 0, '0': 1, '1': 2, '10': 3, '11': 4, '12': 5, '13': 6, '14': 7, '15': 8, '16': 9, '17': 10, '18': 11, '19': 12, '2': 13, '20': 14, '21': 15, '22': 16, '23': 17, '24': 18, '25': 19, '26': 20, '27': 21, '28': 22, '29': 23, '3': 24, '30': 25, '31': 26, '32': 27, '33': 28, '34': 29, '35': 30, '36': 31, '37': 32, '38': 33, '39': 34, '4': 35, '40': 36, '41': 37, '42': 38, '43': 39, '44': 40, '45': 41, '46': 42, '47': 43, '48': 44, '49': 45, '5': 46, '50': 47, '51': 48, '52': 49, '53': 50, '54': 51, '55': 52, '56': 53, '57': 54, '58': 55, '59': 56, '6': 57, '60': 58, '61': 59, '62': 60, '63': 61, '64': 62, '65': 63, '66': 64, '67': 65, '68': 66, '69': 67, '7': 68, '70': 69, '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '8': 79, '80': 80, '81': 81, '82': 82, '83': 83, '84': 84, '85': 85, '86': 86, '87': 87, '88': 88, '89': 89, '9': 90, '90': 91, '91': 92, '92': 93, '93': 94, '94': 95, '95': 96, '96': 97, '97': 98, '98': 99, '99': 100}


In [55]:
# target vocabulary frequency distribution over the training labels
counter = Counter(token for tokens in train_ys for token in tokens)

print(counter.most_common())

[('29', 391), ('9', 389), ('52', 388), ('25', 386), ('55', 383), ('87', 383), ('60', 383), ('47', 379), ('84', 378), ('83', 378), ('75', 377), ('74', 377), ('59', 375), ('27', 374), ('5', 373), ('67', 371), ('20', 369), ('33', 369), ('58', 368), ('31', 365), ('41', 365), ('45', 365), ('51', 364), ('98', 364), ('37', 364), ('8', 364), ('91', 364), ('46', 363), ('81', 362), ('89', 360), ('65', 359), ('22', 358), ('57', 358), ('79', 358), ('70', 357), ('76', 356), ('71', 354), ('23', 354), ('97', 354), ('88', 354), ('49', 353), ('85', 353), ('32', 352), ('66', 352), ('15', 352), ('44', 351), ('19', 350), ('94', 350), ('1', 349), ('28', 349), ('43', 349), ('30', 349), ('17', 347), ('4', 347), ('40', 345), ('82', 345), ('26', 345), ('95', 345), ('42', 344), ('48', 343), ('56', 343), ('11', 343), ('7', 342), ('35', 342), ('54', 342), ('80', 341), ('99', 341), ('62', 341), ('18', 339), ('14', 339), ('16', 339), ('13', 338), ('72', 337), ('96', 337), ('3', 337), ('69', 336), ('36', 335), ('6',

In [56]:
# tokens are strings, so this ordering is lexicographic ('10' sorts before '2')
tgt_vocab_list = sorted(counter)
print(tgt_vocab_list)

['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']


In [57]:
# target vocabulary dictionary: token -> integer index
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}

# regular tokens are numbered right after the special tokens, in list order
first_idx = len(tgt_vocab2idx_dict)
tgt_vocab2idx_dict.update(
    (token, idx) for idx, token in enumerate(tgt_vocab_list, start=first_idx)
)

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '0': 3, '1': 4, '10': 5, '11': 6, '12': 7, '13': 8, '14': 9, '15': 10, '16': 11, '17': 12, '18': 13, '19': 14, '2': 15, '20': 16, '21': 17, '22': 18, '23': 19, '24': 20, '25': 21, '26': 22, '27': 23, '28': 24, '29': 25, '3': 26, '30': 27, '31': 28, '32': 29, '33': 30, '34': 31, '35': 32, '36': 33, '37': 34, '38': 35, '39': 36, '4': 37, '40': 38, '41': 39, '42': 40, '43': 41, '44': 42, '45': 43, '46': 44, '47': 45, '48': 46, '49': 47, '5': 48, '50': 49, '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '6': 59, '60': 60, '61': 61, '62': 62, '63': 63, '64': 64, '65': 65, '66': 66, '67': 67, '68': 68, '69': 69, '7': 70, '70': 71, '71': 72, '72': 73, '73': 74, '74': 75, '75': 76, '76': 77, '77': 78, '78': 79, '79': 80, '8': 81, '80': 82, '81': 83, '82': 84, '83': 85, '84': 86, '85': 87, '86': 88, '87': 89, '88': 90, '89': 91, '9': 92, '90': 93, '91': 94, '92': 95, '93': 96, '94': 97, '95': 98, '96': 99, '97': 100, '9

### Val

In [21]:
# take a look at the last (up to) 10 validation pairs
# slicing, unlike indexing -10..-1, does not raise IndexError when the split
# holds fewer than 10 samples (assumes load_txt returns a sliceable sequence)
for src, tgt in zip(raw_val_xs[-10:], raw_val_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 3 8 1 4 5
tgt: 1 3 4 5 8

src: 0 0 6 4 9
tgt: 0 0 4 6 9

src: 5 3 3 9 0
tgt: 0 3 3 5 9

src: 1 9 8 4 4
tgt: 1 4 4 8 9

src: 0 0 4 5 0
tgt: 0 0 0 4 5

src: 5 6 8 3 8
tgt: 3 5 6 8 8

src: 3 3 5 2 4
tgt: 2 3 3 4 5

src: 6 4 8 2 3
tgt: 2 3 4 6 8

src: 7 6 9 0 4
tgt: 0 4 6 7 9

src: 0 7 9 5 6
tgt: 0 5 6 7 9



In [22]:
# white space tokenization of the validation sources, then the validation targets
val_xs, val_ys = (
    white_space_tokenizer(raw) for raw in (raw_val_xs, raw_val_ys)
)

### Test

In [23]:
# take a look at the last (up to) 10 test pairs
# slicing, unlike indexing -10..-1, does not raise IndexError when the split
# holds fewer than 10 samples (assumes load_txt returns a sliceable sequence)
for src, tgt in zip(raw_test_xs[-10:], raw_test_ys[-10:]):
    print('src:', src)
    print('tgt:', tgt)
    print()

src: 2 7 5 5 6
tgt: 2 5 5 6 7

src: 6 4 7 3 3
tgt: 3 3 4 6 7

src: 4 8 3 0 8
tgt: 0 3 4 8 8

src: 2 2 6 2 0
tgt: 0 2 2 2 6

src: 5 1 9 9 4
tgt: 1 4 5 9 9

src: 3 7 6 8 6
tgt: 3 6 6 7 8

src: 6 3 8 2 2
tgt: 2 2 3 6 8

src: 1 7 5 7 0
tgt: 0 1 5 7 7

src: 1 6 7 6 8
tgt: 1 6 6 7 8

src: 2 7 0 0 0
tgt: 0 0 0 2 7



In [24]:
# white space tokenization of the test sources, then the test targets
test_xs, test_ys = (
    white_space_tokenizer(raw) for raw in (raw_test_xs, raw_test_ys)
)

In [25]:
# combine data sets to a dict: one {'xs', 'ys'} mapping per split
train_dict = {'xs': train_xs, 'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# splits keyed by name
data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# source and target token -> index tables kept side by side
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [26]:
# save output as json: both files are written under outdir
data_path = os.path.join(outdir, 'data.json')
vocab_path = os.path.join(outdir, 'vocab.json')

# data first, then vocabulary
for path, payload in ((data_path, data_dict), (vocab_path, vocab_dict)):
    save_json(path, payload)

In [20]:
# scratch example: the integers 0..9 laid out as a 2 x 5 array
a = np.reshape(np.arange(10), (2, 5))
a

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [38]:
# scratch example: column [1, 0] repeated 5 times along the last axis -> 2 x 5
b = np.repeat(np.arange(1, -1, -1).reshape(-1, 1), 5, axis=-1)
b

array([[1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0]])

In [42]:
# gather along the last axis: result[r, c] = a[r, b[r, c]]
# (b is all 1s in row 0 and all 0s in row 1, so row 0 repeats a[0, 1]
# and row 1 repeats a[1, 0])
np.take_along_axis(a, b, axis=-1)

array([[1, 1, 1, 1, 1],
       [5, 5, 5, 5, 5]])