# Data Preprocessing

## End2End Models
+ Naive GRU RNN
+ Naive LSTM RNN
+ Bi-directional GRU RNN
+ Bi-directional LSTM RNN
+ Bi-directional GRU RNN with Attention
+ Bi-directional LSTM RNN with Attention

## Notes:
+ Encoder and Decoder have separate embedding layers
+ There are two training methods, namely, online and offline

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from collections import Counter
# private
from utils import *

In [3]:
# define parameters
# These three values are interpolated into the input and output directory
# names built below, so they select which generated dataset is processed.
num_size = 100  # presumably the number of distinct integer tokens ('0'..'99' in the printed vocabulary) - TODO confirm
seq_len = 5  # presumably tokens per sequence (the previewed samples have five each) - TODO confirm
data_size = 10000  # presumably total samples across splits (7000 + 1500 + 1500 printed below) - TODO confirm

In [4]:
# load path: nss/num_size_<num_size>/seq_len_<seq_len>/data_size_<data_size>
indir = os.path.join(
    'nss',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# echo the resolved path as the cell output
indir

'nss/num_size_100/seq_len_5/data_size_10000'

In [5]:
# save path: end2end/num_size_<num_size>/seq_len_<seq_len>/data_size_<data_size>
outdir = os.path.join(
    'end2end',
    'num_size_{}'.format(num_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size),
)
# exist_ok=True creates the directory tree when missing and is a no-op when
# it already exists, which avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(outdir, exist_ok=True)
# echo the resolved path as the cell output
outdir

'end2end/num_size_100/seq_len_5/data_size_10000'

In [6]:
# load raw dataset: one load_txt call per split file, in the order listed
# NOTE(review): train_x.txt is not loaded; only the training targets are
# kept - presumably training sources are produced elsewhere, confirm.
raw_train_ys, raw_val_xs, raw_val_ys, raw_test_xs, raw_test_ys = (
    load_txt(os.path.join(indir, fname))
    for fname in (
        'train_y.txt',
        'val_x.txt',
        'val_y.txt',
        'test_x.txt',
        'test_y.txt',
    )
)

In [7]:
# check data size: report the number of target sequences in each split
for label, split in (
    ('train size', raw_train_ys),
    ('val size', raw_val_ys),
    ('test size', raw_test_ys),
):
    print(label, len(split))

train size 7000
val size 1500
test size 1500


### Train

In [8]:
# take a look at the last ten raw training targets
for offset in range(-10, 0):
    # the doubled newline leaves a blank line between samples
    print('tgt:', raw_train_ys[offset], end='\n\n')

tgt: 31 36 69 70 81

tgt: 6 17 23 79 97

tgt: 10 12 13 29 29

tgt: 46 46 51 66 75

tgt: 9 41 70 76 94

tgt: 8 12 44 78 93

tgt: 0 30 47 86 92

tgt: 17 42 71 82 83

tgt: 0 23 42 48 90

tgt: 35 64 66 71 85



In [9]:
# white space tokenization
# white_space_tokenizer comes from utils; presumably it splits each raw line
# on whitespace into a list of token strings (the val/test previews below
# print lists of str) - TODO confirm against utils.
train_ys = white_space_tokenizer(raw_train_ys)

In [10]:
# vocabulary frequency distribution over every token of the training targets
counter = Counter(token for tokens in train_ys for token in tokens)

# (token, count) pairs, most frequent first
print(counter.most_common())

[('72', 403), ('7', 402), ('43', 390), ('61', 388), ('81', 388), ('29', 386), ('6', 383), ('15', 383), ('41', 381), ('83', 380), ('38', 371), ('45', 370), ('49', 370), ('8', 369), ('9', 368), ('2', 368), ('60', 367), ('93', 367), ('17', 366), ('58', 364), ('94', 363), ('75', 362), ('70', 361), ('85', 361), ('3', 360), ('26', 359), ('16', 359), ('99', 359), ('47', 359), ('78', 359), ('40', 359), ('91', 358), ('74', 357), ('48', 356), ('21', 356), ('24', 356), ('13', 356), ('14', 355), ('1', 355), ('77', 355), ('34', 354), ('84', 353), ('10', 353), ('68', 353), ('56', 353), ('30', 352), ('89', 352), ('52', 352), ('46', 352), ('79', 350), ('0', 350), ('95', 349), ('27', 349), ('82', 348), ('54', 348), ('92', 348), ('62', 347), ('57', 346), ('5', 346), ('87', 346), ('53', 346), ('20', 345), ('12', 345), ('50', 344), ('44', 344), ('23', 343), ('67', 342), ('25', 341), ('96', 340), ('98', 340), ('66', 339), ('37', 339), ('33', 339), ('69', 338), ('22', 337), ('90', 334), ('35', 334), ('32', 

In [11]:
# Iterating a Counter yields its keys. The tokens are strings, so the order
# is lexicographic rather than numeric (e.g. '10' sorts before '2').
vocab_list = sorted(counter)
print(vocab_list)

['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']


In [12]:
# source vocabulary dictionary
src_vocab2idx_dict = {'<pad>': 0}  # to pad sequence length

# regular tokens follow the special token, numbered in vocab_list order
src_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(vocab_list, start=len(src_vocab2idx_dict))
)

print(src_vocab2idx_dict)

{'<pad>': 0, '0': 1, '1': 2, '10': 3, '11': 4, '12': 5, '13': 6, '14': 7, '15': 8, '16': 9, '17': 10, '18': 11, '19': 12, '2': 13, '20': 14, '21': 15, '22': 16, '23': 17, '24': 18, '25': 19, '26': 20, '27': 21, '28': 22, '29': 23, '3': 24, '30': 25, '31': 26, '32': 27, '33': 28, '34': 29, '35': 30, '36': 31, '37': 32, '38': 33, '39': 34, '4': 35, '40': 36, '41': 37, '42': 38, '43': 39, '44': 40, '45': 41, '46': 42, '47': 43, '48': 44, '49': 45, '5': 46, '50': 47, '51': 48, '52': 49, '53': 50, '54': 51, '55': 52, '56': 53, '57': 54, '58': 55, '59': 56, '6': 57, '60': 58, '61': 59, '62': 60, '63': 61, '64': 62, '65': 63, '66': 64, '67': 65, '68': 66, '69': 67, '7': 68, '70': 69, '71': 70, '72': 71, '73': 72, '74': 73, '75': 74, '76': 75, '77': 76, '78': 77, '79': 78, '8': 79, '80': 80, '81': 81, '82': 82, '83': 83, '84': 84, '85': 85, '86': 86, '87': 87, '88': 88, '89': 89, '9': 90, '90': 91, '91': 92, '92': 93, '93': 94, '94': 95, '95': 96, '96': 97, '97': 98, '98': 99, '99': 100}


In [13]:
# target vocabulary dictionary
tgt_vocab2idx_dict = {
    '<pad>': 0,  # to pad sequence length
    '<s>': 1,  # to mark the start of a sequence
    '</s>': 2,  # to mark the end of a sequence
}

# regular tokens follow the special tokens, numbered in vocab_list order
tgt_vocab2idx_dict.update(
    (token, idx)
    for idx, token in enumerate(vocab_list, start=len(tgt_vocab2idx_dict))
)

print(tgt_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '0': 3, '1': 4, '10': 5, '11': 6, '12': 7, '13': 8, '14': 9, '15': 10, '16': 11, '17': 12, '18': 13, '19': 14, '2': 15, '20': 16, '21': 17, '22': 18, '23': 19, '24': 20, '25': 21, '26': 22, '27': 23, '28': 24, '29': 25, '3': 26, '30': 27, '31': 28, '32': 29, '33': 30, '34': 31, '35': 32, '36': 33, '37': 34, '38': 35, '39': 36, '4': 37, '40': 38, '41': 39, '42': 40, '43': 41, '44': 42, '45': 43, '46': 44, '47': 45, '48': 46, '49': 47, '5': 48, '50': 49, '51': 50, '52': 51, '53': 52, '54': 53, '55': 54, '56': 55, '57': 56, '58': 57, '59': 58, '6': 59, '60': 60, '61': 61, '62': 62, '63': 63, '64': 64, '65': 65, '66': 66, '67': 67, '68': 68, '69': 69, '7': 70, '70': 71, '71': 72, '72': 73, '73': 74, '74': 75, '75': 76, '76': 77, '77': 78, '78': 79, '79': 80, '8': 81, '80': 82, '81': 83, '82': 84, '83': 85, '84': 86, '85': 87, '86': 88, '87': 89, '88': 90, '89': 91, '9': 92, '90': 93, '91': 94, '92': 95, '93': 96, '94': 97, '95': 98, '96': 99, '97': 100, '9

### Val

In [16]:
# white space tokenization of the validation sources, then targets
val_xs, val_ys = (
    white_space_tokenizer(raw) for raw in (raw_val_xs, raw_val_ys)
)

In [17]:
# take a look at the last ten tokenized validation pairs
for offset in range(-10, 0):
    print('src:', val_xs[offset])
    # the doubled newline leaves a blank line between pairs
    print('tgt:', val_ys[offset], end='\n\n')

src: ['88', '96', '60', '36', '5']
tgt: ['5', '36', '60', '88', '96']

src: ['54', '87', '13', '30', '57']
tgt: ['13', '30', '54', '57', '87']

src: ['48', '86', '80', '54', '58']
tgt: ['48', '54', '58', '80', '86']

src: ['82', '9', '40', '47', '45']
tgt: ['9', '40', '45', '47', '82']

src: ['78', '19', '60', '18', '9']
tgt: ['9', '18', '19', '60', '78']

src: ['42', '38', '25', '64', '9']
tgt: ['9', '25', '38', '42', '64']

src: ['39', '44', '25', '31', '18']
tgt: ['18', '25', '31', '39', '44']

src: ['68', '18', '71', '58', '73']
tgt: ['18', '58', '68', '71', '73']

src: ['42', '73', '79', '23', '52']
tgt: ['23', '42', '52', '73', '79']

src: ['13', '56', '28', '3', '11']
tgt: ['3', '11', '13', '28', '56']



### Test

In [18]:
# white space tokenization of the test sources, then targets
test_xs, test_ys = (
    white_space_tokenizer(raw) for raw in (raw_test_xs, raw_test_ys)
)

In [19]:
# take a look at the last ten tokenized test pairs
for offset in range(-10, 0):
    print('src:', test_xs[offset])
    # the doubled newline leaves a blank line between pairs
    print('tgt:', test_ys[offset], end='\n\n')

src: ['82', '29', '48', '74', '51']
tgt: ['29', '48', '51', '74', '82']

src: ['66', '88', '22', '35', '31']
tgt: ['22', '31', '35', '66', '88']

src: ['91', '89', '10', '68', '1']
tgt: ['1', '10', '68', '89', '91']

src: ['28', '98', '90', '72', '61']
tgt: ['28', '61', '72', '90', '98']

src: ['82', '10', '13', '73', '97']
tgt: ['10', '13', '73', '82', '97']

src: ['29', '2', '43', '17', '43']
tgt: ['2', '17', '29', '43', '43']

src: ['62', '83', '64', '55', '53']
tgt: ['53', '55', '62', '64', '83']

src: ['72', '22', '49', '40', '68']
tgt: ['22', '40', '49', '68', '72']

src: ['43', '39', '38', '19', '84']
tgt: ['19', '38', '39', '43', '84']

src: ['36', '54', '53', '90', '13']
tgt: ['13', '36', '53', '54', '90']



In [20]:
# combine data sets to a dict
# the training split carries only targets; val and test carry both sides
train_dict = {'ys': train_ys}
val_dict = {'xs': val_xs, 'ys': val_ys}
test_dict = {'xs': test_xs, 'ys': test_ys}

# splits keyed by name
data_dict = {
    'train': train_dict,
    'val': val_dict,
    'test': test_dict,
}

# token-to-index mappings for the source and target sides
vocab_dict = {
    'src': src_vocab2idx_dict,
    'tgt': tgt_vocab2idx_dict,
}

In [21]:
# save output as json: both files are written inside outdir
data_path, vocab_path = (
    os.path.join(outdir, fname) for fname in ('data.json', 'vocab.json')
)

# save_json comes from utils and takes (path, object)
save_json(data_path, data_dict)
save_json(vocab_path, vocab_dict)