# Data Preprocessing

## End2End Models
+ Naive RNN

## Notes:

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
import os
import numpy as np
from collections import Counter
from utils import load_txt, white_space_tokenizer

In [3]:
# dataset parameters
# These three values select which raw dataset directory is read and which
# output directory is written (they are formatted into both paths).
data_size = 10000  # used in the 'data_size_{}' path component
seq_len = 5  # used in the 'seq_len_{}' path component
vocab_size = 10  # used in the 'vocab_size_{}' path component

In [4]:
# input directory
# Layout: raw/vocab_size_<vocab_size>/seq_len_<seq_len>/data_size_<data_size>
indir = os.path.join(
    'raw',
    'vocab_size_{}'.format(vocab_size),
    'seq_len_{}'.format(seq_len),
    'data_size_{}'.format(data_size)
)
# bare expression so the notebook displays the resolved path
indir

'raw/vocab_size_10/seq_len_5/data_size_10000'

In [5]:
# save path
# Layout: end2end/vocab_size_<vocab_size>/seq_len_<seq_len>/data_size_<data_size>
outdir = 'end2end'

outdir = os.path.join(outdir, 'vocab_size_{}'.format(vocab_size), 
                      'seq_len_{}'.format(seq_len), 
                      'data_size_{}'.format(data_size))
# exist_ok=True creates the directory tree only when it is missing, without
# the check-then-create race of testing os.path.exists() first. It also
# raises if the path exists but is not a directory, instead of silently
# continuing with an unusable output location.
os.makedirs(outdir, exist_ok=True)
# bare expression so the notebook displays the resolved path
outdir

'end2end/vocab_size_10/seq_len_5/data_size_10000'

In [6]:
# read the raw inputs (x.txt) and their targets (y.txt) from the input directory
# NOTE(review): load_txt comes from the project-local utils module; it
# presumably returns one string per line -- confirm against utils.
x_path = os.path.join(indir, 'x.txt')
y_path = os.path.join(indir, 'y.txt')
raw_xs = load_txt(x_path)
raw_ys = load_txt(y_path)

In [7]:
# report how many samples and labels were loaded; the two counts should match
for label, entries in (('sample size', raw_xs), ('label size', raw_ys)):
    print(label, len(entries))

sample size 10000
label size 10000


In [8]:
# remove duplicates
# Drop repeated (input, output) pairs, keeping the first occurrence of each
# in file order. Going through list(set(...)) would also de-duplicate, but
# the iteration order of a set of strings changes between interpreter runs
# (string hashing is randomized), which makes the dataset order -- and every
# sample listing derived from it -- irreproducible.
seen_pairs = set()
dataset = []
for pair in zip(raw_xs, raw_ys):
    if pair not in seen_pairs:
        seen_pairs.add(pair)
        dataset.append(pair)
xs = [src for src, _ in dataset]
ys = [tgt for _, tgt in dataset]
# both counts equal the number of unique pairs
print(len(xs))
print(len(ys))

10000
10000


In [9]:
# preview the last ten (input, output) pairs, oldest of the ten first
for back in range(10, 0, -1):
    x = xs[-back]
    y = ys[-back]
    print('input:', x)
    print('output:', y)
    print()

input: 4 4 2 6 8
output: 4 - 4 / 2 + 6 == 8

input: 3 5 1 5 5
output: 3 / 5 / 1 + 5 == 5

input: 8 1 7 8 9
output: 8 * 1 / 7 + 8 == 9

input: 3 1 2 4 3
output: 3 + 1 * 2 / 4 == 3

input: 8 9 0 6 8
output: 8 + 9 * 0 * 6 == 8

input: 9 8 7 1 7
output: 9 / 8 * 7 * 1 == 7

input: 0 4 1 1 1
output: 0 / 4 / 1 + 1 == 1

input: 7 4 3 9 3
output: 7 - 4 + 3 / 9 == 3

input: 8 6 5 9 1
output: 8 / 6 + 5 / 9 == 1

input: 8 4 3 6 4
output: 8 * 4 / 3 - 6 == 4



In [10]:
# train / valid / test split (70% / 15% / remainder)
dataset = np.array(dataset)
data_size = dataset.shape[0]
# NOTE(review): the permutation is drawn from NumPy's global, unseeded RNG,
# so the split differs on every run -- seed it if a reproducible split is
# needed.
indices = np.random.permutation(data_size)
train_size = int(0.7*data_size)
valid_size = int(0.15*data_size)
# The test slice below takes every index left after train and valid, so its
# size is the remainder. Computing it as int(0.15*data_size) under-reports
# the size whenever truncation drops rows (e.g. data_size = 9999).
test_size = data_size - train_size - valid_size
train_idx = indices[:train_size]
valid_idx = indices[train_size:train_size+valid_size]
test_idx = indices[train_size+valid_size:]
train_set = dataset[train_idx, :]
valid_set = dataset[valid_idx, :]
test_set = dataset[test_idx, :]
# expected size next to actual row count, as a sanity check
print('train size', train_size, train_set.shape[0])
print('valid size', valid_size, valid_set.shape[0])
print('test size', test_size, test_set.shape[0])

train size 7000 7000
valid size 1500 1500
test size 1500 1500


## Training & Validation

In [11]:
# tokenize the combined train + valid data
# Train and valid rows are stacked into one array; column 0 holds the inputs
# and column 1 the outputs.
train_valid_set = np.vstack((train_set, valid_set))
source_column = train_valid_set[:, 0]
target_column = train_valid_set[:, 1]
# NOTE(review): white_space_tokenizer comes from the project-local utils
# module; it presumably splits each string on whitespace -- confirm there.
train_valid_x = white_space_tokenizer(source_column.tolist())
train_valid_y = white_space_tokenizer(target_column.tolist())

In [12]:
# token frequency distribution over the train + valid inputs and outputs
train_valid_counter = Counter()
# inputs are counted before outputs, token sequence by token sequence
for sequences in (train_valid_x, train_valid_y):
    for tokens in sequences:
        train_valid_counter.update(tokens)

# bare expression so the notebook displays tokens from most to least frequent
train_valid_counter.most_common()

[('0', 11740),
 ('1', 9740),
 ('2', 8754),
 ('/', 8754),
 ('3', 8614),
 ('==', 8500),
 ('4', 8224),
 ('5', 8054),
 ('6', 7762),
 ('7', 7726),
 ('8', 7240),
 ('9', 7146),
 ('+', 6122),
 ('-', 5356),
 ('*', 5268)]

In [14]:
# every distinct token seen in train + valid, in sorted (string) order
train_valid_vocab_list = sorted(train_valid_counter)
train_valid_vocab_list

['*', '+', '-', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '==']

In [15]:
# build the token -> index lookup from the train + valid vocabulary
# The four special tokens take the lowest indices.
src_vocab2idx_dict = {
    '<pad>': 0,  # pads sequences to a common length
    '<s>': 1,  # marks the start of a sequence
    '</s>': 2,  # marks the end of a sequence
    '<unk>': 3,  # stands in for an unknown word
}

# regular tokens continue numbering right after the special tokens,
# following the order of the sorted vocabulary list
first_free_idx = len(src_vocab2idx_dict)
for position, token in enumerate(train_valid_vocab_list):
    src_vocab2idx_dict[token] = first_free_idx + position

print(src_vocab2idx_dict)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '*': 4, '+': 5, '-': 6, '/': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, '==': 18}


In [None]:
# convert vocabulary to index
