# Data Preparation
1. SCAN

In [21]:
import os
import random
from tqdm import tqdm, trange
from collections import Counter

import numpy as np

import utils

%load_ext autoreload 
%autoreload 2
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# SCAN

In [22]:
MAX_NUM_PRIMITIVE = 10
TRAIN_SIZE_RATE = 0.8
OVER_SAMPLING = 1 # batch size

In [24]:
NUM_PRIMITIVE = 2
primitive_xs = ['jump', 'run', 'look', 'walk']
primitive_ys = ['I_JUMP', 'I_RUN', 'I_LOOK', 'I_WALK']
extra_primitive_xs = primitive_xs[:NUM_PRIMITIVE]
extra_primitive_ys = primitive_ys[:NUM_PRIMITIVE]

## Input

In [3]:
# I/o
RAW = 'raw'
SCAN = os.path.join(RAW, 'scan')
SCAN_RAW_TXT = os.path.join(SCAN, 'tasks.txt')

EXP1 = 'exp1'
EXP2 = 'exp2'
EXP3 = 'exp3'
EXP4 = 'exp4'
EXP5 = 'exp5'

In [4]:
scan_raw_data = utils.load_txt_to_list(SCAN_RAW_TXT)
len(scan_raw_data)
scan_raw_xs = list(set(scan_raw_data))
print(len(scan_raw_data))

20910


In [5]:
random.choice(scan_raw_data)

'IN: look left after run left thrice OUT: I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_LOOK'

In [6]:
scan_raw_xs, scan_raw_ys = utils.scan_parser(scan_raw_data)

In [7]:
idx = random.choice(range(len(scan_raw_data)))
print(idx, '/', len(scan_raw_data))
print(scan_raw_xs[idx])
print(scan_raw_ys[idx])

11604 / 20910
['look', 'opposite', 'left', 'twice', 'and', 'run', 'around', 'left', 'thrice']
['I_TURN_LEFT', 'I_TURN_LEFT', 'I_LOOK', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_LOOK', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN', 'I_TURN_LEFT', 'I_RUN']


In [8]:
' '.join(scan_raw_xs[idx])

'look opposite left twice and run around left thrice'

In [9]:
' '.join(scan_raw_ys[idx])

'I_TURN_LEFT I_TURN_LEFT I_LOOK I_TURN_LEFT I_TURN_LEFT I_LOOK I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN I_TURN_LEFT I_RUN'

In [10]:
# source vocabulary
scan_src_counter = Counter()
for x in scan_raw_xs:
    scan_src_counter.update(x)
    
print(scan_src_counter.most_common())

[('right', 18405), ('left', 18405), ('thrice', 13906), ('twice', 13906), ('opposite', 12270), ('around', 12270), ('after', 10404), ('and', 10404), ('walk', 8589), ('run', 8589), ('look', 8589), ('jump', 8589), ('turn', 7362)]


In [11]:
# target vocabulary
scan_tgt_counter = Counter()
for y in scan_raw_ys:
    scan_tgt_counter.update(y)
    
print(scan_tgt_counter.most_common())

[('I_TURN_RIGHT', 85890), ('I_TURN_LEFT', 85890), ('I_RUN', 31902), ('I_WALK', 31902), ('I_LOOK', 31902), ('I_JUMP', 31902)]


## Primitive Standardization
+ run
+ jump
+ look
+ walk

In [12]:
verbs = ['run', 'jump', 'look', 'walk', 'turn']

In [13]:
max_num_primitives = 0
for x in scan_raw_xs:
    c = Counter()
    c.update(x)
    m = 0
    for k in c:
        if k in verbs:
            m += 1
    max_num_primitives = max(max_num_primitives, m)

In [14]:
max_num_primitives

2

In [15]:
# 1. remove primitive pair
# ['look'] -> ['I_LOOK']
# ['jump'] -> ['I_JUMP']
# ['run'] -> ['I_RUN']
# ['walk'] -> ['I_WALK']
# 2. split lv1 and lv2 samples
max_num_primitives = 0
scan_l1_idxes, scan_l2_idxes = [], []
for i, (x, y) in enumerate(zip(scan_raw_xs, scan_raw_ys)):
    c = Counter()
    c.update(x)
    m = 0
    for k in c:
        if k in verbs:
            m += 1
    max_num_primitives = max(max_num_primitives, m)
    if m == 1:
        if len(x) > 1:
            scan_l1_idxes.append(i)
        else:
            print(x, '->', y)
    elif m == 2:
        scan_l2_idxes.append(i)
    else:
        print(x)

['look'] -> ['I_LOOK']
['jump'] -> ['I_JUMP']
['run'] -> ['I_RUN']
['walk'] -> ['I_WALK']


In [16]:
len(scan_l1_idxes)

4274

In [17]:
len(scan_l2_idxes)

16632

In [18]:
len(scan_raw_xs)

20910

In [19]:
(len(scan_l1_idxes) + len(scan_l2_idxes) + 4) / len(scan_raw_xs)

1.0

## Full Data

In [29]:
primitives = ['jump', '']

In [30]:
scan_raw_l12_xs = np.array(scan_raw_xs, dtype=object)[scan_l1_idxes + scan_l2_idxes].tolist()
scan_raw_l12_ys = np.array(scan_raw_ys, dtype=object)[scan_l1_idxes + scan_l2_idxes].tolist()
print(len(scan_raw_l12_xs))
print(len(scan_raw_l12_ys))

20906
20906


In [31]:
scan_raw_l12_src_lens = [len(x) for x in scan_raw_l12_xs]
min(scan_raw_l12_src_lens)

2

In [32]:
# take a look
idx = random.choice(range(len(scan_raw_l12_ys)))
print(' '.join(scan_raw_l12_xs[idx]))
print(' '.join(scan_raw_l12_ys[idx]))

walk around right twice after walk twice
I_WALK I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK


In [33]:
MAX_NUM_PRIMITIVE

10

In [34]:
scan_l12_xs = scan_raw_l12_xs.copy()
scan_l12_ys = scan_raw_l12_ys.copy()

extra_data = set()

for n in trange(MAX_NUM_PRIMITIVE):
    for p in primitives:
        new_p = p + '_{}'.format(n)
        for x, y in zip(scan_raw_l12_xs, scan_raw_l12_ys):
            if p in x:
                new_x = ' '.join(x).replace(p, new_p).split()
                scan_l12_xs.append(new_x)
                scan_l12_ys.append(y)
                extra_data.add(' '.join(new_x))

100%|██████████| 10/10 [00:00<00:00, 36.49it/s]


In [36]:
print(len(scan_l12_xs))
print(len(scan_l12_ys))

97966
97966


In [37]:
# source vocabulary
scan_src_counter = Counter()
for x in scan_l12_xs:
    scan_src_counter.update(x)

In [38]:
# soruce vocabulary dictionary
src_vocab2idx_dict = dict()
src_vocab2idx_dict['<pad>'] = 0 # to pad sequence length

i = len(src_vocab2idx_dict)
for token in scan_src_counter:
    src_vocab2idx_dict[token] = i
    i += 1

print(len(src_vocab2idx_dict))

24


In [39]:
# target vocabulary
scan_tgt_counter = Counter()
for y in scan_l12_ys:
    scan_tgt_counter.update(y)

In [40]:
# target vocabulary dictionary
tgt_vocab2idx_dict = dict()
tgt_vocab2idx_dict['<pad>'] = 0 # to pad sequence length
tgt_vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence
tgt_vocab2idx_dict['</s>'] = 2 # to mark the end of a sequence

i = len(tgt_vocab2idx_dict)
for token in scan_tgt_counter:
    tgt_vocab2idx_dict[token] = i
    i += 1

print(len(tgt_vocab2idx_dict))

9


In [41]:
vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

###  Train Test Split

In [42]:
extra_primitive_xs = ['jump']
extra_primitive_ys = ['I_JUMP']

In [43]:
new_primitives = [p + '_{}'.format(i) for i in range(MAX_NUM_PRIMITIVE) for p in extra_primitive_xs]
new_primitives

['jump_0',
 'jump_1',
 'jump_2',
 'jump_3',
 'jump_4',
 'jump_5',
 'jump_6',
 'jump_7',
 'jump_8',
 'jump_9']

In [44]:
train_idxes = []
test_idxes = []
for i, x in enumerate(scan_l12_xs):
    if set(new_primitives) & set(x):
        test_idxes.append(i)
    else:
        train_idxes.append(i)

In [45]:
(len(train_idxes), len(test_idxes))

(20906, 77060)

In [46]:
train_xs = np.array(scan_l12_xs, dtype=object)[train_idxes].tolist()
train_ys = np.array(scan_l12_ys, dtype=object)[train_idxes].tolist()
print(len(train_xs))
print(len(train_ys))

20906
20906


In [47]:
test_xs = np.array(scan_l12_xs, dtype=object)[test_idxes].tolist()
test_ys = np.array(scan_l12_ys, dtype=object)[test_idxes].tolist()
print(len(test_xs))
print(len(test_ys))

77060
77060


In [48]:
test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

### Experiment 1

In [49]:
OVER_SAMPLING

1

In [50]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs += [[new_p_x]] * OVER_SAMPLING
            new_train_ys += [[p_y]] * OVER_SAMPLING
    new_train_xs += [[x] for x in extra_primitive_xs]
    new_train_ys += [[y] for y in extra_primitive_ys]
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [51]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 77060
0 20907
5 20912
10 20917


In [52]:
# save output as json
out_path = os.path.join(EXP1, 'scan_l12')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

### Experiment 2

In [53]:
OVER_SAMPLING

1

In [54]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs += [[new_p_x]] * OVER_SAMPLING
            new_train_ys += [[p_y]] * OVER_SAMPLING
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [55]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 77060
0 20906
5 20911
10 20916


In [58]:
for x in tqdm(data_dict['10']['xs']):
    if x not in data_dict['0']['xs']:
        print(x)

100%|██████████| 20916/20916 [00:10<00:00, 2011.75it/s]

['jump_0']
['jump_1']
['jump_2']
['jump_3']
['jump_4']
['jump_5']
['jump_6']
['jump_7']
['jump_8']
['jump_9']





In [59]:
# save output as json
out_path = os.path.join(EXP2, 'scan_l12')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

### Experiment 3

In [60]:
extra_primitive_x = '{} left'
extra_primitive_y = 'I_TURN_LEFT {}'

In [62]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs += [extra_primitive_x.format(new_p_x).split()] * OVER_SAMPLING
            new_train_ys += [extra_primitive_y.format(p_y).split()] * OVER_SAMPLING
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [63]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 77060
0 20906
5 20911
10 20916


In [64]:
for x in tqdm(data_dict['10']['xs']):
    if x not in data_dict['0']['xs']:
        print(x)

100%|██████████| 20916/20916 [00:09<00:00, 2100.64it/s]

['jump_0', 'left']
['jump_1', 'left']
['jump_2', 'left']
['jump_3', 'left']
['jump_4', 'left']
['jump_5', 'left']
['jump_6', 'left']
['jump_7', 'left']
['jump_8', 'left']
['jump_9', 'left']





In [65]:
# save output as json
out_path = os.path.join(EXP3, 'scan_l12')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

### Experiment 4

In [66]:
extra_primitive_x = '{} left'
extra_primitive_y = 'I_TURN_LEFT {}'

In [67]:
# test
new_test_xs, new_test_ys = [], []
for i in range(MAX_NUM_PRIMITIVE):
    for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
        new_p_x = p_x + '_{}'.format(i)
        new_test_xs.append([new_p_x])
        new_test_ys.append([p_y])

new_test_xs.append(['left'])
new_test_ys.append(['I_TURN_LEFT'])

new_test_dict = {}
new_test_dict['xs'] = new_test_xs
new_test_dict['ys'] = new_test_ys

In [68]:
new_test_xs

[['jump_0'],
 ['jump_1'],
 ['jump_2'],
 ['jump_3'],
 ['jump_4'],
 ['jump_5'],
 ['jump_6'],
 ['jump_7'],
 ['jump_8'],
 ['jump_9'],
 ['left']]

In [69]:
data_dict = dict()
data_dict['test'] = new_test_dict

train_dict = {}
new_train_xs = train_xs.copy()
new_train_ys = train_ys.copy()

for i in range(MAX_NUM_PRIMITIVE):
    for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
        new_p_x = p_x + '_{}'.format(i)
        new_train_xs += [extra_primitive_x.format(new_p_x).split()] * OVER_SAMPLING
        new_train_ys += [extra_primitive_y.format(p_y).split()] * OVER_SAMPLING

new_train_xs += [[x] for x in extra_primitive_xs]
new_train_ys += [[y] for y in extra_primitive_ys]

train_dict['xs'] = new_train_xs
train_dict['ys'] = new_train_ys
data_dict[str(MAX_NUM_PRIMITIVE)] = train_dict

In [71]:
len(data_dict['10']['xs'])

20917

In [73]:
new_train_xs[-20:]

[['turn', 'left', 'twice', 'after', 'jump'],
 ['walk', 'right', 'thrice', 'and', 'run', 'opposite', 'right', 'thrice'],
 ['jump', 'left', 'twice', 'after', 'look'],
 ['run', 'right', 'and', 'jump', 'opposite', 'left'],
 ['look', 'around', 'left', 'after', 'walk', 'around', 'left', 'thrice'],
 ['jump',
  'around',
  'right',
  'thrice',
  'after',
  'run',
  'opposite',
  'right',
  'twice'],
 ['jump', 'around', 'left', 'twice', 'and', 'run', 'left'],
 ['look', 'opposite', 'left', 'thrice', 'and', 'walk', 'thrice'],
 ['walk', 'around', 'left', 'twice', 'after', 'look', 'right', 'thrice'],
 ['jump_0', 'left'],
 ['jump_1', 'left'],
 ['jump_2', 'left'],
 ['jump_3', 'left'],
 ['jump_4', 'left'],
 ['jump_5', 'left'],
 ['jump_6', 'left'],
 ['jump_7', 'left'],
 ['jump_8', 'left'],
 ['jump_9', 'left'],
 ['jump']]

In [74]:
# save output as json
out_path = os.path.join(EXP4, 'scan_l12')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

## Numeber of Primitive # 1
0, 5, 10, full

In [26]:
primitives = ['run', 'jump', 'look', 'walk']
primitives = ['run', 'jump']


In [27]:
scan_raw_l1_xs = np.array(scan_raw_xs, dtype=object)[scan_l1_idxes].tolist()
scan_raw_l1_ys = np.array(scan_raw_ys, dtype=object)[scan_l1_idxes].tolist()
print(len(scan_raw_l1_xs))
print(len(scan_raw_l1_ys))

4274
4274


In [28]:
scan_raw_l1_src_lens = [len(x) for x in scan_raw_l1_xs]
min(scan_raw_l1_src_lens)

2

In [29]:
# take a look
idx = random.choice(range(len(scan_raw_l1_ys)))
print(' '.join(scan_raw_l1_xs[idx]))
print(' '.join(scan_raw_l1_ys[idx]))

look opposite right thrice after look around right twice
I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_TURN_RIGHT I_LOOK


In [30]:
MAX_NUM_PRIMITIVE

10

In [31]:
scan_l1_xs = scan_raw_l1_xs.copy()
scan_l1_ys = scan_raw_l1_ys.copy()

for n in trange(MAX_NUM_PRIMITIVE):
    for p in primitives:
        new_p = p + '_{}'.format(n)
        for x, y in zip(scan_raw_l1_xs, scan_raw_l1_ys):
            if p in x:
                new_x = ' '.join(x).replace(p, new_p).split()
                scan_l1_xs.append(new_x)
                scan_l1_ys.append(y)

100%|██████████| 10/10 [00:00<00:00, 120.26it/s]


In [32]:
print(len(scan_l1_xs))
print(len(scan_l1_ys))

22314
22314


In [33]:
# source vocabulary
scan_src_counter = Counter()
for x in scan_l1_xs:
    scan_src_counter.update(x)

In [34]:
# soruce vocabulary dictionary
src_vocab2idx_dict = dict()
src_vocab2idx_dict['<pad>'] = 0 # to pad sequence length

i = len(src_vocab2idx_dict)
for token in scan_src_counter:
    src_vocab2idx_dict[token] = i
    i += 1

print(len(src_vocab2idx_dict))

34


In [35]:
# target vocabulary
scan_tgt_counter = Counter()
for y in scan_l1_ys:
    scan_tgt_counter.update(y)

In [36]:
# target vocabulary dictionary
tgt_vocab2idx_dict = dict()
tgt_vocab2idx_dict['<pad>'] = 0 # to pad sequence length
tgt_vocab2idx_dict['<s>'] = 1 # to mark the start of a sequence
tgt_vocab2idx_dict['</s>'] = 2 # to mark the end of a sequence

i = len(tgt_vocab2idx_dict)
for token in scan_tgt_counter:
    tgt_vocab2idx_dict[token] = i
    i += 1

print(len(tgt_vocab2idx_dict))

9


In [37]:
tgt_vocab2idx_dict

{'<pad>': 0,
 '<s>': 1,
 '</s>': 2,
 'I_TURN_RIGHT': 3,
 'I_LOOK': 4,
 'I_TURN_LEFT': 5,
 'I_RUN': 6,
 'I_WALK': 7,
 'I_JUMP': 8}

In [38]:
vocab_dict = dict()
vocab_dict['src'] = src_vocab2idx_dict
vocab_dict['tgt'] = tgt_vocab2idx_dict

###  Train Test Split

In [39]:
extra_primitive_xs = ['run', 'jump', 'look', 'walk']
extra_primitive_ys = ['I_RUN', 'I_JUMP', 'I_LOOK', 'I_WALK']
extra_primitive_xs = extra_primitive_xs[:2]
extra_primitive_ys = extra_primitive_ys[:2]

In [41]:
extra_primitive_ys

['I_RUN', 'I_JUMP']

In [114]:
new_primitives = [p + '_{}'.format(i) for i in range(MAX_NUM_PRIMITIVE) for p in extra_primitive_xs]

In [115]:
new_primitives

['run_0',
 'jump_0',
 'look_0',
 'walk_0',
 'run_1',
 'jump_1',
 'look_1',
 'walk_1',
 'run_2',
 'jump_2',
 'look_2',
 'walk_2',
 'run_3',
 'jump_3',
 'look_3',
 'walk_3',
 'run_4',
 'jump_4',
 'look_4',
 'walk_4',
 'run_5',
 'jump_5',
 'look_5',
 'walk_5',
 'run_6',
 'jump_6',
 'look_6',
 'walk_6',
 'run_7',
 'jump_7',
 'look_7',
 'walk_7',
 'run_8',
 'jump_8',
 'look_8',
 'walk_8',
 'run_9',
 'jump_9',
 'look_9',
 'walk_9']

In [116]:
train_idxes = []
test_idxes = []
for i, x in enumerate(scan_l1_xs):
    if set(new_primitives) & set(x):
        test_idxes.append(i)
    else:
        train_idxes.append(i)

In [117]:
(len(train_idxes), len(test_idxes))

(4274, 36080)

In [118]:
train_xs = np.array(scan_l1_xs, dtype=object)[train_idxes].tolist()
train_ys = np.array(scan_l1_ys, dtype=object)[train_idxes].tolist()
print(len(train_xs))
print(len(train_ys))

4274
4274


In [119]:
test_xs = np.array(scan_l1_xs, dtype=object)[test_idxes].tolist()
test_ys = np.array(scan_l1_ys, dtype=object)[test_idxes].tolist()
print(len(test_xs))
print(len(test_ys))

36080
36080


In [120]:
test_dict = {}
test_dict['xs'] = test_xs
test_dict['ys'] = test_ys

### Experiment 1

In [121]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs += [[new_p_x]] * OVER_SAMPLING
            new_train_ys += [[p_y]] * OVER_SAMPLING
    new_train_xs += [[x] for x in extra_primitive_xs]
    new_train_ys += [[y] for y in extra_primitive_ys]
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [122]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 36080
0 4278
5 6838
10 9398


In [123]:
# for x in data_dict['10']['xs']:
#     if x not in data_dict['0']['xs']:
#         print(x)
set([' '.join(x) for x in data_dict['10']['xs'] if x not in data_dict['0']['xs']])

{'jump_0',
 'jump_1',
 'jump_2',
 'jump_3',
 'jump_4',
 'jump_5',
 'jump_6',
 'jump_7',
 'jump_8',
 'jump_9',
 'look_0',
 'look_1',
 'look_2',
 'look_3',
 'look_4',
 'look_5',
 'look_6',
 'look_7',
 'look_8',
 'look_9',
 'run_0',
 'run_1',
 'run_2',
 'run_3',
 'run_4',
 'run_5',
 'run_6',
 'run_7',
 'run_8',
 'run_9',
 'walk_0',
 'walk_1',
 'walk_2',
 'walk_3',
 'walk_4',
 'walk_5',
 'walk_6',
 'walk_7',
 'walk_8',
 'walk_9'}

In [124]:
# for x in data_dict['10']['xs']:
#     if len(x) == 1:
#         print(x)

In [125]:
# save output as json
out_path = os.path.join(EXP1, 'scan_l1')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

#### Data Exploration

In [126]:
# source vocab
c = Counter()
for x in data_dict['10']['xs']:
    c.update(x)
print(c.most_common())

[('right', 3717), ('left', 3717), ('thrice', 2818), ('twice', 2818), ('opposite', 2478), ('around', 2478), ('and', 2088), ('after', 2088), ('look', 1785), ('run', 1785), ('walk', 1785), ('jump', 1785), ('turn', 1314), ('run_0', 1), ('jump_0', 1), ('look_0', 1), ('walk_0', 1), ('run_1', 1), ('jump_1', 1), ('look_1', 1), ('walk_1', 1), ('run_2', 1), ('jump_2', 1), ('look_2', 1), ('walk_2', 1), ('run_3', 1), ('jump_3', 1), ('look_3', 1), ('walk_3', 1), ('run_4', 1), ('jump_4', 1), ('look_4', 1), ('walk_4', 1), ('run_5', 1), ('jump_5', 1), ('look_5', 1), ('walk_5', 1), ('run_6', 1), ('jump_6', 1), ('look_6', 1), ('walk_6', 1), ('run_7', 1), ('jump_7', 1), ('look_7', 1), ('walk_7', 1), ('run_8', 1), ('jump_8', 1), ('look_8', 1), ('walk_8', 1), ('run_9', 1), ('jump_9', 1), ('look_9', 1), ('walk_9', 1)]


In [127]:
run_x = [x for x in data_dict['10']['xs'] if 'run' in ''.join(x)]
jump_x = [x for x in data_dict['10']['xs'] if 'jump' in ''.join(x)]

In [128]:
len(run_x)

913

In [129]:
len(jump_x)

913

In [130]:
for i in range(913):
    run_x[i] = [tk for tk in run_x[i] if 'run' not in tk]
    jump_x[i] = [tk for tk in jump_x[i] if 'jump' not in tk]

In [132]:
run_x = [' '.join(x) for x in run_x]
jump_x = [' '.join(x) for x in jump_x]

In [None]:
run_x = 

In [133]:
set(run_x) - set(jump_x)

set()

In [76]:
# source length
x_lens = [len(x) for x in data_dict['10']['xs']]
c = Counter()
c.update(x_lens)
print(c.most_common())

[(8, 1280), (7, 1216), (6, 704), (9, 640), (5, 264), (4, 104), (3, 48), (1, 44), (2, 18)]


In [77]:
for x in data_dict['10']['xs']:
    if len(x) == 1:
        print(x)

['run_0']
['jump_0']
['look_0']
['walk_0']
['run_1']
['jump_1']
['look_1']
['walk_1']
['run_2']
['jump_2']
['look_2']
['walk_2']
['run_3']
['jump_3']
['look_3']
['walk_3']
['run_4']
['jump_4']
['look_4']
['walk_4']
['run_5']
['jump_5']
['look_5']
['walk_5']
['run_6']
['jump_6']
['look_6']
['walk_6']
['run_7']
['jump_7']
['look_7']
['walk_7']
['run_8']
['jump_8']
['look_8']
['walk_8']
['run_9']
['jump_9']
['look_9']
['walk_9']
['run']
['jump']
['look']
['walk']


### Experiment 2
The same as Experiment 3 in Brenden Lake 2018 ICML

In [42]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs.append([new_p_x])
            new_train_ys.append([p_y])
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [43]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 36080
0 4274
5 4294
10 4314


In [44]:
for x in data_dict['5']['xs']:
    if x not in data_dict['0']['xs']:
        print(x)

['run_0']
['jump_0']
['look_0']
['walk_0']
['run_1']
['jump_1']
['look_1']
['walk_1']
['run_2']
['jump_2']
['look_2']
['walk_2']
['run_3']
['jump_3']
['look_3']
['walk_3']
['run_4']
['jump_4']
['look_4']
['walk_4']


In [45]:
# save output as json
out_path = os.path.join(EXP2, 'scan_l1')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

### Experiment 3

In [46]:
extra_primitive_x = '{} left'
extra_primitive_y = 'I_TURN_LEFT {}'

In [47]:
data_dict = dict()
data_dict['test'] = test_dict

for t in ['0', '5', '10']:
    train_dict = {}
    new_train_xs = train_xs.copy()
    new_train_ys = train_ys.copy()
    for i in range(int(t)):
        for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
            new_p_x = p_x + '_{}'.format(i)
            new_train_xs.append(extra_primitive_x.format(new_p_x).split())
            new_train_ys.append(extra_primitive_y.format(p_y).split())
    train_dict['xs'] = new_train_xs
    train_dict['ys'] = new_train_ys
    data_dict[t] = train_dict

In [48]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 36080
0 4274
5 4294
10 4314


In [49]:
for x in data_dict['5']['xs']:
    if x not in data_dict['0']['xs']:
        print(x)

['run_0', 'left']
['jump_0', 'left']
['look_0', 'left']
['walk_0', 'left']
['run_1', 'left']
['jump_1', 'left']
['look_1', 'left']
['walk_1', 'left']
['run_2', 'left']
['jump_2', 'left']
['look_2', 'left']
['walk_2', 'left']
['run_3', 'left']
['jump_3', 'left']
['look_3', 'left']
['walk_3', 'left']
['run_4', 'left']
['jump_4', 'left']
['look_4', 'left']
['walk_4', 'left']


In [50]:
# save output as json
out_path = os.path.join(EXP3, 'scan_l1')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

### Experiment 4

In [37]:
extra_primitive_x = '{} left'
extra_primitive_y = 'I_TURN_LEFT {}'

In [38]:
# test
new_test_xs, new_test_ys = [], []
for i in range(MAX_NUM_PRIMITIVE):
    for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
        new_p_x = p_x + '_{}'.format(i)
        new_test_xs.append([new_p_x])
        new_test_ys.append([p_y])

new_test_xs.append(['left'])
new_test_ys.append(['I_TURN_LEFT'])

new_test_dict = {}
new_test_dict['xs'] = new_test_xs
new_test_dict['ys'] = new_test_ys

In [60]:
new_test_xs

[['run_0'],
 ['jump_0'],
 ['look_0'],
 ['walk_0'],
 ['run_1'],
 ['jump_1'],
 ['look_1'],
 ['walk_1'],
 ['run_2'],
 ['jump_2'],
 ['look_2'],
 ['walk_2'],
 ['run_3'],
 ['jump_3'],
 ['look_3'],
 ['walk_3'],
 ['run_4'],
 ['jump_4'],
 ['look_4'],
 ['walk_4'],
 ['run_5'],
 ['jump_5'],
 ['look_5'],
 ['walk_5'],
 ['run_6'],
 ['jump_6'],
 ['look_6'],
 ['walk_6'],
 ['run_7'],
 ['jump_7'],
 ['look_7'],
 ['walk_7'],
 ['run_8'],
 ['jump_8'],
 ['look_8'],
 ['walk_8'],
 ['run_9'],
 ['jump_9'],
 ['look_9'],
 ['walk_9'],
 ['left']]

In [57]:
data_dict = dict()
data_dict['test'] = new_test_dict

train_dict = {}
new_train_xs = train_xs.copy()
new_train_ys = train_ys.copy()

for i in range(MAX_NUM_PRIMITIVE):
    for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
        new_p_x = p_x + '_{}'.format(i)
        new_train_xs.append(extra_primitive_x.format(new_p_x).split())
        new_train_ys.append(extra_primitive_y.format(p_y).split())

new_train_xs += [[x] for x in extra_primitive_xs]
new_train_ys += [[y] for y in extra_primitive_ys]

train_dict['xs'] = new_train_xs
train_dict['ys'] = new_train_ys
data_dict[str(MAX_NUM_PRIMITIVE)] = train_dict

In [58]:
for k in data_dict.keys():
    print(k, len(data_dict[k]['xs']))

test 41
10 4318


In [59]:
for x in data_dict['10']['xs']:
    if x not in train_xs:
        print(x)

['run_0', 'left']
['jump_0', 'left']
['look_0', 'left']
['walk_0', 'left']
['run_1', 'left']
['jump_1', 'left']
['look_1', 'left']
['walk_1', 'left']
['run_2', 'left']
['jump_2', 'left']
['look_2', 'left']
['walk_2', 'left']
['run_3', 'left']
['jump_3', 'left']
['look_3', 'left']
['walk_3', 'left']
['run_4', 'left']
['jump_4', 'left']
['look_4', 'left']
['walk_4', 'left']
['run_5', 'left']
['jump_5', 'left']
['look_5', 'left']
['walk_5', 'left']
['run_6', 'left']
['jump_6', 'left']
['look_6', 'left']
['walk_6', 'left']
['run_7', 'left']
['jump_7', 'left']
['look_7', 'left']
['walk_7', 'left']
['run_8', 'left']
['jump_8', 'left']
['look_8', 'left']
['walk_8', 'left']
['run_9', 'left']
['jump_9', 'left']
['look_9', 'left']
['walk_9', 'left']
['run']
['jump']
['look']
['walk']


In [55]:
for x in new_train_xs:
    if '_' in ''.join(x):
        print(x)

['run_9', 'left']
['jump_9', 'left']
['look_9', 'left']
['walk_9', 'left']


In [61]:
# save output as json
out_path = os.path.join(EXP4, 'scan_l1')
if not os.path.exists(out_path): os.makedirs(out_path)

utils.save_json(os.path.join(out_path, 'data.json'), data_dict)
utils.save_json(os.path.join(out_path, 'vocab.json'), vocab_dict)

###  Full Data

In [29]:
# # train test split
# data_size = len(scan_l1_xs)
# train_size = int(TRAIN_SIZE_RATE * data_size)
# idx_pool = np.random.permutation(data_size)
# idx_pools = np.split(idx_pool, [train_size, data_size])
# train_pool = set(idx_pools[0])
# test_pool = set(idx_pools[1])
# print('train size:', len(train_pool), 'test size:', len(test_pool))
# train_xs = [x for i, x in enumerate(scan_l1_xs) if i in train_pool]
# train_ys = [y for i, y in enumerate(scan_l1_ys) if i in train_pool]
# test_xs = [x for i, x in enumerate(scan_l1_xs) if i in test_pool]
# test_ys = [y for i, y in enumerate(scan_l1_ys) if i in test_pool]
# print('data_size', data_size)
# print('train_size', len(train_xs))
# print('test_size', len(test_ys))

In [30]:
# # combine data sets to a dict
# train_full_dict = {}
# train_full_dict['xs'] = train_xs
# train_full_dict['ys'] = train_ys

# test_dict = {}
# test_dict['xs'] = test_xs
# test_dict['ys'] = test_ys

# data_dict = dict()
# data_dict['full'] = train_full_dict
# data_dict['test'] = test_dict

# vocab_dict = dict()
# vocab_dict['src'] = src_vocab2idx_dict
# vocab_dict['tgt'] = tgt_vocab2idx_dict

### Incremental Data

In [31]:
# extra_primitive_x = '{} left'
# extra_primitive_y = 'I_TURN_LEFT {}'
# extra_primitive_xs = ['run', 'jump', 'look', 'walk']
# extra_primitive_ys = ['I_RUN', 'I_JUMP', 'I_LOOK', 'I_WALK']

In [32]:
# new_primitives = [p + '_{}'.format(i) for i in range(MAX_NUM_PRIMITIVE) for p in primitives]

In [33]:
# # remove all primitive commands without primitive left
# valid_idxes = []
# for i, x in enumerate(tqdm(train_xs)):
#     if set(new_primitives) & set(x):
#         continue
#     valid_idxes.append(i)

In [34]:
# print(len(valid_idxes))

In [35]:
# train_xs = np.array(train_xs, dtype=object)[valid_idxes].tolist()
# train_ys = np.array(train_ys, dtype=object)[valid_idxes].tolist()
# print(len(train_xs))
# print(len(train_ys))

In [36]:
# for t in ['0', '5', '10']:
#     train_dict = {}
#     new_train_xs = train_xs.copy()
#     new_train_ys = train_ys.copy()
#     new_test_xs = test_xs.copy()
#     new_test_ys = test_ys.copy()
#     for i in range(int(t)):
#         for p_x, p_y in zip(extra_primitive_xs, extra_primitive_ys):
#             new_p_x = p_x + '_{}'.format(i)
#             new_train_xs.append(extra_primitive_x.format(new_p_x).split())
#             new_train_ys.append(extra_primitive_y.format(p_y).split())
#     train_dict['xs'] = new_train_xs
#     train_dict['ys'] = new_train_ys
#     data_dict[t] = train_dict

In [37]:
# for k in data_dict.keys():
#     print(k, len(data_dict[k]['xs']))

In [38]:
# # save output as json
# data_path = os.path.join('scan_l1', 'data.json')
# vocab_path = os.path.join('scan_l1', 'vocab.json')

# utils.save_json(data_path, data_dict)
# utils.save_json(vocab_path, vocab_dict)

## Numeber of Primitive # 2

In [86]:
# take a look
idx = random.choice(scan_l2_idxes)
print(scan_raw_xs[idx])
print(scan_raw_ys[idx])

['jump', 'around', 'right', 'thrice', 'after', 'turn', 'around', 'left', 'thrice']
['I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_LEFT', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP', 'I_TURN_RIGHT', 'I_JUMP']


In [80]:
a = 0
for i in range(640):
    a += 1
    if not a % 128:
        print(a)

128
256
384
512
640
