# Data Generation for Arithmetic Operators Restoration (AOR)

In [1]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [2]:
# dependency
# public
import os
import numpy as np
from tqdm import tqdm
# private
from utils import save_txt

In [3]:
# the class to generate dataset for
# the Arithmetic Operators Restoration (AOR) problem
class ArithmeticOperatorRestoration():
    """Random dataset generator for Arithmetic Operators Restoration (AOR).

    Every example is an equation. Its left side chains ``seq_len`` integers
    with operators drawn from ``operators``; its right side is the integer
    value of that left side. The target sequence ``y`` is the whole
    equation (``- 10 / 2 + 4 * 4 == 11``) and the input sequence ``x`` is
    the same equation reduced to its digit-only tokens (``10 2 4 4 11``).

    Args:
        operators: sequence of operator strings. The left side is evaluated
            with ``eval``, so each operator must be valid between two
            integer literals in a Python expression.
        num_size: sizes the operand pools. Positive operands are
            ``2 .. num_size+1`` and negative operands are ``-num_size .. -2``.
    """
    def __init__(self, operators, num_size):
        super().__init__()
        self.operators = operators
        self.pos_digits_pool = np.arange(2, num_size+2).tolist()
        self.neg_digits_pool = np.arange(-num_size, -1).tolist()
        # only the first operand may be negative (see gen_operation)
        self.digits_pool = self.pos_digits_pool + self.neg_digits_pool

    def gen_base_dict(self):
        """Return ``{value: []}`` for every admissible right-hand side.

        The keys are the positive operands as strings, so the result of an
        equation always belongs to the same vocabulary as its operands.
        """
        return {str(i): [] for i in self.pos_digits_pool}

    def gen_operation(self, seq_len):
        """Sample the left side of an equation as a flat token list.

        Args:
            seq_len: number of operands; must be >= 1.

        Returns:
            ``[a, op, b, op, b, ...]`` as strings, holding ``seq_len``
            operands and ``seq_len - 1`` operators. ``a`` is drawn from the
            full pool (may be negative), every ``b`` from the positive pool.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError(
                'seq_len must be >= 1, got {}'.format(seq_len))
        # draw order (a, then op/b pairs left to right) matches the
        # original recursive formulation
        tokens = [str(np.random.choice(self.digits_pool))]
        for _ in range(seq_len - 1):
            operator = np.random.choice(self.operators)
            b = np.random.choice(self.pos_digits_pool)
            tokens += [str(operator), str(b)]
        return tokens

    def _count_operations(self, seq_len):
        # upper bound on how many distinct left sides of this length exist
        first = len(set(self.digits_pool))
        step = len(set(self.operators)) * len(set(self.pos_digits_pool))
        return first * step ** max(seq_len - 1, 0)

    def _draw_valid_operation(self, seq_len, operations_pool, capacity):
        # rejection-sample one unseen left side whose value is an integer
        # present in self.value_dict; returns (value, operation)
        while True:
            # every candidate has been tried: sampling again cannot succeed
            if len(operations_pool) >= capacity:
                raise ValueError(
                    'no unused valid operation left for seq_len={}; '
                    'reduce data_size or enlarge the pools'.format(seq_len))
            operation = self.gen_operation(seq_len)
            expression = ''.join(operation)
            # to avoid duplicates
            if expression in operations_pool:
                continue
            operations_pool.add(expression)
            try:
                # NOTE: eval only ever sees text assembled from this
                # object's own pools; never route external input here
                value = eval(expression)
            except ArithmeticError:
                # e.g. division by zero: discard the candidate
                continue
            # keep integer results only
            if value % 1 != 0.:
                continue
            # float to int to string
            value = str(int(value))
            # to keep vocab size
            if value in self.value_dict:
                return value, operation

    def gen_operation_list(self, seq_len, data_size):
        """Fill ``self.value_dict`` with ``data_size`` distinct left sides.

        Candidates are sampled until one is new, evaluates to an integer and
        that integer is a key of ``self.value_dict``; it is then appended to
        the list stored under that key. ``self.value_dict`` must already
        exist (``generate`` creates it).

        Args:
            seq_len: number of operands per left side; must be >= 1.
            data_size: number of left sides to collect.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1, or if every
                possible left side has been tried before ``data_size``
                valid ones were found.
        """
        operations_pool = set()
        capacity = self._count_operations(seq_len)
        # to control the data size
        for _ in tqdm(range(data_size)):
            value, operation = self._draw_valid_operation(
                seq_len, operations_pool, capacity)
            self.value_dict[value].append(operation)

    def gen_equation_list(self):
        """Turn ``self.value_dict`` into ``self.xs`` / ``self.ys`` strings.

        For each stored left side the full equation ``<left> == <value>``
        is appended to ``self.ys`` and its digit-only tokens to ``self.xs``;
        all tokens are separated by single spaces.
        """
        for value, operations in self.value_dict.items():
            for operation in operations:
                # detach a leading minus so the sign becomes its own token
                left_side = (operation[0].replace('-', '- ').split()
                             + operation[1:])
                equation = left_side + ["=="] + [value]
                # operators, the sign and "==" are dropped from the input
                numbers = [tok for tok in equation if tok.isdigit()]
                self.xs.append(' '.join(numbers))
                self.ys.append(' '.join(equation))

    def generate(self, seq_len, data_size):
        """Generate ``data_size`` examples with ``seq_len`` operands each.

        Args:
            seq_len: number of operands on the left side; must be >= 1.
            data_size: number of equations to generate.

        Returns:
            ``(xs, ys)``: two lists of space-separated strings, the
            digit-only inputs and the full target equations, grouped by
            result value.

        Raises:
            ValueError: see ``gen_operation_list``.
        """
        # input sequences, output sequences
        self.xs, self.ys = [], []
        # initialize a value dictionary
        # to save the value of each left side
        self.value_dict = self.gen_base_dict()
        # generate the left side of an equation
        self.gen_operation_list(
            seq_len=seq_len,
            data_size=data_size)
        # generate relations given the value dict
        self.gen_equation_list()

        return self.xs, self.ys

In [4]:
# experiment configuration
# operators that may appear between two operands
operators = ['+', '-', '*', '/']
# number of equations to generate
data_size = 10000
# sizes the operand pools of ArithmeticOperatorRestoration
num_size = 10
# generate() is called with seq_len-1, and its base case is seq_len == 1
seq_len = 5

In [5]:
# build the generator, then sample the input / target sequences;
# the left side gets seq_len-1 operands
moi = ArithmeticOperatorRestoration(operators=operators, num_size=num_size)
xs, ys = moi.generate(seq_len - 1, data_size)

100%|██████████| 10000/10000 [00:08<00:00, 1153.77it/s]


In [6]:
# number of generated input sequences
len(xs)

10000

In [7]:
# pick 10 random row indices (np.random.choice samples with replacement
# by default) and show the corresponding input sequences
idxes = np.random.choice(range(data_size), size=10)
for idx in idxes:
    print(xs[idx])

7 8 4 2 5
10 2 4 4 11
3 2 6 4 4
5 8 8 7 2
2 6 5 5 11
8 8 8 11 2
6 2 4 2 10
4 2 6 3 11
9 6 2 7 6
6 8 8 3 9


In [8]:
# number of generated target equations
len(ys)

10000

In [9]:
# show the target equations of the same sampled rows
for idx in idxes:
    print(ys[idx])

7 - 8 + 4 + 2 == 5
- 10 / 2 + 4 * 4 == 11
3 * 2 / 6 * 4 == 4
- 5 * 8 / 8 + 7 == 2
2 * 6 - 5 / 5 == 11
- 8 - 8 / 8 + 11 == 2
6 + 2 * 4 / 2 == 10
4 - 2 + 6 + 3 == 11
- 9 + 6 + 2 + 7 == 6
6 / 8 * 8 + 3 == 9


In [125]:
# report every ordered pair (i, j), i != j, whose input sequences are equal
# (different equations can reduce to the same digits)
# group the row indices by input once, avoiding data_size**2 comparisons
rows_by_input = {}
for i in range(data_size):
    rows_by_input.setdefault(xs[i], []).append(i)
# indices inside a group are ascending, so pairs print in the same
# (i, j) order as a nested scan over range(data_size)
for i in range(data_size):
    for j in rows_by_input[xs[i]]:
        if i != j:
            print(i, j)

440 590
590 440
1463 1836
1663 1849
1836 1463
1849 1663
3822 3948
3948 3822
4350 4863
4361 4403
4403 4361
4863 4350
5263 5278
5278 5263
7100 7163
7163 7100
7349 7440
7440 7349


In [142]:
# input of row 3822, one of the rows reported as sharing its input
xs[3822]

'12 12 6 5 6 7'

In [143]:
# target equation of row 3822
ys[3822]

'12 / 12 * 6 - 5 + 6 == 7'

In [144]:
# target equation of row 3948, reported as having the same input as 3822
ys[3948]

'12 + 12 - 6 - 5 - 6 == 7'

In [10]:
# train val test split
dataset = np.array([(x, y) for x, y in zip(xs, ys)])
data_size = dataset.shape[0]
indices = np.random.permutation(data_size)
train_size = int(0.7*data_size)
val_size = int(0.15*data_size)
test_size = data_size - train_size - val_size
train_idxes = indices[:train_size]
val_idxes = indices[train_size: train_size+val_size]
test_idxes = indices[train_size+val_size:]
trainset = dataset[train_idxes]
valset = dataset[val_idxes]
testset = dataset[test_idxes]
print('train size', train_size, trainset.shape)
print('val size', val_size, valset.shape)
print('test size', test_size, testset.shape)

train size 7000 (7000, 2)
val size 1500 (1500, 2)
test size 1500 (1500, 2)


In [11]:
# to save dataset
# output folder encodes the generation settings
# NOTE(review): the folder is labelled seq_len+1 although generate() was
# called with seq_len-1 -- confirm which label is intended
outdir = 'aor/'
outdir = os.path.join(
    outdir, 
    'num_size_{}'.format(num_size), 
    'seq_len_{}'.format(seq_len+1), 
    'data_size_{}'.format(data_size))
# exist_ok creates the folder only when missing, without a separate
# exists() check that could race with another process
os.makedirs(outdir, exist_ok=True)
outdir

'aoi/num_size_10/seq_len_5/data_size_10000'

In [12]:
# write each split as two text files: column 0 (inputs) goes to *_x.txt
# and column 1 (targets) to *_y.txt
split_files = [
    ('train_x.txt', trainset[:, 0]),
    ('train_y.txt', trainset[:, 1]),
    ('val_x.txt', valset[:, 0]),
    ('val_y.txt', valset[:, 1]),
    ('test_x.txt', testset[:, 0]),
    ('test_y.txt', testset[:, 1]),
]
for file_name, column in split_files:
    save_txt(os.path.join(outdir, file_name), column)

# Archive Code

In [13]:
# # the class to generate dataset for
# # the Arithmetic Operators Insertion (AOI) problem
# class ArithmeticOperatorInsertion(): 
#     """docstring for ArithmeticOperatorInsertion"""
#     def __init__(self, operators, num_size):
#         super(MathematicalOperatorInsertion, self).__init__()
#         self.operators = operators
#         self.pos_digits_pool = np.arange(2, num_size+2).tolist()
#         self.neg_digits_pool = np.arange(-num_size, -1).tolist()
#         self.digits_pool = self.pos_digits_pool + self.neg_digits_pool
    
# #     def gen_base_dataset(self, vocab_size):
# #     def gen_base_dataset(self):
#         # return a base dataset
# #         x = [str(i) for i in range(2, vocab_size+2)]
# #         x = [str(i) for i in range(-vocab_size//2, vocab_size//2)]
# #         x = [str(i) for i in self.digits_pool]
# #         y = x.copy()
# #         return x, y
    
# #     def gen_base_dict(self, vocab_size):
#     def gen_base_dict(self):
#         # initialize a base value dict
# #         return {str(i):[] for i in range(vocab_size)}
# #         return {str(i):[] for i in range(2, vocab_size+2)}
# #         return {str(i):[] for i in range(-vocab_size//2, vocab_size//2)}
#         return {str(i):[] for i in self.pos_digits_pool}
        
#     def gen_operation(self, vocab_size, seq_len):
#         # a recursion to geneate  the left side of an equation
#         if seq_len == 1:
#             # a = np.random.choice(range(2, vocab_size+2))
# #             a = np.random.choice(range(-vocab_size//2, vocab_size//2))
#             a = np.random.choice(self.digits_pool)
#             return [str(a)]
#         else:
#             left_side = self.gen_operation(vocab_size, seq_len-1)
#             operator = np.random.choice(self.operators)
# #             b = np.random.choice(range(vocab_size))
# #             b = np.random.choice(range(-vocab_size//2, vocab_size//2))
#             b = np.random.choice(self.pos_digits_pool)
#             return left + [operator, str(b)]
    
# #     def gen_operation_list(self, vocab_size, seq_len, data_size):
#     def gen_operation_list(self, seq_len, data_size):
#         # to control the data size
#         operations_pool = set()
#         for i in tqdm(range(data_size)):
#             while True: 
#                 # to avoid duplicates
#                 operation = self.gen_operation(vocab_size, seq_len) 
#                 if ''.join(operation) in operations_pool: 
#                     continue
#                 else:
#                     operations_pool.add(''.join(operation)) 
#                 # to avoid zero division error
#                 try: 
#                     # flost to int to string
#                     value = eval(''.join(operation))
#                     if value % 1 != 0.: 
#                         continue
#                     else:
#                         value = str(int(value))
#                         # to keep vocab size
#                         if value in self.value_dict: 
#                             self.value_dict[value].append(operation)
#                             break
#                 except: 
#                     pass
    
#     def gen_equation_list(self):
#         # generate the relational equation
#         # given the value dict
#         for v in self.value_dict:
#             for x in self.value_dict[v]:
#                 x = x[0].replace('-', '- ').split() + x[1:]
#                 y = x + ["=="] + [v]
#                 x = [i for i in y if i.isdigit()]
#                 self.xs.append(' '.join(x))
#                 self.ys.append(' '.join(y))

# #     def generate(self, vocab_size, seq_len, data_size):
#     def generate(self, seq_len, data_size):
#         if seq_len == 0:
#             return self.gen_base_dataset()
# #             return self.gen_base_dataset(
# #                 vocab_size=vocab_size)
#         # input sequences, # output sequences
#         self.xs, self.ys = [], []
#         # initialize a value dictionary
#         # to save the value of each sequence
#         self.value_dict = self.gen_base_dict()
# #         self.value_dict = self.gen_base_dict(
# #             vocab_size=vocab_size)
#         # insert operators and generate equations
#         self.gen_operation_list(
# #             vocab_size=vocab_size, 
#             seq_len=seq_len, 
#             data_size=data_size)
#         # generate relations given the value dict
#         self.gen_equation_list()
        
#         return self.xs, self.ys

In [14]:
# # the class to generate dataset for
# # the Arithmetic Operators Insertion (AOI) problem
# class ArithmeticOperatorInsertion(): 
#     """docstring for ArithmeticOperatorInsertion"""
#     def __init__(self, operators, num_size):
#         super(ArithmeticOperatorInsertion, self).__init__()
#         self.operators = operators
#         self.pos_digits_pool = np.arange(2, num_size+2).tolist()
#         self.neg_digits_pool = np.arange(-num_size, -1).tolist()
#         self.digits_pool = self.pos_digits_pool + self.neg_digits_pool
    
#     def gen_base_dict(self):
#         # initialize a base value dict
#         return {str(i):[] for i in self.pos_digits_pool}
        
#     def gen_operation(self, seq_len):
#         # a recursion to geneate  the left side of an equation
#         if seq_len == 1:
#             a = np.random.choice(self.digits_pool)
#             return [str(a)]
#         else:
#             left_side  = self.gen_operation(seq_len-1)
#             operator = np.random.choice(self.operators)
#             b = np.random.choice(self.pos_digits_pool)
#             return left_side + [operator, str(b)]
    
#     def gen_operation_list(self, seq_len, data_size):
#         # to control the data size
#         operations_pool = set()
#         for i in tqdm(range(data_size)):
#             while True: 
#                 # to avoid duplicates
#                 operation = self.gen_operation(seq_len) 
#                 if ''.join(operation) in operations_pool: 
#                     continue
#                 else:
#                     operations_pool.add(''.join(operation)) 
#                 # to avoid zero division error
#                 try: 
#                     # flost to int to string
#                     value = eval(''.join(operation))
#                     if value % 1 != 0.: 
#                         continue
#                     else:
#                         value = str(int(value))
#                         # to keep vocab size
#                         if value in self.value_dict: 
#                             self.value_dict[value].append(operation)
#                             break
#                 except: 
#                     pass
    
#     def gen_equation_list(self):
#         # generate the relational equation
#         # given the value dict
#         for v in self.value_dict:
#             for x in self.value_dict[v]:
#                 x = x[0].replace('-', '- ').split() + x[1:]
#                 y = x + ["=="] + [v]
#                 x = [i for i in y if i.isdigit()]
#                 self.xs.append(' '.join(x))
#                 self.ys.append(' '.join(y))

#     def generate(self, seq_len, data_size):
#         # input sequences, output sequences
#         self.xs, self.ys = [], []
#         # initialize a value dictionary
#         # to save the value of each left side
#         self.value_dict = self.gen_base_dict()
#         # generate the left side of an equation
#         self.gen_operation_list(
#             seq_len=seq_len, 
#             data_size=data_size)
#         # generate relations given the value dict
#         self.gen_equation_list()
        
#         return self.xs, self.ys