# Dataset for the Arithmetic Equation Correction (AEC) Task

In [56]:
# dependency
# public
import os
import numpy as np
from tqdm import tqdm
# private
from utils import save_txt

## Notes

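+ Error types  
Deletion: a token is removed from the equation  
Insertion: a random operator or positive number is inserted in front of a token  
Substitution: a token is replaced by a random operator or positive number (implemented as `sub`, but currently left out of `trans_funs`)  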

# Examples
|                       Source Sequence               | Target Sequence |
|:---------------------------------------------------:|:------------------:|
| - 4 + 6 3 4 2 - 4 7 2 | - 4 + 6 + 4 - 4 == 2 | 
| 5 - 2 * 2 7 == 2 | 5 - 2 * 5 + 7 == 2 | 
| - 6 - 6 / 6 + 9 == 2 | - 6 - 6 / 6 + 9 == 2 |

In [82]:
# class for data generation of the Arithmetic Equation Correction (AEC) problem 
class ArithmeticEquationCorrection():
    """Data generator for the Arithmetic Equation Correction (AEC) problem.

    Target sequences are space separated equations such as
    ``- 3 + 8 * 2 - 11 == 2`` whose value is one of ``pos_digits``.
    Source sequences are copies of the targets in which some tokens have
    been deleted or had a random token inserted in front of them.

    Args:
        operators: list of operator strings (e.g. ``['+', '-', '*', '/']``).
        num_size: controls the operand range; positive operands are
            ``2 .. num_size + 1`` and negative operands ``-num_size .. -2``.
    """

    def __init__(self, operators, num_size):
        super().__init__()
        self.operators = operators
        self.pos_digits = np.arange(2, num_size+2).tolist()
        self.neg_digits = np.arange(-num_size, -1).tolist()
        self.digits = self.pos_digits + self.neg_digits
        # available from the start so that gen_operation_list() and
        # gen_equation_list() can be used without calling generate() first
        self.value_dict = self.gen_base_dict()

        # Every error function edits the token list in place and keeps its
        # length unchanged, so the indices picked beforehand stay valid.
        def delete(tk_y, idx):
            # an empty token is dropped when the sequence is joined again
            tk_y[idx] = ''
            return tk_y

        def insert(tk_y, idx):
            # the new token shares the slot with the original one and
            # becomes a token of its own once the sequence is re-joined
            tk_y[idx] = str(np.random.choice(self.operators+self.pos_digits)) + ' ' + tk_y[idx]
            return tk_y

        def sub(tk_y, idx):
            tk_y[idx] = str(np.random.choice(self.operators+self.pos_digits))
            return tk_y

        # NOTE: substitution (`sub`) is implemented but deliberately not
        # enabled; add it to this list to generate substitution errors too.
        self.trans_funs = [delete, insert]

    def gen_base_dict(self):
        """Return ``{value: []}`` with one key per allowed equation value."""
        return {str(i): [] for i in self.pos_digits}

    def gen_operation(self, seq_len):
        """Draw one random expression with ``seq_len`` operands.

        The first operand comes from ``digits`` (it may be negative), every
        following operand from ``pos_digits``.

        Returns:
            list of str: ``[operand, operator, operand, ...]`` with
            ``2 * seq_len - 1`` entries.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError('seq_len must be at least 1, got {}'.format(seq_len))
        operation = [str(np.random.choice(self.digits))]
        for _ in range(seq_len - 1):
            o = np.random.choice(self.operators)
            b = np.random.choice(self.pos_digits)
            operation += [str(o), str(b)]
        return operation

    def gen_operation_list(self, seq_len, data_size):
        """Fill ``value_dict`` with ``data_size`` distinct valid expressions.

        An expression is kept when it has not been drawn before, evaluates
        to a whole number and that number is a key of ``value_dict``.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1, or if ``data_size``
                is larger than the number of distinct expressions that can
                be built (the sampling loop could never terminate).
        """
        if seq_len < 1:
            raise ValueError('seq_len must be at least 1, got {}'.format(seq_len))
        # Upper bound on distinct expressions. This is a necessary condition
        # only: fewer expressions may actually evaluate to an allowed value.
        max_unique = len(set(self.digits)) * (
            len(set(self.operators)) * len(set(self.pos_digits))) ** (seq_len - 1)
        if data_size > max_unique:
            raise ValueError(
                'data_size {} exceeds the {} distinct expressions available'.format(
                    data_size, max_unique))

        # to control the data size
        operations_pool = set()
        for _ in tqdm(range(data_size)):
            while True:
                # to avoid duplicates
                operation = self.gen_operation(seq_len)
                expression = ''.join(operation)
                if expression in operations_pool:
                    continue
                operations_pool.add(expression)
                # SECURITY: eval() runs whatever `operators` contains; only
                # pass trusted operator strings to this class.
                try:
                    value = eval(expression)
                except ArithmeticError:
                    # e.g. zero division: discard the expression, redraw
                    continue
                # keep whole-number results only
                if value % 1 != 0:
                    continue
                # float to int to string
                value = str(int(value))
                # to keep vocab size
                if value in self.value_dict:
                    self.value_dict[value].append(operation)
                    break

    def gen_equation_list(self):
        """Turn the stored expressions into equation strings.

        A leading minus sign is split off the first operand so that it is a
        token of its own. Equations are grouped by value, in the key order
        of ``value_dict``.

        Returns:
            list of str: equations such as ``'- 3 + 5 == 2'``.
        """
        ys = []
        for value, operations in self.value_dict.items():
            for operation in operations:
                tokens = operation[0].replace('-', '- ').split() + operation[1:]
                tokens += ['==', value]
                ys.append(' '.join(tokens))
        return ys

    def transform(self, tk_y, idxes):
        """Apply a randomly chosen error function at every index in ``idxes``.

        ``tk_y`` is modified in place and returned.
        """
        for idx in idxes:
            f = np.random.choice(self.trans_funs)
            tk_y = f(tk_y, idx)
        return tk_y

    def random_transform(self, ys, num_errors):
        """Create a corrupted copy of every equation in ``ys``.

        Each equation receives between 0 and ``num_errors`` errors at
        distinct positions; the final token (the equation value) is never
        modified. The error count is capped at the number of editable
        tokens, because positions are sampled without replacement.

        Returns:
            list of str: corrupted equations, aligned with ``ys``.
        """
        xs = []
        for y in ys:
            tk_y = y.split()
            # the last token (the value) is excluded from corruption
            y_len = len(tk_y) - 1
            max_errors = min(num_errors, max(y_len, 0))
            num_idxes = np.random.choice(range(max_errors+1))
            idxes = sorted(np.random.choice(range(y_len), num_idxes, False))
            tk_x = self.transform(tk_y, idxes)
            # deleted tokens are empty strings; drop them here
            xs.append(' '.join(tk for tk in tk_x if tk))
        return xs

    def generate(self, seq_len, data_size, num_errors):
        """Generate ``data_size`` (source, target) equation pairs.

        ``value_dict`` is reset on every call.

        Args:
            seq_len: number of operands on the left-hand side.
            data_size: number of equations to generate.
            num_errors: maximum number of errors injected per equation.

        Returns:
            tuple: ``(xs, ys)`` - corrupted input sequences and the
            corresponding correct output sequences.
        """
        self.value_dict = self.gen_base_dict()
        self.gen_operation_list(
            seq_len=seq_len,
            data_size=data_size)
        ys = self.gen_equation_list()
        xs = self.random_transform(ys, num_errors)

        return xs, ys

In [117]:
# Settings for the dataset generated below.
operators = ['+', '-', '*', '/']  # operators an equation may contain
num_size = 10                     # operand range, see ArithmeticEquationCorrection
seq_len = 5                       # generate() is called with seq_len-1 operands
data_size = 10000                 # number of equations to generate
error_rate = 1/5
# upper bound on the number of errors injected into a single equation
num_errors = int(error_rate * (2*seq_len - 1))
print(num_errors)

1


In [84]:
# build the generator and draw the (corrupted, correct) equation pairs
aes = ArithmeticEquationCorrection(operators=operators, num_size=num_size)
xs, ys = aes.generate(
    seq_len=seq_len-1,
    data_size=data_size,
    num_errors=num_errors)

100%|██████████| 10000/10000 [00:21<00:00, 464.42it/s]


In [85]:
len(xs)

10000

In [86]:
xs[:15]

['- 6 + 8 / 7 * 7 == 2',
 '10 / 5 - 8 + 8 == 2',
 '- 7 / * 7 - 5 + 8 == 2',
 '7 / 7 + 11 11 == 2',
 '10 + 3 - 6 - 5 == 2',
 '+ 7 - 9 + 7 - 3 == 2',
 '6 * 2 - 2 * 5 == 2',
 '4 - 3 + 5 - 4 == 2',
 '11 + 3 - 3 - 9 == 2',
 '8 - 3 / 3 * 6 == 2',
 '4 / 2 - 10 + 10 == 2',
 '- 3 + 8 * 2 - 11 == 2',
 '5 - 10 / 10 * 3 == 2',
 '7 + 10 9 - 6 == 2',
 '11 * 5 10 / 11 - 3 == 2']

In [87]:
len(ys)

10000

In [88]:
ys[:15]

['- 6 + 8 / 7 * 7 == 2',
 '10 / 5 - 8 + 8 == 2',
 '- 7 / 7 - 5 + 8 == 2',
 '7 / 7 + 11 / 11 == 2',
 '10 + 3 - 6 - 5 == 2',
 '7 - 9 + 7 - 3 == 2',
 '6 * 2 - 2 * 5 == 2',
 '4 - 3 + 5 - 4 == 2',
 '11 + 3 - 3 - 9 == 2',
 '8 - 3 / 3 * 6 == 2',
 '4 / 2 - 10 + 10 == 2',
 '- 3 + 8 * 2 - 11 == 2',
 '5 - 10 / 10 * 3 == 2',
 '7 + 10 - 9 - 6 == 2',
 '11 * 5 / 11 - 3 == 2']

In [89]:
# share of pairs whose source sequence is identical to its target
np.sum(np.array(xs) == np.array(ys)) / data_size

0.5001

In [90]:
# Shuffle the (source, target) pairs and cut them into 70% / 15% / 15%.
dataset = np.array(list(zip(xs, ys)))
data_size = dataset.shape[0]
train_size = int(0.7*data_size)
val_size = int(0.15*data_size)
# the test split takes whatever is left after rounding
test_size = data_size - train_size - val_size
indices = np.random.permutation(data_size)
train_idxes, val_idxes, test_idxes = np.split(
    indices, [train_size, train_size+val_size])
trainset, valset, testset = (
    dataset[train_idxes], dataset[val_idxes], dataset[test_idxes])
print('train size', train_size, trainset.shape)
print('val size', val_size, valset.shape)
print('test size', test_size, testset.shape)

train size 7000 (7000, 2)
val size 1500 (1500, 2)
test size 1500 (1500, 2)


In [91]:
# Output directory: one sub-folder per data configuration.
outdir = 'aec/'
outdir = os.path.join(
    outdir, 
    'num_size_{}'.format(num_size), 
    'seq_len_{}'.format(seq_len), 
    'data_size_{}'.format(data_size))
# exist_ok avoids the check-then-create race of a separate os.path.exists()
os.makedirs(outdir, exist_ok=True)
outdir

'aec/num_size_10/seq_len_5/data_size_10000'

In [92]:
# Write every split to disk: column 0 holds the source sequences (x),
# column 1 the target sequences (y).
for filename, column in (
        ('train_x.txt', trainset[:, 0]),
        ('train_y.txt', trainset[:, 1]),
        ('val_x.txt', valset[:, 0]),
        ('val_y.txt', valset[:, 1]),
        ('test_x.txt', testset[:, 0]),
        ('test_y.txt', testset[:, 1])):
    save_txt(os.path.join(outdir, filename), column)

In [68]:
# Share of unchanged (source == target) pairs in every split.
# The loop variable is not called `dataset`, so the full dataset array
# built in the split cell is no longer overwritten by this loop.
for split_set in [trainset, valset, testset]:
    print((split_set[:, 0] == split_set[:, 1]).sum()/split_set.shape[0])

0.5162857142857142
0.518
0.524


In [70]:
1/2

0.5