# Dataset for Text Editing Task

This notebook shows the code that generates a dataset for the operator insertion task. The input is a sequence of positive real numbers (e.g., 1 1 2), and the corresponding output should be a correct math equation (e.g., 1 + 1 = 2) obtained by inserting operators at the right positions.

## Notes
Both the input and output should be the same single real number when the input sequence length is 1. For example, "1"$\rightarrow$"1."  
In other cases, operators must be inserted so that the output forms a valid equation. For example, "1 1 2"$\rightarrow$"1 + 1 = 2." (The generated outputs write the equality sign as `==`.)
1. sequence length  
output_seq_len = 2 * input_seq_len - 1
2. vocab size  
This parameter is the number of unique numbers that may appear in the inputs. For example, samples range from 2 to 11 if the input vocab size is 10, because the generator draws from `range(2, vocab_size + 2)`.
3. data size

In [5]:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__ = 'Shining'
__email__ = 'mrshininnnnn@gmail.com'

In [6]:
# dependency
# public
import os
import numpy as np
from tqdm import tqdm
# private
from utils import save_txt

In [16]:
# the class to generate dataset
# for math operator insertion task
class MathematicalOperatorInsertion():
    """Random generator of samples for the math operator insertion task.

    Every sample is a pair of space-separated strings:

    * ``x`` -- the operands followed by the value of the equation,
      e.g. ``'2 8 2 5 11'``
    * ``y`` -- the same numbers with operators and ``==`` inserted,
      e.g. ``'2 + 8 / 2 + 5 == 11'``

    Operands are drawn from ``2 .. vocab_size + 1``; an equation is only
    kept when its value is an integer inside that same range.

    Args:
        operators: sequence of operator strings (e.g. ``['+', '-']``)
            that may be inserted between two operands.
    """
    def __init__(self, operators):
        super(MathematicalOperatorInsertion, self).__init__()
        self.operators = operators
        # populated by generate(); initialised here so the helper
        # methods never hit an AttributeError when called on their own
        self.xs, self.ys = [], []
        self.value_dict = {}

    def gen_base_dataset(self, vocab_size):
        """Return the operator-free dataset: every number maps to itself.

        Returns:
            A tuple ``(x, y)`` of two equal but distinct lists holding
            the strings ``'2'`` .. ``str(vocab_size + 1)``.
        """
        x = [str(i) for i in range(2, vocab_size+2)]
        y = x.copy()
        return x, y

    def gen_base_dict(self, vocab_size):
        """Return ``{value: []}`` for every value in ``2 .. vocab_size + 1``.

        The keys are strings; the lists later collect the operations that
        evaluate to the key.
        """
        return {str(i): [] for i in range(2, vocab_size+2)}

    def gen_operation(self, vocab_size, seq_len):
        """Draw one random operation with ``seq_len`` operands.

        Args:
            vocab_size: operands are drawn uniformly from
                ``2 .. vocab_size + 1``.
            seq_len: number of operands, must be >= 1.

        Returns:
            A token list ``[operand, operator, operand, ...]`` of length
            ``2 * seq_len - 1`` whose items are all ``str``.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError(
                'seq_len must be >= 1, got {}'.format(seq_len))
        # randint's upper bound is exclusive, hence vocab_size+2
        operation = [str(np.random.randint(2, vocab_size+2))]
        for _ in range(seq_len-1):
            o = np.random.choice(self.operators)
            b = np.random.randint(2, vocab_size+2)
            operation += [str(o), str(b)]
        return operation

    def gen_operation_list(self, vocab_size, seq_len, data_size):
        """Fill ``self.value_dict`` with up to ``data_size`` unique operations.

        Operations are rejection-sampled: a candidate is kept only if it
        has not been drawn before, evaluates to an integer and that
        integer is a key of ``self.value_dict``. Sampling stops early
        once every distinct operation has been tried, so fewer than
        ``data_size`` operations may be stored.
        """
        operations_pool = set()
        # number of distinct operations that gen_operation can produce;
        # once the pool reaches it no unseen candidate is left
        max_unique = (
            vocab_size ** seq_len
            * len(set(self.operators)) ** (seq_len-1))
        for _ in tqdm(range(data_size)):
            accepted = False
            while len(operations_pool) < max_unique:
                operation = self.gen_operation(vocab_size, seq_len)
                # to avoid duplicates
                key = ''.join(operation)
                if key in operations_pool:
                    continue
                operations_pool.add(key)
                try:
                    # NOTE: eval runs on text built from self.operators,
                    # so only pass trusted operator strings; builtins are
                    # removed to narrow what the expression can reach
                    value = eval(
                        ' '.join(operation), {'__builtins__': {}}, {})
                except ArithmeticError:
                    # e.g. division by zero with custom operand ranges
                    continue
                # only equations with an integer value are kept
                if value % 1 != 0:
                    continue
                # float to int to string
                value = str(int(value))
                # to keep vocab size
                if value in self.value_dict:
                    self.value_dict[value].append(operation)
                    accepted = True
                    break
            if not accepted:
                # the search space is exhausted, nothing more to find
                break

    def gen_equation_list(self):
        """Turn ``self.value_dict`` into the ``self.xs`` / ``self.ys`` samples.

        For each stored operation the equation ``operation == value`` is
        appended to ``self.ys`` and its purely numeric tokens (operands
        plus value) are appended to ``self.xs``.
        """
        for value, operations in self.value_dict.items():
            for operation in operations:
                equation = operation + ["=="] + [value]
                numbers = [tok for tok in equation if tok.isdigit()]
                self.xs.append(' '.join(numbers))
                self.ys.append(' '.join(equation))

    def generate(self, vocab_size, seq_len, data_size):
        """Generate the dataset.

        Args:
            vocab_size: operands and values lie in ``2 .. vocab_size + 1``.
            seq_len: number of operands per equation; ``0`` returns the
                base dataset where every number maps to itself.
            data_size: requested number of samples (ignored when
                ``seq_len`` is 0). Fewer samples are returned when not
                enough unique valid equations exist.

        Returns:
            A tuple ``(xs, ys)`` of input and output strings.
        """
        if seq_len == 0:
            self.xs, self.ys = self.gen_base_dataset(
                vocab_size=vocab_size)
            return self.xs, self.ys
        # input sequences, # output sequences
        self.xs, self.ys = [], []
        # initialize a value dictionary
        # to save the value of each sequence
        self.value_dict = self.gen_base_dict(
            vocab_size=vocab_size)
        # insert operators and generate equations
        self.gen_operation_list(
            vocab_size=vocab_size,
            seq_len=seq_len,
            data_size=data_size)
        # generate relations given the value dict
        self.gen_equation_list()

        return self.xs, self.ys

In [17]:
# definition
# operands and equation values are drawn from 2 .. vocab_size+1
vocab_size = 10
# number of operands per equation
seq_len = 4 # must >= 0
# requested number of samples
data_size = 60000
operators = ['+', '-', '*', '/']
# fix the global NumPy seed so that the randomly generated samples
# and the random train/val/test split are reproducible
random_seed = 0
np.random.seed(random_seed)

In [18]:
# data generation
# xs holds the number sequences, ys the matching equations
generation_kwargs = dict(
    vocab_size=vocab_size,
    seq_len=seq_len,
    data_size=data_size)
moi = MathematicalOperatorInsertion(operators)
xs, ys = moi.generate(**generation_kwargs)

 83%|████████▎ | 50036/60000 [04:16<09:17, 17.87it/s] 

KeyboardInterrupt: 

In [10]:
# number of generated input sequences
len(xs)

10000

In [15]:
# peek at the last 15 input sequences (operands followed by the value)
xs[-15:]

['7 8 8 4 11',
 '8 3 6 6 11',
 '2 8 10 5 11',
 '2 8 2 5 11',
 '5 3 4 6 11',
 '4 5 4 2 11',
 '6 4 2 3 11',
 '9 4 2 4 11',
 '4 4 3 8 11',
 '4 7 9 9 11',
 '7 10 3 11 11',
 '5 7 6 7 11',
 '2 9 9 9 11',
 '5 5 6 5 11',
 '9 9 5 2 11']

In [8]:
# number of generated output equations
len(ys)

10000

In [14]:
# peek at the last 15 output equations
ys[-15:]

['7 + 8 / 8 * 4 == 11',
 '8 + 3 * 6 / 6 == 11',
 '2 * 8 - 10 + 5 == 11',
 '2 + 8 / 2 + 5 == 11',
 '5 + 3 * 4 - 6 == 11',
 '4 + 5 + 4 - 2 == 11',
 '6 + 4 / 2 + 3 == 11',
 '9 - 4 + 2 + 4 == 11',
 '4 - 4 + 3 + 8 == 11',
 '4 + 7 - 9 + 9 == 11',
 '7 - 10 + 3 + 11 == 11',
 '5 + 7 + 6 - 7 == 11',
 '2 + 9 * 9 / 9 == 11',
 '5 + 5 * 6 / 5 == 11',
 '9 / 9 + 5 * 2 == 11']

In [10]:
# train val test split
# pair every input with its output; column 0 is x, column 1 is y
dataset = np.array(list(zip(xs, ys)))
# from here on data_size is the number of samples actually generated
data_size = len(dataset)
# shuffle once, then cut the shuffled index into 70% / 15% / rest
indices = np.random.permutation(data_size)
train_size = int(0.7*data_size)
val_size = int(0.15*data_size)
test_size = data_size - train_size - val_size
val_start = train_size
test_start = train_size + val_size
train_idxes = indices[:val_start]
val_idxes = indices[val_start:test_start]
test_idxes = indices[test_start:]
trainset, valset, testset = (
    dataset[train_idxes], dataset[val_idxes], dataset[test_idxes])
print('train size', train_size, trainset.shape)
print('val size', val_size, valset.shape)
print('test size', test_size, testset.shape)

train size 7000 (7000, 2)
val size 1500 (1500, 2)
test size 1500 (1500, 2)


In [11]:
# to save dataset
# the folder is labelled with seq_len+1 because every saved input
# sequence holds seq_len operands plus the value of the equation
outdir = 'raw'
outdir = os.path.join(
    outdir, 
    'vocab_size_{}'.format(vocab_size), 
    'seq_len_{}'.format(seq_len+1), 
    'data_size_{}'.format(data_size))
# exist_ok avoids the check-then-create race of os.path.exists
os.makedirs(outdir, exist_ok=True)
outdir

'raw/vocab_size_10/seq_len_5/data_size_10000'

In [12]:
# write every split to disk: column 0 holds the inputs (x),
# column 1 the target equations (y)
split_files = [
    ('train_x.txt', trainset[:, 0]),
    ('train_y.txt', trainset[:, 1]),
    ('val_x.txt', valset[:, 0]),
    ('val_y.txt', valset[:, 1]),
    ('test_x.txt', testset[:, 0]),
    ('test_y.txt', testset[:, 1]),
]
for file_name, column in split_files:
    save_txt(os.path.join(outdir, file_name), column)