# MathDataset

In [4]:
from pathlib import Path

import numpy as np

from math_generator import MathGenerator
from math_vocab import MathVocab
from math_dataset import MathDataset

# Locations of the artifacts this notebook produces, relative to the working directory.
# Plain-text corpus: the generated sentences joined with newlines.
text_path = "texts/math.txt"
# Vocabulary file written by voc.save() and read back by voc.restore().
vocab_path = "data/math_vocab.data"
# Dataset file written by dataset.save() and read back by dataset.restore().
dataset_path = "data/math_dataset.data"

## Generator

In [10]:
# Generate the synthetic corpus and persist it as newline-separated text.
# NOTE(review): the meaning of the (10, 20) constructor arguments is defined in
# math_generator -- confirm there before tuning them.
# NOTE(review): no RNG seed is set in this notebook, so the corpus presumably differs
# between runs; seed whichever RNG MathGenerator uses if reproducibility matters.
gen = MathGenerator(10, 20)
sents = gen.generate('=', 10000)
text = '\n'.join(sents)

# Preview only the first 200 characters instead of dumping the whole corpus.
print(text[:200] + " ...")

# Make sure the target directory exists so the cell also works on a fresh checkout,
# where open(..., 'w') would otherwise fail with FileNotFoundError.
Path(text_path).parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the file independent of the platform's default locale.
with open(text_path, 'w', encoding='utf-8') as f:
    f.write(text)

13+2+1+2+1 = 1+16+2
16 = 2+3+1+8+1+1
1+3+4+3 = 3+2+6
2+6+2+2 = 1+11
1+3+7+4+2+1 = 16+1+1
1+18 = 16+1+1+1
6+1+7+1+1 = 13+3
1+11+1+3 = 7+2+6+1
1+11 = 3+2+6+1
2+8+2+4+1 = 3+14
2+7+1 = 1+1+7+1
3+3+3+4 = 1 ...


## Vocab

In [11]:
# Build the vocabulary from the generated text corpus and show it.
voc = MathVocab()
voc.build(text_path)
print(voc)

# Validate BEFORE persisting, so a failed check never leaves a bad vocab file on disk.
# Expected entries: the 13 characters '0123456789 =+' plus UNK, EOS, BOS, PAD.
expected_size = len('0123456789 =+') + 4
assert voc.size == expected_size, (
    "unexpected vocab size: got {}, expected {}".format(voc.size, expected_size)
)

voc.save(vocab_path)

MathVocab:
  size: 17
  _tokens_to_words: ['<UNK>', '<BOS>', '<EOS>', '<PAD>', '+', '1', ' ', '2', '=', '3', '4', '5', '6', '7', '8', '9', '0']


## Dataset

In [12]:
# Reload the vocabulary from disk rather than reusing the in-memory object,
# which exercises the save/restore round trip.
voc = MathVocab()
voc.restore(vocab_path)

# Upper bound handed to MathDataset.build as max_len.
# NOTE(review): the printed shape has 34 columns, presumably max_len plus the
# <BOS>/<EOS> markers -- confirm in math_dataset.
max_sentence_len = 32

# Encode the corpus with the vocabulary, persist the result, and show a summary.
dataset = MathDataset()
dataset.build(text_path, voc, max_len=max_sentence_len)
dataset.save(dataset_path)
print(dataset)

MathDataset:
  shape: [10000, 34]
  data_limit: None


## Batch

In [13]:
# Restore both artifacts from disk so this cell does not depend on in-memory state.
voc = MathVocab()
voc.restore(vocab_path)

dataset = MathDataset()
dataset.restore(dataset_path)

# Fetch one batch of 15 encoded sentences and print each one as text again.
batch = dataset.get_next_batch(15)
for sent in batch:
    # Map tokens back to words, glue them together, and show padding as '_'.
    restored = "".join(voc.to_words(sent)).replace('<PAD>', '_')
    print(restored)

<BOS>4+1+7+2 = 5+2+7<EOS>_________________
<BOS>4+2+9+2 = 16+1<EOS>__________________
<BOS>3+8+1 = 2+2+8<EOS>___________________
<BOS>2+3+9+4 = 1+6+2+1+2+5+1<EOS>_________
<BOS>5+3+4 = 10+2<EOS>____________________
<BOS>19 = 6+1+1+11<EOS>___________________
<BOS>7+1+10+1 = 1+4+11+3<EOS>_____________
<BOS>5+3+8 = 1+12+3<EOS>__________________
<BOS>7+3+9 = 3+11+1+1+1+2<EOS>____________
<BOS>9+1+4 = 6+5+3<EOS>___________________
<BOS>1+17 = 3+5+9+1<EOS>__________________
<BOS>8+1+1 = 1+7+1+1<EOS>_________________
<BOS>4+1+9 = 13+1<EOS>____________________
<BOS>1+4+4+1+4+2+1 = 15+2<EOS>____________
<BOS>3+2+1+6 = 5+1+1+1+4<EOS>_____________
