In [None]:
import numpy as np
import random
from seqgen.vocabulary import Vocabulary

%load_ext autoreload
%autoreload 2

In [None]:
def read_label_file(filename):
    """Return the complete text of the label file at ``filename``.

    The file is opened in text mode without an explicit encoding (so the
    platform default applies) and is closed before the function returns.
    """
    with open(filename, "r") as label_file:
        return label_file.read()

def parse_label_file(content):
    """Split the text of a label file into formulas and per-formula box arrays.

    Every non-blank line is expected to look like::

        $<formula>$ v v v v v [v v v v v ...]

    i.e. a formula wrapped in ``$`` followed by whitespace-separated numbers
    in groups of five (presumably a class id plus four box coordinates --
    TODO confirm against the dataset description).

    Blank lines, including the empty one produced by a trailing newline, are
    skipped, and surrounding or repeated whitespace is tolerated.

    Parameters
    ----------
    content : str
        Full text of the label file.

    Returns
    -------
    formulas : list of str
        The text between the two ``$`` delimiters of every parsed line.
    boxes : list of numpy.ndarray
        One float array of shape ``(n, 5)`` per parsed line, in the same
        order as ``formulas``. A line without any numbers yields ``n == 0``.

    Raises
    ------
    ValueError
        If a line has no closing ``$`` or a value cannot be converted to float.
    AssertionError
        If the number of values on a line is not a multiple of five.
    """
    formulas = []
    boxes = []
    for line_number, raw_line in enumerate(content.split("\n"), start=1):
        line = raw_line.strip()
        # A file ending in "\n" produces one empty trailing entry; it is not a record.
        if not line:
            continue
        # Index 0 holds the opening "$", so the closing one is searched from index 1.
        closing = line.find("$", 1)
        if closing == -1:
            raise ValueError(f"line {line_number}: no closing '$' found in {line!r}")
        formulas.append(line[1:closing])
        # split() without arguments ignores repeated and trailing whitespace.
        coords = line[closing + 1:].split()
        assert len(coords) % 5 == 0, "There must be 5n items in the coordinates string"
        boxes.append(np.array(coords, dtype=float).reshape(-1, 5))
    return formulas, boxes

def parse_formula(s, keys):
    """Tokenize the formula string ``s`` with the vocabulary entries in ``keys``.

    At every position the first entry of ``keys`` that matches is taken, so
    the order of ``keys`` decides between overlapping candidates: pass longer
    entries first to get longest-match behaviour. Space characters separate
    tokens and are never emitted. Empty entries in ``keys`` are ignored.

    Parameters
    ----------
    s : str
        Formula text to tokenize.
    keys : iterable of str
        Candidate tokens in priority order. It is iterated once per token,
        so it must be re-iterable (list, tuple, array), not a generator.

    Returns
    -------
    list of str
        The matched tokens in order of appearance.

    Raises
    ------
    ValueError
        If no key matches at some non-space position; the message contains
        the unparsed remainder of ``s``.
    """
    tokens = []
    pos = 0
    while pos < len(s):
        # Skipping spaces does not depend on the keys, so do it before the key loop.
        if s[pos] == " ":
            pos += 1
            continue
        for k in keys:
            # An empty key would match without consuming input and loop forever.
            if k and s.startswith(k, pos):
                tokens.append(s[pos:pos + len(k)])
                pos += len(k)
                break
        else:
            # No key matched at this position.
            raise ValueError(f"${s[pos:]}$ could not be parsed")
    return tokens

In [None]:
# Build the input and output vocabularies from their vocabulary files.
vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

# Read the raw training labels and split them into formulas and box arrays.
content = read_label_file("data/train/label.txt")
formulas, boxes = parse_label_file(content)

# Output-vocabulary words ordered longest first: sorted ascending by length,
# then reversed (presumably so parse_formula can try longer tokens before
# their shorter prefixes -- the call itself is not part of this cell).
keys = np.array(sorted(vocab_out.word2idx.keys(), key=len))[::-1]

# Length of every key, aligned with `keys`.
lens = np.array([len(k) for k in keys])

In [None]:
from seqgen.datasets.realdata import RealSequencesDataset

# NOTE(review): `vocab_in` receives the *output* vocabulary here, while a later
# cell decodes `input_seqs` with the separately loaded `vocab_in` object --
# confirm which vocabulary the dataset is meant to use for its inputs.
dataset = RealSequencesDataset(
    filename="data/train/label.txt",
    vocab_in=vocab_out,
    vocab_out=vocab_out,
    max_length=50,
    batch_size=10,
)

In [None]:
# Fetch item 0 of the dataset (presumably the first batch) and display the
# shapes of its three arrays.
input_seqs, coordinates, target_seqs = dataset[0]
input_seqs.shape, target_seqs.shape, coordinates.shape

In [None]:
# First input sequence: raw ids, the same ids decoded with vocab_in, and its coordinates.
(
    input_seqs[0],
    np.array(vocab_in.decode_sequence(input_seqs[0])),
    coordinates[0],
)

In [None]:
# First target sequence: raw ids and the same ids decoded with vocab_out.
(
    target_seqs[0],
    np.array(vocab_out.decode_sequence(target_seqs[0])),
)