In [1]:
import numpy as np
from seqgen.vocabulary import Vocabulary

%load_ext autoreload
%autoreload 2

In [2]:
def read_label_file(filename):
    """Return the complete text of the label file at ``filename``.

    The file is opened in text mode (platform default encoding) and is
    closed again before the function returns.
    """
    with open(filename, mode="r") as label_file:
        return label_file.read()

def parse_label_file(content):
    """Parse the text of a label file into formulas and box arrays.

    Every non-blank line is read as follows: the first character is skipped
    (assumed to be the opening ``$``), the formula runs up to the next ``$``,
    one separator character after that closing ``$`` is skipped, and the rest
    of the line is a whitespace-separated list of numbers whose count must be
    a multiple of five.
    (Presumably each group of five is a class id followed by four box
    coordinates -- TODO confirm against the dataset description.)

    Blank lines, including the empty entry produced by a trailing newline at
    the end of the file, are ignored instead of aborting the parse.

    Args:
        content: Full text of the label file.

    Returns:
        A tuple ``(formulas, boxes)`` where ``formulas`` is a list of formula
        strings and ``boxes`` is a list of float arrays of shape ``(n, 5)``,
        one array per formula, in file order.

    Raises:
        ValueError: If a non-blank line has no closing ``$`` or a value is not
            a valid number.
        AssertionError: If the number of values on a line is not a multiple
            of five.
    """
    formulas = []
    boxes = []
    for line in content.split("\n"):
        # Files normally end with a newline, which leaves an empty last entry.
        if not line.strip():
            continue
        # Search from index 1 so the opening "$" is not matched.
        closing = line.index("$", 1)
        formulas.append(line[1:closing])
        # Skip the closing "$" and the separator that follows it; split() with
        # no argument tolerates repeated or trailing whitespace (and "\r").
        values = line[closing + 2:].split()
        assert len(values) % 5 == 0, "There must be 5n items in the coordinates string"
        boxes.append(np.array(values, dtype=float).reshape(-1, 5))
    return formulas, boxes

def parse_formula(s, keys):
    """Split the formula string ``s`` into a list of tokens taken from ``keys``.

    The string is scanned left to right. Space characters are skipped and
    never emitted. At every other position the first entry of ``keys`` that
    matches is consumed, so the order of ``keys`` decides ties between keys
    that share a prefix (pass the longest keys first for greedy matching).

    Args:
        s: Formula text to tokenize.
        keys: Iterable of candidate token strings, tried in order.

    Returns:
        List of matched substrings of ``s`` in order of appearance.

    Raises:
        ValueError: If no key matches at some position. The message contains
            the unparsed remainder of ``s``.
    """
    tokens = []
    pos = 0
    while pos < len(s):
        # Handle spaces before looking at the keys so that whitespace is
        # skipped even when `keys` is empty.
        if s.startswith(" ", pos):
            pos += 1
            continue
        for k in keys:
            # An empty key would match without consuming any input and make
            # the loop spin forever, so it is never considered a match.
            # startswith(k, pos) avoids copying the tail of `s` on each probe.
            if k and s.startswith(k, pos):
                width = len(k)
                tokens.append(s[pos:pos + width])
                pos += width
                break
        else:
            # for/else: reached only when no key matched at `pos`.
            raise ValueError(f"${s[pos:]}$ could not be parsed")
    return tokens

In [74]:
# load the output vocabulary
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")
# load the raw text of the training label file
content = read_label_file("data/train/label.txt")
# split the label text into formulas and their per-formula box arrays
formulas, boxes = parse_label_file(content)
# vocabulary words sorted by ascending length, then reversed so the longest come first
keys = np.array(sorted(vocab_out.word2idx.keys(), key=len))[::-1]
# length of every key, in the same order as `keys`
lens = np.array([len(key) for key in keys])

In [315]:
from seqgen.datasets.realdata import RealSequencesDataset

# NOTE(review): the same vocabulary object is passed for both input and output --
# confirm this is intended.
dataset = RealSequencesDataset(
    vocab_in=vocab_out,
    vocab_out=vocab_out,
    max_length=50,
    batch_size=10,
)

In [316]:
# fetch item 0 of the dataset and display the first entry of each returned array
input_seqs, coordinates, target_seqs = dataset[0]
input_seqs[0], target_seqs[0], coordinates[0]

(array([38, 26, 12,  8,  9, 38, 26, 38, 26,  4,  8,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]),
 array([89, 20, 39, 58, 15,  4, 41, 18, 89, 20, 58, 89, 20, 39, 58, 16,  4,
        41,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1]),
 array([[0.        , 0.03508772, 0.14715719, 0.87719298],
        [0.11036789, 0.78947368, 0.15719064, 1.        ],
        [0.16053512, 0.80701754, 0.19732441, 0.96491228],
        [0.2006689 , 0.75438596, 0.24080268, 1.        ],
        [0.35785953, 0.50877193, 0.44816054, 0.71929825],
        [0.51505017, 0.        , 0.62541806, 0.78947368],
        [0.6187291 , 0.73684211, 0.66220736, 0.92982456],
        [0.75585284, 0.01754386, 0.83277592, 0.77192982],
        [0.85284281, 0.75438596, 0.90301003, 0.92982456],
        [0.909699  , 0.824

In [294]:
# Columns 0-1 are shifted down/up by half of columns 2-3, column pair by column pair
# (presumably converting (cx, cy, w, h) boxes to (x1, y1, x2, y2) corners -- TODO confirm).
first_boxes = coordinates[0]
half_extent = 0.5 * first_boxes[:, 2:4]
coords = np.zeros_like(first_boxes)
coords[:, 0:2] = first_boxes[:, 0:2] - half_extent
coords[:, 2:4] = first_boxes[:, 0:2] + half_extent

In [295]:
# NOTE(review): wildcard import hides where names come from. The cells below call
# normalize_coordinates and min_max, which presumably come from this module --
# confirm and import them explicitly.
from seqgen.preprocess import *

In [303]:
# wrap `coords` in a batch of one, normalize it, and display the single result
coords_batch = np.array([coords])
np.array(normalize_coordinates(coords_batch, contains_class=False)[0])

array([[0.02566453, 0.24585635, 0.23831347, 0.91436464],
       [0.08340972, 0.31860037, 0.3015582 , 1.        ],
       [0.09990834, 0.32228361, 0.340055  , 1.        ],
       [0.10999083, 0.31491713, 0.37396884, 0.99631676],
       [0.13932172, 0.3038674 , 0.51695692, 0.9558011 ],
       [0.17690192, 0.24677716, 0.65169569, 0.90607735],
       [0.22364803, 0.31675875, 0.71860678, 0.9907919 ],
       [0.25206233, 0.24953959, 0.84051329, 0.90699816],
       [0.28597617, 0.31860037, 0.91292392, 0.99263352],
       [0.30430797, 0.32780847, 0.95692026, 0.99815838],
       [0.31989001, 0.31767956, 1.        , 0.98987109],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        ,

In [299]:
# Spot check of min_max: the displayed result (0.1666...) equals (5 - 4) / (10 - 4),
# which is consistent with scaling 5 into the range [4, 10] -- TODO confirm argument order.
min_max(5, 4, 10)

0.16666666666666666