# Training sequence generator

In [1]:
import torch
import numpy as np
import seqgen.seq_gen as g
from seqgen.vocabulary import *
from seqgen.preprocess import *

In [2]:
features, targets = g.generator(num_samples=1)
features, targets

([[['<start>', 0, 0, 0, 0],
   ['op_divide',
    108.7325213434235,
    90.56245913318605,
    171.08478442319398,
    152.23735937885886],
   ['5',
    130.7994065179126,
    104.18008026262295,
    191.4009817033159,
    165.90370178488106],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0],
   ['<end>', 0, 0, 0, 0]]],
 [['<start>',
   '/',
   '5',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>',
   '<end>']])

In [3]:
features, targets = g.generator(num_samples=1)
g.save_as_json({"features": features, "targets": targets})

In [4]:
vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_in.word2idx

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'op_plus': 13,
 'op_minus': 14,
 'op_multiply': 15,
 'op_divide': 16}

In [5]:
vocab_in.idx2word

{0: '<start>',
 1: '<end>',
 2: '<unk>',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '6',
 10: '7',
 11: '8',
 12: '9',
 13: 'op_plus',
 14: 'op_minus',
 15: 'op_multiply',
 16: 'op_divide'}

In [6]:
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")
vocab_out.word2idx

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 '+': 13,
 '-': 14,
 '\\cdot': 15,
 '/': 16,
 '(': 17,
 ')': 18,
 '[': 19,
 ']': 20,
 '{': 21,
 '}': 22}

### Translate sequence of tokens to sequence of integers

In [7]:
encoded_feature_seqs = encode_classes_of_bboxes(features, vocab_in)
features, encoded_feature_seqs

([[['<start>', 0, 0, 0, 0],
   ['0',
    94.61510915836786,
    90.86242543820914,
    153.32300197550038,
    153.95489359729783],
   ['2',
    127.28366113166248,
    90.45878801910774,
    183.23315445494814,
    150.23734428726516],
   ['3',
    173.1910817292214,
    102.86273510781226,
    228.62746728562198,
    164.05234538752117],
   ['6',
    202.20771391660858,
    99.60780669348743,
    266.9852080647058,
    156.34103144512517],
   ['0',
    239.46119404555344,
    100.46679165234391,
    300.36702301877403,
    162.38847556562672],
   ['1',
    260.8723098557072,
    95.76546685520792,
    316.272871979957,
    151.6592648561477],
   ['op_divide',
    285.0744064098115,
    109.72299777443621,
    347.02761420337777,
    165.47518429753967],
   ['8',
    307.87145802780253,
    102.35237508051075,
    369.46668424027405,
    165.4122485127249],
   ['<end>', 0, 0, 0, 0]]],
 [[[0, 0, 0, 0, 0],
   [3,
    94.61510915836786,
    90.86242543820914,
    153.32300197550038,
    

In [8]:
encoded_target_seqs = encode_latex_tokens(targets, vocab_out)
targets, encoded_target_seqs

([['<start>', '0', '2', '3', '6', '0', '1', '/', '8', '<end>']],
 [[0, 3, 5, 6, 9, 3, 4, 16, 11, 1]])

### Normalize coordinates

In [9]:
normalize_coordinates(encoded_feature_seqs)

[[[0, 0.0, 0.0, 0.0, 0.0],
  [3,
   0.25608563151755553,
   0.5491000105177711,
   0.4149846481849236,
   0.9303805537419599],
  [5,
   0.3445064644824282,
   0.54666074797327,
   0.4959395860867031,
   0.9079146515233639],
  [6,
   0.46875967202658686,
   0.6216203084739012,
   0.6188040141041227,
   0.9914014967497476],
  [9,
   0.5472962043449295,
   0.6019501178761849,
   0.7226232281638638,
   0.9448004672652882],
  [3,
   0.6481266221281956,
   0.6071411376807736,
   0.8129745815550647,
   0.9813463949593628],
  [4,
   0.70607803350966,
   0.5787300812610836,
   0.8560254157429694,
   0.9165076050524309],
  [16, 0.7715835244955943, 0.6630782630049478, 0.9392663236117291, 1.0],
  [11, 0.83328611525792, 0.6185360996273116, 1.0, 0.9996196663259089],
  [1, 0.0, 0.0, 0.0, 0.0]]]

### Translate an encoded sequence back to a sequence of words

In [10]:
print("Original Sequence", targets[0])
encoded_sequence = [vocab_out(x) for x in targets[0]]
print("Encoded Sequence", encoded_sequence)
print("Decoded Sequence", vocab_out.decode_sequence(encoded_sequence))

Original Sequence ['<start>', '0', '2', '3', '6', '0', '1', '/', '8', '<end>']
Encoded Sequence [0, 3, 5, 6, 9, 3, 4, 16, 11, 1]
Decoded Sequence ['<start>', '0', '2', '3', '6', '0', '1', '/', '8', '<end>']


In [13]:
g.generate_synthetic_training_data(2, max_length=20)

(tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 6.0000,  0.3859,  0.5675,  0.5938,  0.9300],
          [15.0000,  0.4704,  0.5811,  0.7092,  0.9245],
          [10.0000,  0.6705,  0.5642,  0.9075,  0.9208],
          [15.0000,  0.7806,  0.6240,  1.0000,  1.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
          [ 1.0000,  0.0000,  0