# Training sequence generator

In [1]:
import numpy as np
import seqgen.seq_gen as g
from seqgen.vocabulary import *
from seqgen.preprocess import *

In [2]:
# Generate a single synthetic sample; it serves as the running example for the
# encoding and decoding cells further down.
# NOTE(review): no random seed is set in the visible cells; if g.generator is
# stochastic, the values in the output will differ on every run — confirm.
synthetic_sequences = g.generator(num_samples=1)
# Display the sample. The output shows a list of dicts whose 'feature_seq'
# holds [token, four floats] entries; a later cell also reads 'target_seq'.
synthetic_sequences

[{'feature_seq': [['op_divide',
    96.97837215652243,
    109.90634295543171,
    154.41770014662683,
    166.46042246351612],
   ['8',
    138.51789921272334,
    93.36090253541308,
    197.20596957459173,
    158.49422647441847],
   ['1',
    161.8800121490045,
    107.13441963599833,
    226.06838534922616,
    161.55876458032105],
   ['op_multiply',
    187.78745313029756,
    106.93364314898628,
    250.04756239727976,
    168.2485235477471],
   ['5',
    210.69314968297007,
    108.83178395351669,
    273.28985396353926,
    174.05374824892812],
   ['4',
    282.6607671502018,
    90.5111945192654,
    343.5736268534532,
    152.55420728367525],
   ['7',
    308.06120212362674,
    94.16761056952438,
    372.12791400530756,
    155.6891635625144],
   ['0',
    360.28075896759697,
    105.24523719281922,
    421.35916872797577,
    166.9485462050548],
   ['op_plus',
    352.26453107297687,
    90.40803839490671,
    411.61395075383206,
    150.96051778806986],
   ['op_plus',
    

In [3]:
# Produce a batch of five samples, then hand it to the generator module's JSON
# writer (the destination is decided by g.save_as_json, not by this cell).
generated_batch = g.generator(num_samples=5)
g.save_as_json(generated_batch)

In [4]:
# Load the input vocabulary from seqgen/vocab_in.txt.
vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
# Show its token -> index mapping (special tokens, digits and op_* classes).
vocab_in.word2idx

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'op_plus': 13,
 'op_minus': 14,
 'op_multiply': 15,
 'op_divide': 16}

In [5]:
# Show the inverse mapping (index -> token) of the input vocabulary.
vocab_in.idx2word

{0: '<start>',
 1: '<end>',
 2: '<unk>',
 3: '0',
 4: '1',
 5: '2',
 6: '3',
 7: '4',
 8: '5',
 9: '6',
 10: '7',
 11: '8',
 12: '9',
 13: 'op_plus',
 14: 'op_minus',
 15: 'op_multiply',
 16: 'op_divide'}

In [6]:
# Load the output vocabulary from seqgen/vocab_out.txt.
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")
# Show its token -> index mapping (special tokens, digits, operators, brackets).
vocab_out.word2idx

{'<start>': 0,
 '<end>': 1,
 '<unk>': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 '+': 13,
 '-': 14,
 '\\cdot': 15,
 '/': 16,
 '(': 17,
 ')': 18,
 '[': 19,
 ']': 20,
 '{': 21,
 '}': 22}

### Translate sequence of tokens to sequence of integers

In [7]:
# Encode the class tokens of the generated sample(s) as integer indices using
# the two vocabularies. In the output, the leading token of each feature entry
# becomes its vocab_in index while the four floats stay unchanged.
# NOTE(review): presumably 'target_seq' is encoded with vocab_out — the
# truncated output does not show it; confirm against seqgen.preprocess.
encoded_sequences = encode_classes_of_sequences(synthetic_sequences, vocab_in, vocab_out)
encoded_sequences

[{'feature_seq': [[16,
    96.97837215652243,
    109.90634295543171,
    154.41770014662683,
    166.46042246351612],
   [11,
    138.51789921272334,
    93.36090253541308,
    197.20596957459173,
    158.49422647441847],
   [4,
    161.8800121490045,
    107.13441963599833,
    226.06838534922616,
    161.55876458032105],
   [15,
    187.78745313029756,
    106.93364314898628,
    250.04756239727976,
    168.2485235477471],
   [8,
    210.69314968297007,
    108.83178395351669,
    273.28985396353926,
    174.05374824892812],
   [7,
    282.6607671502018,
    90.5111945192654,
    343.5736268534532,
    152.55420728367525],
   [10,
    308.06120212362674,
    94.16761056952438,
    372.12791400530756,
    155.6891635625144],
   [3,
    360.28075896759697,
    105.24523719281922,
    421.35916872797577,
    166.9485462050548],
   [13,
    352.26453107297687,
    90.40803839490671,
    411.61395075383206,
    150.96051778806986],
   [13,
    391.83124009138214,
    102.0102001992025,
 

### Normalize coordinates

In [8]:
# Rescale the coordinates of the first sample's feature sequence and display
# the result. Judging by the output, each axis appears to be min-max scaled to
# [0, 1] and the leading class index is left untouched — confirm in the helper.
# NOTE(review): the return value is not stored; whether the call also modifies
# encoded_sequences in place cannot be told from this notebook — confirm.
normalize_coordinates(encoded_sequences[0]["feature_seq"])

[[16, 0.0, 0.23310585318187227, 0.16067381330825983, 0.9092203796385508],
 [11,
  0.11619763755403403,
  0.03530203934738202,
  0.2803645314001701,
  0.8139830267246922],
 [4,
  0.18154798032018205,
  0.1999669949634299,
  0.36110075457809304,
  0.8506201490738339],
 [15,
  0.25401831521407525,
  0.19756667476335693,
  0.4281771976898306,
  0.9305974602724719],
 [8, 0.31809193411563697, 0.22025930069531507, 0.4931923666802597, 1.0],
 [7,
  0.5194054232839359,
  0.0012332506298137573,
  0.6897956731440762,
  0.7429689938339469],
 [10,
  0.5904575210308276,
  0.044946383755710646,
  0.7696699746640712,
  0.7804479785219876],
 [3,
  0.7365301792769966,
  0.17738146790560333,
  0.9073835187989213,
  0.9150559896464109],
 [13, 0.7141065550939535, 0.0, 0.8801234273560754, 0.7239161398574944],
 [13, 0.8247856706146073, 0.13870599968060401, 1.0, 0.888864371944344]]

### Translate an encoded sequence back to a sequence of words

In [9]:
# Round trip of the first sample's target tokens through the output
# vocabulary: tokens -> indices -> tokens. The three printed lines let the
# reader compare the original and the decoded sequence by eye.
target_tokens = synthetic_sequences[0]["target_seq"]
print("Original Sequence", target_tokens)

# Calling the vocabulary object on a token yields its integer index.
encoded_sequence = list(map(vocab_out, target_tokens))
print("Encoded Sequence", encoded_sequence)

decoded_tokens = vocab_out.decode_sequence(encoded_sequence)
print("Decoded Sequence", decoded_tokens)

Original Sequence ['/', '8', '1', '\\cdot', '5', '4', '7', '0', '+', '+']
Encoded Sequence [16, 11, 4, 15, 8, 7, 10, 3, 13, 13]
Decoded Sequence ['/', '8', '1', '\\cdot', '5', '4', '7', '0', '+', '+']
