# Dataset Visualization

In [None]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pickle
import random
import string
from seqgen.vocabulary import *
from seqgen.datasets.sequences import *
from seqgen.preprocess import *
from seqgen.dataset import *
from seqgen.visualize import *
from seqgen.seq_gen import add_noise_to_coordinates
from seqgen.symbol_replacement import replace_symbols

%load_ext autoreload
%autoreload 2

In [None]:
# Batch and rendering configuration shared by the cells below.
batch_size = 4
max_length = 25
img_width = 1120
img_height = 224

# Input and output vocabularies, loaded from their text files.
vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

# Directory passed to get_class_samples below.
basedir = "dataset-generation/archive/latex_images"
dataset = SyntheticSequenceDataset(
    vocab_in,
    vocab_out,
    max_length,
    batch_size,
    continue_prob=0.9999,
    device="cpu",
)
class_samples = get_class_samples(basedir)

# Fetch the first synthetic batch and scale its coordinates in place:
# columns 0 and 2 by the image width, columns 1 and 3 by the image height
# (presumably the coordinates are normalized x/y values -- TODO confirm).
input_seqs, coordinates, target_seqs = dataset[0]
for _columns, _scale in (([0, 2], img_width), ([1, 3], img_height)):
    coordinates[:, :, _columns] *= _scale

In [None]:
# Two stacked axes per batch item: the even-indexed axis shows the output of
# create_img_array_from_coordinates, the odd-indexed axis the output of
# create_cv2img_array_from_coordinates for the same sample.
fig, ax = plt.subplots(batch_size*2, figsize=(50, 80))

for i, (coords_ax, symbols_ax) in enumerate(zip(ax[0::2], ax[1::2])):
    coords_img = create_img_array_from_coordinates(
        coordinates[i], img_height, img_width, max_length
    )
    coords_ax.imshow(coords_img)

    symbols_img = create_cv2img_array_from_coordinates(
        input_seqs[i],
        coordinates[i],
        img_height,
        img_width,
        class_samples,
        vocab_in,
        max_length,
        basedir,
    )
    symbols_ax.imshow(symbols_img)

## Image Patching

In [None]:
# Build the images for the whole batch from the input sequences and their
# (already scaled) coordinates.
images = create_images_from_input_seqs(
    input_seqs,
    coordinates,
    batch_size,
    img_height,
    img_width,
    class_samples,
    vocab_in,
    max_length,
    basedir,
)

In [None]:
# A notebook only echoes the last expression of a cell, so the shape has to be
# printed explicitly; as a bare expression it was evaluated and discarded.
print(images[0].shape)
# Display the first element of the first entry in `images`.
plt.imshow(images[0][0])

In [None]:
# Convert the generated images to a tensor and split them into patches with
# patch_size=224, asking the helper not to flatten the patches.
image_tensor = torch.tensor(images)
image_patches = create_image_patches(
    image_tensor,
    patch_size=224,
    flatten_patches=False,
)
# Last expression of the cell: the notebook displays the resulting shape.
image_patches.shape

In [None]:
# Grid layout (rows, columns) used to display the patches in image_patches[0].
n_patches_dim0, n_patches_dim1 = 1, 5
# squeeze=False makes plt.subplots always return a 2-D array of axes, so the
# same ax[i, j] indexing works for every grid size -- a single row, a single
# column, or a 1x1 grid -- without special-casing the number of rows.
fig, ax = plt.subplots(n_patches_dim0, n_patches_dim1, figsize=(32, 8), squeeze=False)

for i in range(n_patches_dim0):
    for j in range(n_patches_dim1):
        # Row-major position of this grid cell within image_patches[0].
        idx = i*n_patches_dim1+j
        ax[i, j].imshow(image_patches[0][idx])

## Display Real Sequences

In [None]:
from seqgen.datasets.realdata import RealSequencesDataset

# Load the real validation sequences. This rebinds `dataset`, which previously
# held the synthetic dataset.
# NOTE(review): max_length=50 and batch_size=10 differ from the notebook-level
# max_length / batch_size set earlier; cells that loop over range(batch_size)
# will therefore only visit part of each batch -- confirm this is intended.
dataset = RealSequencesDataset(
    filename="data/val/label.txt",
    vocab_in=vocab_in,
    vocab_out=vocab_out,
    max_length=50,
    batch_size=10,
)

# Fetch the first batch with normal indexing (same form as used for the
# synthetic dataset) rather than calling the dunder method directly.
input_seqs, coordinates, target_seqs = dataset[0]

In [None]:
# Two stacked axes per displayed sample: the even-indexed axis shows the
# coordinate rendering, the odd-indexed axis the symbol rendering.
fig, ax = plt.subplots(2*batch_size, figsize=(50, 80))

for i, (coords_ax, symbols_ax) in enumerate(zip(ax[0::2], ax[1::2])):
    # normalize_coordinates is given a batch of one (hence the extra list
    # level); squeeze() then removes the size-1 axes again.
    single_batch = np.array([np.array(coordinates[i])])
    coords = np.array(normalize_coordinates(single_batch, contains_class=False)).squeeze()

    # Scale in place: columns 0 and 2 by the width, columns 1 and 3 by the height.
    coords[:, [0, 2]] *= img_width
    coords[:, [1, 3]] *= img_height
    coords = torch.tensor(coords)

    # NOTE(review): both helpers are called with fewer arguments here than in
    # the synthetic-data cell (no max_length / class_samples / basedir), and
    # the first result is converted with .numpy() -- confirm against the
    # current helper signatures.
    coords_ax.imshow(create_img_array_from_coordinates(coords, img_height, img_width).numpy())
    symbols_ax.imshow(
        create_cv2img_array_from_coordinates(input_seqs[i], coords, img_height, img_width, vocab_in)
    )

In [None]:
# Same two-axes-per-sample layout, but the coordinates are first passed
# through add_noise_to_coordinates.
fig, ax = plt.subplots(2*batch_size, figsize=(50, 80))

for i, (coords_ax, symbols_ax) in enumerate(zip(ax[0::2], ax[1::2])):
    coords = add_noise_to_coordinates(coordinates[i])
    # NOTE(review): unlike the noise-free cell, the coordinates are not wrapped
    # in an extra list before normalization -- confirm which shape
    # normalize_coordinates expects.
    normalized = normalize_coordinates(np.array(coords), contains_class=False)
    coords = np.array(normalized).squeeze()

    # Scale in place: columns 0 and 2 by the width, columns 1 and 3 by the height.
    coords[:, [0, 2]] *= img_width
    coords[:, [1, 3]] *= img_height
    coords = torch.tensor(coords)

    # NOTE(review): both helpers receive fewer arguments than in the
    # synthetic-data cell -- confirm against the current helper signatures.
    coords_ax.imshow(create_img_array_from_coordinates(coords, img_height, img_width).numpy())
    symbols_ax.imshow(
        create_cv2img_array_from_coordinates(input_seqs[i], coords, img_height, img_width, vocab_in)
    )

In [None]:
# Same layout as the noisy-coordinates cell, but the symbol rendering uses the
# input sequence returned by replace_symbols instead of the original one.
fig, ax = plt.subplots(2*batch_size, figsize=(50, 80))

for i, (coords_ax, symbols_ax) in enumerate(zip(ax[0::2], ax[1::2])):
    coords = add_noise_to_coordinates(coordinates[i])
    normalized = normalize_coordinates(np.array(coords), contains_class=False)
    coords = np.array(normalized).squeeze()

    # Scale in place: columns 0 and 2 by the width, columns 1 and 3 by the height.
    coords[:, [0, 2]] *= img_width
    coords[:, [1, 3]] *= img_height
    coords = torch.tensor(coords)

    # Only in_seq is used below; out_seq is kept for inspection.
    in_seq, out_seq = replace_symbols(input_seqs[i], target_seqs[i], vocab_in, vocab_out)

    # NOTE(review): both helpers receive fewer arguments than in the
    # synthetic-data cell -- confirm against the current helper signatures.
    coords_ax.imshow(create_img_array_from_coordinates(coords, img_height, img_width).numpy())
    symbols_ax.imshow(
        create_cv2img_array_from_coordinates(in_seq, coords, img_height, img_width, vocab_in)
    )

In [None]:
# Last (and only) expression of the cell, so the notebook displays the shape of
# the most recently loaded `input_seqs`.
input_seqs.shape