In [None]:
import json
import random
import zipfile


from pathlib import Path
from pylab import cm
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from google.cloud import storage
import pandas as pd


from layout_gnn.dataset.dataset import RICOSemanticAnnotationsDataset
from layout_gnn.dataset.transformations import *
from layout_gnn.utils import *
from layout_gnn.similarity_metrics import compute_edit_distance, compute_iou
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed

# Paths are resolved against the kernel's current working directory, so the
# data folder is a sibling of the directory the notebook is started from.
ROOT_PATH = Path.cwd()
DATA_PATH = ROOT_PATH / '../data'
# Create the data folder up front; exist_ok makes re-running this cell harmless.
DATA_PATH.mkdir(parents=True, exist_ok=True)
# Storage bucket identifier passed to the google.cloud.storage client when the
# layout vectors are downloaded.
BUCKET_ID = 'crowdstf-rico-uiuc-4540'

In [None]:
# Build the RICO semantic-annotations dataset. Every item is passed through
# process_data, normalize_bboxes and add_networkx, in that order. Later cells
# read the 'filename' and 'graph' entries of each item.
# NOTE(review): the effect of only_data=False is defined inside
# RICOSemanticAnnotationsDataset and is not visible here — confirm there.
rico_dataset = RICOSemanticAnnotationsDataset(
    transform=transforms.Compose([
        process_data,
        normalize_bboxes,
        add_networkx,
    ]),
    only_data=False
)

In [None]:
# Materialise every transformed datapoint in memory, using DataLoader workers
# to run the dataset transforms in parallel.
dataloader = DataLoader(
    rico_dataset,
    batch_size=1,
    num_workers=16,
    collate_fn=default_data_collate,
)

dataset = []
for data in tqdm(dataloader):
    # Each batch is an iterable of datapoints; append them one by one.
    dataset.extend(data)

# Order by file name so the list order does not depend on worker scheduling.
dataset.sort(key=lambda item: item['filename'])

In [None]:
# Two arbitrary dataset items used as a worked example in the cells below.
datapoint_1 = rico_dataset[200]
datapoint_2 = rico_dataset[1000]

In [None]:
# Draw the first example datapoint using the dataset's label -> colour mapping.
fig = plot_datapoint(datapoint_1, rico_dataset.label_color_map)

In [None]:
# Draw the second example datapoint using the dataset's label -> colour mapping.
fig = plot_datapoint(datapoint_2, rico_dataset.label_color_map)

## Compute the edit distance (ED) between both trees

In [None]:
# Edit distance between the two example layouts; both graphs are converted to
# undirected graphs before being compared.
compute_edit_distance(datapoint_1['graph'].to_undirected(), datapoint_2['graph'].to_undirected())

In [None]:
# Map every node label that occurs in either example graph to an integer id,
# assigned in sorted label order.
node_labels = {
    label: idx
    for idx, label in enumerate(sorted({
        attrs['label']
        for graph in (datapoint_1['graph'], datapoint_2['graph'])
        for _, attrs in graph.nodes(data=True)
    }))
}
image_shape = (256, 256)

In [None]:
# IoU-based similarity between the two example layouts; the result is a
# mapping that later cells read through its 'iou' key.
compute_iou(datapoint_1, datapoint_2)

In [None]:
def compute_pairs(datapoint, dataset, pos_thresh=0.9, neg_thresh=0.1):
    """Find positive and negative partners for ``datapoint`` under both metrics.

    Every candidate in ``dataset`` is compared against ``datapoint`` with
    ``compute_iou`` and ``compute_edit_distance``. The edit distance is turned
    into a similarity as ``1 - normalized_edit_distance``. A candidate whose
    similarity is ``>= pos_thresh`` is recorded as a positive and one whose
    similarity is ``<= neg_thresh`` as a negative, separately for each metric.
    The scan stops as soon as all four slots are filled; until then a later
    match overwrites an earlier one in the same slot.

    Args:
        datapoint: Anchor item; must provide 'filename' and 'graph'.
        dataset: Iterable of candidate items with the same entries.
        pos_thresh: Minimum similarity for a positive pair.
        neg_thresh: Maximum similarity for a negative pair.

    Returns:
        dict with 'graph' (the anchor file name) plus any of 'positive_iou',
        'negative_iou', 'positive_ged' and 'negative_ged' that were found.
        Each of those holds the partner file name under 'graph' together with
        the metric's own result fields.
    """
    sample = {
        'graph': datapoint['filename'],
    }
    wanted = ('positive_iou', 'negative_iou', 'positive_ged', 'negative_ged')

    # NOTE(review): if `datapoint` is itself part of `dataset` it is also
    # compared against itself here — confirm whether that is intended.
    for datapoint2 in dataset:
        iou = compute_iou(datapoint, datapoint2)
        ted = compute_edit_distance(datapoint['graph'].to_undirected(), datapoint2['graph'].to_undirected())
        normalized_ted = 1 - ted['normalized_edit_distance']

        if iou['iou'] >= pos_thresh:
            sample['positive_iou'] = {
                'graph': datapoint2['filename'],
                **iou
            }
        if iou['iou'] <= neg_thresh:
            sample['negative_iou'] = {
                'graph': datapoint2['filename'],
                **iou
            }

        if normalized_ted >= pos_thresh:
            sample['positive_ged'] = {
                'graph': datapoint2['filename'],
                **ted
            }
        if normalized_ted <= neg_thresh:
            sample['negative_ged'] = {
                'graph': datapoint2['filename'],
                **ted
            }

        if all(key in sample for key in wanted):
            break

    # Return only after the scan. The previous version returned from inside
    # the loop, so it never looked past the first candidate and returned None
    # for an empty dataset.
    return sample

In [None]:
def compute_distances(datapoint1, dataset):
    """Compare ``datapoint1`` with 100 randomly drawn items of ``dataset``.

    The partners are drawn with ``random.choices``, i.e. with replacement.
    For each partner the results of ``compute_iou`` and
    ``compute_edit_distance`` (on the undirected graphs) are merged into a
    single dict.

    Returns:
        list of 100 dicts, one per sampled partner, in sampling order.
    """
    return [
        {
            **compute_iou(datapoint1, partner),
            **compute_edit_distance(
                datapoint1['graph'].to_undirected(),
                partner['graph'].to_undirected(),
            ),
        }
        for partner in random.choices(dataset, k=100)
    ]

In [None]:
# Estimate the metric distributions on a random sample: 100 anchors, each
# compared with 100 random partners inside compute_distances, spread over 16
# joblib workers. random.choices draws with replacement, so anchors can repeat.
# NOTE(review): no random seed is set in the visible cells, so the sample (and
# the statistics below) change between runs.
final_distances = Parallel(n_jobs=16)(delayed(compute_distances)(datapoint1, dataset) for datapoint1 in tqdm(random.choices(dataset, k=100), total=100))

In [None]:
# Flatten the per-anchor lists returned by the parallel job into one flat list
# of result dicts. This cell rebinds its own input, so the isinstance guard
# keeps it idempotent: without it, running the cell a second time would
# iterate over the dicts themselves and replace every record with its keys.
final_distances = [
    record
    for per_anchor in final_distances
    for record in (per_anchor if isinstance(per_anchor, list) else [per_anchor])
]

In [None]:
# Histogram of the sampled pairwise IoU values.
pd.DataFrame(final_distances)['iou'].plot.hist()

In [None]:
# Summary statistics of the sampled IoU values, including the 10th and 90th
# percentiles in addition to the quartiles.
pd.DataFrame(final_distances)['iou'].describe(percentiles=[.1, .25, .5, .75, .9])

In [None]:
# Histogram of the sampled normalized edit distances.
pd.DataFrame(final_distances)['normalized_edit_distance'].plot.hist()


In [None]:
# Summary statistics of the sampled normalized edit distances, including the
# 10th and 90th percentiles in addition to the quartiles.
pd.DataFrame(final_distances)['normalized_edit_distance'].describe(percentiles=[.1, .25, .5, .75, .9])


## Neighbour Layouts

In [5]:
import json
import random
import zipfile
import numpy as np
import gzip

from pathlib import Path
from google.cloud import storage
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from tqdm.autonotebook import tqdm
from layout_gnn.dataset.dataset import RICOSemanticAnnotationsDataset
from layout_gnn.dataset.transformations import *
from layout_gnn.similarity_metrics import compute_edit_distance, compute_iou
from torchvision import transforms



# Same path/bucket configuration as the first cell of the notebook.
# NOTE(review): presumably repeated so this section can be run on a fresh
# kernel without the cells above — confirm, or merge into one config cell.
ROOT_PATH = Path.cwd()
DATA_PATH = ROOT_PATH / '../data'
DATA_PATH.mkdir(parents=True, exist_ok=True)
BUCKET_ID = 'crowdstf-rico-uiuc-4540'

In [6]:
# Rebuild the dataset for this section with the same transform pipeline as at
# the top of the notebook (process_data -> normalize_bboxes -> add_networkx).
# The cells below use rico_dataset.files and the 'graph' entry of each item.
rico_dataset = RICOSemanticAnnotationsDataset(
    transform=transforms.Compose([
        process_data,
        normalize_bboxes,
        add_networkx,
    ]),
    only_data=False
)

In [7]:
# Fetch the precomputed UI layout vectors once and cache them under DATA_PATH.
zip_filename = DATA_PATH / 'ui_layout_vectors.zip'
if not zip_filename.exists():
    # Anonymous client: no credentials are supplied for the download.
    client = storage.Client.create_anonymous_client()
    bucket = client.bucket(BUCKET_ID)

    blob = bucket.blob('rico_dataset_v0.1/ui_layout_vectors.zip')
    blob.download_to_filename(zip_filename)

# Extraction is skipped when the target folder already exists.
# NOTE(review): this relies on the archive containing a top-level
# 'ui_layout_vectors' folder, since it is extracted into DATA_PATH — confirm.
extracted_folder = DATA_PATH / 'ui_layout_vectors'
if not extracted_folder.exists():
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(DATA_PATH)

In [9]:
# Load the precomputed layout embeddings and the file name belonging to each row.
embeddings = np.load(extracted_folder / 'ui_vectors.npy')
# Context managers close the JSON files; a bare json.load(open(...)) leaves the
# handles open until garbage collection.
with open(extracted_folder / 'ui_names.json') as fp:
    files = json.load(fp)['ui_names']
# NOTE(review): despite the `train_*` names, this reads the 'test_uis' key of
# train_split.json — confirm the intended split. The names are kept because
# other cells may reference them.
with open(DATA_PATH / 'train_split.json') as fp:
    train_split = json.load(fp)['test_uis']
train_files = set(files) & set(train_split)

# Name -> first row index, built once. This matches list.index (first
# occurrence) without a linear scan of `files` for every dataset file.
name_to_row = {}
for row_idx, name in enumerate(files):
    name_to_row.setdefault(name, row_idx)

# Filter the dataset files once so ids and names stay aligned by construction.
selected_files = [f for f in rico_dataset.files if f.stem + '.png' in train_files]
files_ids = [name_to_row[f.stem + '.png'] for f in selected_files]
files_names = [f.stem for f in selected_files]
# Keep only the embedding rows of the selected files, in dataset-file order.
embeddings = embeddings[files_ids, :]

In [10]:
# Pairwise Euclidean distances between all selected embeddings. pdist returns
# the condensed form; squareform expands it to a full symmetric matrix so each
# row can be sorted independently below.
# The full matrix is quadratic in the number of files, which gets large in
# memory for the ~13k rows processed by the next cell.
distances = pdist(embeddings, metric='euclidean')
dist_matrix = squareform(distances)

In [11]:
# For every anchor file, record its (up to) 1000 nearest and 1000 farthest
# files in embedding space. Both lists are ordered by increasing distance.
data = []
for i, row in tqdm(enumerate(dist_matrix), total=len(dist_matrix)):
    indexes = np.argsort(row)
    # Drop the anchor by index rather than assuming it is sorted first. When
    # other rows are at distance 0 (identical embeddings) the anchor is not
    # guaranteed to be at position 0, so slicing from 1 could keep the anchor
    # in its own neighbour list and drop a real neighbour instead.
    indexes = indexes[indexes != i]
    data.append({
        'anchor': files_names[i],
        'closest': [files_names[j] for j in indexes[:1000]],
        'farthest': [files_names[j] for j in indexes[-1000:]],
    })

  0%|          | 0/13110 [00:00<?, ?it/s]

In [12]:
# Persist the neighbour lists as gzip-compressed JSON ('wt' = text mode write).
with gzip.open(DATA_PATH / 'neighbors_test.gzip', 'wt') as fp:
    json.dump(data, fp)

In [None]:
# Read a neighbour file back as a sanity check.
# NOTE(review): this opens 'neighbors.gzip', while the cell above writes
# 'neighbors_test.gzip', and `test` is not used by any later visible cell —
# confirm which file is meant; on a fresh checkout 'neighbors.gzip' may not exist.
with gzip.open(DATA_PATH / 'neighbors.gzip', 'rt') as fp:
    test = json.load(fp)

In [10]:
# Lookup from a file stem to its position in the dataset, so that neighbours
# stored by name can be fetched as rico_dataset[position].
file2idx = {path.stem: position for position, path in enumerate(rico_dataset.files)}

In [None]:
# For each of the first `ntotal` anchors, walk its nearest neighbours in order
# and keep the first one whose IoU with the anchor exceeds 0.50.
ntotal, pos_iou = 1000, []
for i in tqdm(range(ntotal), total=ntotal):
    # Load the anchor once per anchor instead of once per candidate: indexing
    # the dataset runs the whole transform pipeline each time.
    # Assumes dataset items are deterministic and compute_iou does not modify
    # its arguments — TODO confirm.
    anchor = rico_dataset[file2idx[data[i]['anchor']]]
    for file in data[i]['closest']:
        iou = compute_iou(anchor, rico_dataset[file2idx[file]])

        if iou['iou'] > 0.50:
            pos_iou.append(file)
            break

    else:
        # No neighbour passed the threshold. Nothing is appended for this
        # anchor, so pos_iou is then no longer index-aligned with the anchors.
        print('Bodega')


In [None]:
# For each of the first `ntotal` anchors, walk its nearest neighbours in order
# and keep the first one whose normalized edit distance to the anchor is
# below 0.35.
ntotal, pos_ged = 1000, []
for i in tqdm(range(ntotal), total=ntotal):
    # Load and convert the anchor graph once per anchor instead of once per
    # candidate: indexing the dataset runs the whole transform pipeline.
    # Assumes dataset items are deterministic and compute_edit_distance does
    # not modify its arguments — TODO confirm.
    anchor_graph = rico_dataset[file2idx[data[i]['anchor']]]['graph'].to_undirected()
    for file in data[i]['closest']:
        ged = compute_edit_distance(anchor_graph,
                                    rico_dataset[file2idx[file]]['graph'].to_undirected())

        if ged['normalized_edit_distance'] < 0.35:
            pos_ged.append(file)
            break

    else:
        # No neighbour passed the threshold. Nothing is appended for this
        # anchor, so pos_ged is then no longer index-aligned with the anchors.
        print('Bodega')

In [None]:
# Fraction of anchors for which both criteria picked the same positive file.
# Assumes pos_ged and pos_iou are index-aligned per anchor — TODO confirm:
# an anchor without a match adds no entry when those lists are built.
sum(1 for file_ged, file_iou in zip(pos_ged, pos_iou) if file_ged == file_iou) / len(pos_iou)

In [None]:
# For each of the first `ntotal` anchors, walk its 'farthest' list in stored
# order and keep the first file whose IoU with the anchor is below 0.32.
# NOTE(review): 'farthest' is stored by increasing distance, so this scan
# starts at the nearest of the far files — confirm that is the intended order.
ntotal, neg_iou = 1000, []
for i in tqdm(range(ntotal), total=ntotal):
    # Load the anchor once per anchor instead of once per candidate: indexing
    # the dataset runs the whole transform pipeline each time.
    # Assumes dataset items are deterministic and compute_iou does not modify
    # its arguments — TODO confirm.
    anchor = rico_dataset[file2idx[data[i]['anchor']]]
    for file in data[i]['farthest']:
        iou = compute_iou(anchor, rico_dataset[file2idx[file]])

        if iou['iou'] < 0.32:
            neg_iou.append(file)
            break

    else:
        # No candidate passed the threshold. Nothing is appended for this
        # anchor, so neg_iou is then no longer index-aligned with the anchors.
        print('Bodega')


In [None]:
# Fraction of anchors for which both criteria picked the same negative file.
# NOTE(review): `neg_ged` is not assigned anywhere in the visible notebook, so
# this raises NameError on a fresh kernel — the cell that builds it (an
# edit-distance counterpart of the neg_iou loop) appears to be missing.
sum(int(file_ged == file_iou) for file_ged, file_iou in zip(neg_ged, neg_iou)) / len(neg_iou)
