## Multi Dataset Size Statistics
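This notebook collects the encoded size of sampled geometries under different FPDE configurations and compares them with the FPD baseline and with WKB (plain, gzip-compressed and bzip2-compressed).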

In [1]:
# Load the datasets
import jsonlines
import glob
import osmnx as ox
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import folium
import tqdm
import shapely
import algos.fpd_extended_lib.cfg as cfg
import random
import os
from shapely.wkt import loads
import numpy as np
import re
import seaborn as sns
from algos.alg_fpd_extended import FpdExtended
import pickle
import gzip
import bz2

alg = FpdExtended()

# Number of shapes drawn (with replacement) from each dataset; -1 disables sampling.
NBR_ITER = 100
# Fixed seed so that regenerating the sample yields the same shapes every time.
SAMPLE_SEED = 42

DATASETS_PATHS = ["QGIS/Intersection_Tricky_Cases.shp", "QGIS/Report_Visualizations.shp"]
# True: read the previously stored sample. False: rebuild it from DATASETS_PATHS.
LOAD_SAMPLED_DATASETS = True
# Only consulted when the sample is rebuilt (LOAD_SAMPLED_DATASETS is False).
STORE_SAMPLED_DATASETS = False
SAMPLED_DATASETS_FILE = 'data/compression_alternatives_ds_sampled.pkl'

datasets = []
if LOAD_SAMPLED_DATASETS:
    # NOTE(security): unpickling can execute arbitrary code. Only load a file
    # that was produced by this notebook.
    with open(SAMPLED_DATASETS_FILE, 'rb') as pkl_file:
        datasets = pickle.load(pkl_file)
else:
    for ds in DATASETS_PATHS:
        # An entry is either a single shapefile or a directory of shapefiles.
        # The directory listing is sorted so the concatenation order, and
        # therefore the seeded sample, is deterministic.
        if ds.endswith('.shp'):
            files = [ds]
        else:
            files = sorted(glob.glob(ds + '/*.shp'))

        geometry_parts = []
        for shp_path in files:
            file_df = gpd.read_file(shp_path)
            print("---", ds, "---")
            print("Count of entries:", len(file_df))
            # Rows whose geometry type is "Point" are excluded from the measurement.
            file_df = file_df[file_df.geom_type != "Point"]
            print("Count of entries (no points):", len(file_df))
            if len(file_df) > 0:
                geometry_parts.append(file_df.geometry)

        if not geometry_parts:
            # Nothing usable in this dataset; it contributes no entry.
            continue

        # Concatenate once instead of growing the series file by file.
        if len(geometry_parts) > 1:
            ds_shapes = pd.concat(geometry_parts)
        else:
            ds_shapes = geometry_parts[0]

        if NBR_ITER != -1:
            ds_shapes = ds_shapes.sample(n=NBR_ITER, replace=True, random_state=SAMPLE_SEED)
        datasets.append(ds_shapes)

    if STORE_SAMPLED_DATASETS:
        with open(SAMPLED_DATASETS_FILE, 'wb') as pkl_file:
            pickle.dump(datasets, pkl_file)

In [2]:
# Each entry describes one algorithm variant to measure:
#   'name'              - label written to the "Algorithm" column of the results
#   'setup_function'    - optional; called with no arguments before measuring,
#                         it overrides attributes on `cfg`
#   'compress_function' - optional; called with one shape and returns the
#                         encoded bytes. When absent the FPDE algorithm is used.
# Every setup function has its own name so that a later definition cannot
# silently shadow an earlier one.
configs = []


## ------ BASELINE ------ ##
def setup_baseline():
    """Set cfg.BASELINE_ON and the three cfg.DISABLE_OPTIMIZED_* flags to True."""
    cfg.BASELINE_ON = True
    cfg.DISABLE_OPTIMIZED_INTERSECTION = True
    cfg.DISABLE_OPTIMIZED_ADD_VERTEX = True
    cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = True


configs.append({'name': "FPD (Baseline)", 'setup_function': setup_baseline})


## ------ FPDE ------ ##
configs.append({'name': "FPDE"})


## ------ FPDE + Entropy ------ ##
def setup_entropy():
    """Currently a no-op: the entropy override below is disabled.

    NOTE(review): with the override commented out this configuration changes
    nothing on cfg, so it measures the same thing as "FPDE" - confirm intent.
    """
    pass  # cfg.ENTROPY_METHOD = cfg.EM.AUTO


configs.append({'name': "FPDE + Entropy", 'setup_function': setup_entropy})


## ------ FPDE: Size Optimal ------ ##
def setup_size_optimal():
    """Set the optimized-intersection and optimized-bounding-box disable flags to True."""
    # cfg.ENTROPY_METHOD = cfg.EM.AUTO
    # cfg.ENTROPY_RANDOM_ACCESS = False
    cfg.DISABLE_OPTIMIZED_INTERSECTION = True
    cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = True


configs.append({'name': "FPDE: Size Optimal", 'setup_function': setup_size_optimal})


## ------ FPDE - FLOAT DELTAS ------ ##
def setup_arbitrary_precision():
    """Set USE_DEFAULT_DOUBLE=True, FLOAT_SIZE=64 and USE_FPINT=False on cfg."""
    cfg.USE_DEFAULT_DOUBLE = True
    cfg.FLOAT_SIZE = 64
    cfg.USE_FPINT = False


configs.append({'name': "FPDE: Arbitrary Precision", 'setup_function': setup_arbitrary_precision})


## ------ WKB ------ ##
def compress_wkb(shp):
    """Return the WKB serialization of `shp` produced by shapely.to_wkb."""
    return shapely.to_wkb(shp)


configs.append({'name': "WKB", 'compress_function': compress_wkb})


## ------ WKB + GZIP ------ ##
def compress_wkb_gzip(shp):
    """Return the WKB serialization of `shp`, compressed with gzip."""
    return gzip.compress(shapely.to_wkb(shp))


configs.append({'name': "WKB + GZIP", 'compress_function': compress_wkb_gzip})


## ------ WKB + BZ2 ------ ##
def compress_wkb_bz2(shp):
    """Return the WKB serialization of `shp`, compressed with bzip2."""
    return bz2.compress(shapely.to_wkb(shp))


configs.append({'name': "WKB + BZIP2", 'compress_function': compress_wkb_bz2})


In [3]:
# Empty long-format result table; the evaluation step appends one row per measured shape.
RESULT_COLUMNS = ["Dataset", "Algorithm", "Size"]
df = pd.DataFrame(columns=RESULT_COLUMNS)

# Attributes of `cfg` that are snapshotted before a configuration is applied
# and written back afterwards. Keeping the names in one place guarantees that
# the save list and the restore list cannot drift apart.
_CFG_FIELDS = (
    "USE_DEFAULT_DOUBLE",
    "ENTROPY_METHOD",
    "CHUNK_COMP_METHOD",
    "D_BITSIZE_SIZE",
    "POLY_RING_CNT_SIZE",
    "RING_CHK_CNT_SIZE",
    "MAX_NUM_DELTAS",
    "D_CNT_SIZE",
    "BASELINE_ON",
    "DISABLE_OPTIMIZED_UNPACKING",
    "DELTA_ENCODE_CHUNK_BBOXES",
    "DISABLE_OPTIMIZED_INTERSECTION",
    "DISABLE_OPTIMIZED_ADD_VERTEX",
    "DISABLE_OPTIMIZED_BOUNDING_BOX",
    "binary_length",
    "offset",
    "FLOAT_SIZE",
    "ENTROPY_PARAM",
    "USE_FPINT",
    "USE_ENTROPY",
    "COMPRESS_CHUNK",
    "EXPONENT",
    "EOF_THRESHOLD",
)


def evaluate_fpde_config(shapes, df, dataset_name, config):
    """Encode every shape with one configuration and record the encoded length.

    Parameters
    ----------
    shapes : iterable
        Shapes to encode; each is passed to the compress function.
    df : pandas.DataFrame
        Result table with the columns Dataset, Algorithm and Size. It is
        extended in place, one row per shape.
    dataset_name : str
        Value written to the "Dataset" column.
    config : dict
        Must contain 'name'. May contain 'setup_function' (called once with no
        arguments before encoding) and 'compress_function' (called with one
        shape). Without 'compress_function', ``alg.compress(shape)[1]`` is
        measured.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    The listed `cfg` attributes are restored to their previous values when the
    function exits, including when the setup function or the encoding raises,
    so a failing configuration cannot leak its overrides into later runs.
    """
    saved_cfg = {field: getattr(cfg, field) for field in _CFG_FIELDS}
    try:
        if 'setup_function' in config:
            config['setup_function']()

        use_custom = 'compress_function' in config
        for shape in tqdm.tqdm(shapes):
            if use_custom:
                encoded = config['compress_function'](shape)
            else:
                # Only the second element of the result is measured.
                encoded = alg.compress(shape)[1]
            # Appending through .loc keeps the in-place update of `df` that
            # callers may rely on.
            df.loc[len(df)] = [dataset_name, config['name'], len(encoded)]
    finally:
        for field, value in saved_cfg.items():
            setattr(cfg, field, value)
    return df

# Display names for the datasets, matched to `datasets` by position.
# NOTE(review): confirm that this order matches the order of the loaded datasets.
ds_labels = ["Sweden Buildings", "Sweden Roads", "Sweden All", "China Water", "Country Borders"]
for c in configs:
    for ds_i, ds in enumerate(datasets):
        # Fall back to a generic label instead of raising IndexError when more
        # datasets were loaded than labels are listed.
        if ds_i < len(ds_labels):
            ds_label = ds_labels[ds_i]
        else:
            ds_label = f"Dataset {ds_i + 1}"
        df = evaluate_fpde_config(ds, df, ds_label, c)

100%|██████████| 100/100 [00:00<00:00, 309.75it/s]
100%|██████████| 100/100 [00:00<00:00, 628.25it/s]
100%|██████████| 100/100 [00:00<00:00, 332.66it/s]
100%|██████████| 100/100 [00:00<00:00, 351.12it/s]
100%|██████████| 100/100 [00:00<00:00, 302.44it/s]
100%|██████████| 100/100 [00:00<00:00, 553.10it/s]
100%|██████████| 100/100 [00:00<00:00, 388.47it/s]
100%|██████████| 100/100 [00:00<00:00, 592.57it/s]
100%|██████████| 100/100 [00:00<00:00, 505.78it/s]
100%|██████████| 100/100 [00:00<00:00, 616.85it/s]
100%|██████████| 100/100 [00:00<00:00, 859.88it/s]
100%|██████████| 100/100 [00:00<00:00, 892.28it/s]
100%|██████████| 100/100 [00:00<00:00, 795.38it/s]
100%|██████████| 100/100 [00:00<00:00, 846.52it/s]
100%|██████████| 100/100 [00:00<00:00, 480.75it/s]
100%|██████████| 100/100 [00:00<00:00, 691.40it/s]


In [4]:
import plotly.express as px
def plot_total_time_box(df):
    """Show box plots of the "Size" column per algorithm, one facet per dataset.

    The function name is kept for existing callers; the plot shows the values
    of the "Size" column, not execution times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Algorithm", "Size" and "Dataset".
    """
    fig = px.box(
        df,
        x="Algorithm",
        y="Size",
        facet_col="Dataset",
        color="Dataset",
        boxmode="group",
        facet_col_spacing=0.1,
        facet_row_spacing=0.1,
        title="Encoded Size per Algorithm and Dataset",
        facet_col_wrap=3,
        height=700,
    )
    # Give every facet its own y-range and keep tick labels visible on each.
    fig.update_yaxes(matches=None)
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))

    fig.show()

# Show the collected measurements as a table, then as grouped box plots.
display(df)
plot_total_time_box(df)

Unnamed: 0,Dataset,Algorithm,Size
0,Sweden Buildings,FPD (Baseline),52
1,Sweden Buildings,FPD (Baseline),212
2,Sweden Buildings,FPD (Baseline),70
3,Sweden Buildings,FPD (Baseline),279
4,Sweden Buildings,FPD (Baseline),279
...,...,...,...
1595,Sweden Roads,WKB + BZIP2,563
1596,Sweden Roads,WKB + BZIP2,166
1597,Sweden Roads,WKB + BZIP2,166
1598,Sweden Roads,WKB + BZIP2,402
