## Multi Dataset Size Statistics
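This notebook collects and compares the compressed size of geometries from several datasets under different FPD/FPDE configurations, with plain and generically compressed WKB as reference points.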

In [None]:
import jsonlines
import glob
import osmnx as ox
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import folium
import tqdm
import shapely
import algos.fpd_extended_lib.cfg as cfg
import random
import os
from shapely.wkt import loads
import numpy as np
import re
import seaborn as sns
from algos.alg_fpd_extended import FpdExtended
import pickle
import gzip
import bz2

alg = FpdExtended()
NBR_ITER = 30000  # shapes drawn (with replacement) per dataset; -1 keeps every shape
SAMPLE_SEED = 42  # fixed seed so that re-sampling draws the same shapes again

DATASETS_PATHS = ["data/sweden-latest-free/gis_osm_buildings_a_free_1.shp", "data/sweden-latest-free/gis_osm_roads_free_1.shp", 'data/sweden-latest-free', "data/china-latest-free/gis_osm_water_a_free_1.shp", "data/final_country_borders.shp"]
SAMPLED_DATASETS_PATH = 'data/compression_alternatives_ds_sampled.pkl'
LOAD_SAMPLED_DATASETS = True
STORE_SAMPLED_DATASETS = False
datasets = []
if not LOAD_SAMPLED_DATASETS:
    for ds in DATASETS_PATHS:
        # A path that is not itself a shapefile is treated as a directory of
        # shapefiles; sorting makes the file order (and thereby the seeded
        # sample) independent of the file system.
        files = [ds] if ds.endswith('.shp') else sorted(glob.glob(ds + '/*.shp'))

        # Geometry columns of every file that still has entries after filtering.
        shape_parts = []
        for f in files:
            file_df = gpd.read_file(f)
            print("---", ds, "---")
            print("Count of entries:", len(file_df))
            # geom_type is the explicit geometry-type accessor; `.type` is easy
            # to confuse with an attribute column called "type".
            file_df = file_df[file_df.geom_type != "Point"]
            print("Count of entries (no points):", len(file_df))
            if len(file_df) > 0:
                shape_parts.append(file_df.geometry)

        # Datasets without any usable geometry are left out entirely.
        if not shape_parts:
            continue

        # Concatenate once instead of growing the series file by file.
        ds_shapes = pd.concat(shape_parts)
        if NBR_ITER != -1:
            ds_shapes = ds_shapes.sample(n=NBR_ITER, replace=True, random_state=SAMPLE_SEED)
        datasets.append(ds_shapes)
    if STORE_SAMPLED_DATASETS:
        with open(SAMPLED_DATASETS_PATH, 'wb') as f:
            pickle.dump(datasets, f)
else:
    # NOTE: unpickling can execute arbitrary code -- only load a file that was
    # produced by this notebook (STORE_SAMPLED_DATASETS = True).
    with open(SAMPLED_DATASETS_PATH, 'rb') as f:
        datasets = pickle.load(f)

In [None]:
# Every entry of `configs` describes one compression setup to evaluate:
#   'name'              -- label used in the result table and the plots
#   'setup_function'    -- optional; called without arguments before the
#                          evaluation to adjust the FPDE settings in `cfg`
#   'compress_function' -- optional; maps a shape to its compressed bytes.
#                          When absent, the FPDE algorithm is used.
# Each setup function has its own name, so no definition shadows another.


def setup_baseline():
    """Set the cfg values used for the "FPD (Baseline)" run."""
    cfg.MAX_NUM_DELTAS = 99999
    cfg.BASELINE_ON = True
    cfg.DISABLE_OPTIMIZED_INTERSECTION = True
    cfg.DISABLE_OPTIMIZED_ADD_VERTEX = True
    cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = True
    cfg.USE_DEFAULT_DOUBLE = True
    cfg.FLOAT_SIZE = 64
    cfg.USE_FPINT = False


def setup_entropy():
    """Set cfg.ENTROPY_METHOD to cfg.EM.AUTO."""
    cfg.ENTROPY_METHOD = cfg.EM.AUTO


def setup_size_optimized():
    """Set the cfg values used for the "FPDE: Size Optimized" run."""
    cfg.ENTROPY_METHOD = cfg.EM.AUTO
    cfg.DISABLE_RANDOM_ACCESS = True
    cfg.DISABLE_OPTIMIZED_INTERSECTION = True
    cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = True


def setup_float_deltas():
    """Set the cfg values used for the "FPDE: Arbitrary Precision" run."""
    cfg.USE_DEFAULT_DOUBLE = True
    cfg.FLOAT_SIZE = 64
    cfg.USE_FPINT = False


def compress_wkb(shp):
    """Return the WKB encoding of `shp`."""
    return shapely.to_wkb(shp)


def compress_wkb_gzip(shp):
    """Return the gzip-compressed WKB encoding of `shp`."""
    return gzip.compress(shapely.to_wkb(shp))


def compress_wkb_bz2(shp):
    """Return the bz2-compressed WKB encoding of `shp`."""
    return bz2.compress(shapely.to_wkb(shp))


configs = [
    ## ------ BASELINE ------ ##
    {'name': "FPD (Baseline)", 'setup_function': setup_baseline},
    ## ------ FPDE ------ ##
    {'name': "FPDE"},
    ## ------ FPDE + Entropy ------ ##
    {'name': "FPDE: Entropy Encoding", 'setup_function': setup_entropy},
    ## ------ FPDE + Size Optimal ------ ##
    {'name': "FPDE: Size Optimized", 'setup_function': setup_size_optimized},
    ## ------ FPDE - FLOAT DELTAS ------ ##
    {'name': "FPDE: Arbitrary Precision", 'setup_function': setup_float_deltas},
    ## ------ WKB ------ ##
    {'name': "WKB", 'compress_function': compress_wkb},
    ## ------ WKB + GZIP ------ ##
    {'name': "WKB: GZIP", 'compress_function': compress_wkb_gzip},
    ## ------ WKB + BZ2 ------ ##
    {'name': "WKB: BZIP2", 'compress_function': compress_wkb_bz2},
]


In [None]:
DO_EVALUATE = False

# Names of the cfg attributes that are snapshotted before a configuration is
# evaluated and written back afterwards.
CFG_ATTRIBUTES = (
    "USE_DEFAULT_DOUBLE", "ENTROPY_METHOD", "DISABLE_RANDOM_ACCESS", "CHUNK_COMP_METHOD",
    "D_BITSIZE_SIZE", "POLY_RING_CNT_SIZE", "RING_CHK_CNT_SIZE", "MAX_NUM_DELTAS",
    "D_CNT_SIZE", "BASELINE_ON", "DISABLE_OPTIMIZED_UNPACKING", "DELTA_ENCODE_CHUNK_BBOXES",
    "DISABLE_OPTIMIZED_INTERSECTION", "DISABLE_OPTIMIZED_ADD_VERTEX", "DISABLE_OPTIMIZED_BOUNDING_BOX",
    "binary_length", "offset", "FLOAT_SIZE", "ENTROPY_PARAM", "USE_FPINT",
    "USE_ENTROPY", "COMPRESS_CHUNK", "EXPONENT", "EOF_THRESHOLD",
)


def evaluate_fpde_config(shapes, df, dataset_name, config):
    """Compress every shape with one configuration and record the sizes.

    Parameters:
        shapes: iterable of geometries to compress.
        df: result frame with the columns
            ["Dataset", "Algorithm", "Size", "Relative Size", "Wkb Size"].
        dataset_name: label stored in the "Dataset" column.
        config: dict with a 'name' and optionally a 'setup_function' and/or
            a 'compress_function' (without the latter, `alg.compress` is used).

    Returns:
        A frame holding the rows of `df` followed by one row per shape.
        `df` itself is not modified, so callers must use the return value.

    The cfg attributes listed in CFG_ATTRIBUTES are restored to their previous
    values when the function exits, also when compression raises, so a failing
    configuration cannot leak its settings into the next one.
    """
    old_cfg = {attr: getattr(cfg, attr) for attr in CFG_ATTRIBUTES}
    rows = []
    try:
        if 'setup_function' in config:
            config['setup_function']()

        for s in tqdm.tqdm(shapes):
            if 'compress_function' in config:
                compressed = config['compress_function'](s)
            else:
                compressed = alg.compress(s)[1]
            bin_len = len(compressed)
            wkb_len = len(shapely.to_wkb(s))
            rows.append([dataset_name, config['name'], bin_len, wkb_len / bin_len, wkb_len])
    finally:
        for attr, value in old_cfg.items():
            setattr(cfg, attr, value)

    # Build the new rows in one go; appending them to the frame one by one
    # copies the whole frame for every shape.
    if not rows:
        return df
    new_rows = pd.DataFrame(rows, columns=df.columns)
    if df.empty:
        return new_rows
    return pd.concat([df, new_rows], ignore_index=True)


if DO_EVALUATE:
    # Create resulting df
    df = pd.DataFrame(columns=["Dataset", "Algorithm", "Size", "Relative Size", "Wkb Size"])

    ds_labels = ["Sweden Buildings", "Sweden Roads", "Sweden All", "China Water", "Country Borders"]
    for c in configs:
        for ds_i, ds in enumerate(datasets):
            df = evaluate_fpde_config(ds, df, ds_labels[ds_i], c)

In [None]:
# Freshly evaluated results are kept (and optionally stored); the stored
# results are only loaded when the evaluation above was skipped. Loading
# unconditionally would replace a fresh evaluation with the old file content.
STATS_PATH = 'data/compression_config_stats.pkl'
STORE_STATS = False

if DO_EVALUATE:
    if STORE_STATS:
        with open(STATS_PATH, 'wb') as f:
            pickle.dump(df, f)
else:
    # NOTE: unpickling can execute arbitrary code -- only load a file that was
    # written by this notebook.
    with open(STATS_PATH, 'rb') as f:
        df = pickle.load(f)


In [None]:
import plotly.express as px
def plot_total_time_box(df, y='Size', title=""):
    """Show a box plot of column `y` per algorithm, with one facet per dataset.

    Parameters:
        df: frame with at least the columns "Algorithm", "Dataset" and `y`.
        y: name of the column drawn on the y-axis.
        title: figure title.

    The figure is displayed directly; nothing is returned.
    """
    box_options = dict(
        x="Algorithm",
        y=y,
        facet_col="Dataset",
        facet_col_wrap=3,
        facet_col_spacing=0.1,
        facet_row_spacing=0.1,
        color="Algorithm",
        boxmode="group",
        title=title,
        labels=dict(sizes="Sizes", context="Context", baseline="Baseline"),
        height=900,
        width=1200,
    )
    figure = px.box(df, **box_options)

    # Unlink the y-axes of the facets and show tick labels on each of them.
    figure.update_yaxes(matches=None, showticklabels=True)

    figure.show()

# One figure for the compressed size and one for the size relative to WKB,
# followed by the underlying table.
for column, heading in (("Size", "Compression Size"), ("Relative Size", "Compression Factor")):
    plot_total_time_box(df, y=column, title=heading)
display(df)

In [None]:
# Boxen plot of the compression factor per algorithm, one panel per dataset.
p = sns.catplot(kind='boxen', data=df, x='Algorithm', y='Relative Size', col='Dataset', col_wrap=2, height=4)

for ax in p.axes.flat:
    # Hide the x tick labels (the algorithm names) on every panel.
    ax.tick_params(labelbottom=False)
    # Switch the grid on explicitly for each panel; a bare plt.grid() only
    # toggles the grid of the panel that was drawn last.
    ax.grid(True)

# Lay out the panels first and reserve the head room afterwards; the other way
# round, tight_layout would take the space for the title back.
p.tight_layout()
p.fig.subplots_adjust(top=0.9)
# add figure level title
p.fig.suptitle('Compression Factor per Dataset', size=16)

In [None]:

# The two generically compressed WKB variants are left out of this figure.
excluded_algorithms = ["WKB: BZIP2", "WKB: GZIP"]
df_stripped = df[~df['Algorithm'].isin(excluded_algorithms)]

sns.set_context("notebook", font_scale=1.2)
p = sns.catplot(
    kind='violin',
    data=df_stripped,
    x='Relative Size',
    y='Algorithm',
    hue="Algorithm",
    col='Dataset',
    col_wrap=2,
    orient='h',
    dodge=False,
    palette='pastel',
    height=3.5,
    aspect=1.5,
    legend=True,
    legend_out=False,
)

axes = p.axes.flat
# The legend is attached to the first panel and moved by the given offsets.
axes[0].legend(bbox_to_anchor=(1.97, -1.57))

for i, ax in enumerate(axes):
    # Only panel 2 keeps its y-axis label; only panel 4 carries an x-axis label.
    if i != 2:
        ax.set_ylabel('')
    ax.set_xlabel('Compression Factor' if i == 4 else '')

    # Keep only the part of the panel title after "Dataset =".
    ax.set_title(ax.get_title().split('=')[-1].strip())

    # Show the x tick labels on every panel, hide the y tick labels.
    ax.tick_params(labelbottom=True, labelleft=False)
    ax.set_xlim([0, 5.5])

    # Vertical grid lines, drawn behind the violins.
    ax.grid(axis='x')
    ax.set_axisbelow(True)

plt.show()

In [None]:
display(df)
pd.options.display.float_format = "{:,.2f}".format

# Sum all per-shape values within every (Dataset, Algorithm) pair and derive
# the overall compression figures from those totals.
df_avg = df.groupby(['Dataset', 'Algorithm']).sum()
total_size = df_avg['Size']
total_wkb_size = df_avg['Wkb Size']
df_avg['Total Comp Factor'] = total_wkb_size / total_size
df_avg['Total Comp Ratio'] = 100 * total_size / total_wkb_size
# NOTE(review): 8388608 == 8 * 1024 * 1024 -- whether that fits the "Kb" in the
# column name depends on the unit of 'Size'; confirm.
df_avg['Size Kb'] = total_size / (8388608)
display(df_avg)

# Print one "&"-separated line per dataset: the dataset name followed by the
# total compression factor of each algorithm, in index order.
res = ""
prev = None
for (dataset_name, _algorithm), totals in df_avg.iterrows():
    if dataset_name != prev:
        # A new dataset starts: emit the finished line of the previous one.
        if prev is not None:
            print(res)
        prev, res = dataset_name, f"{dataset_name} & "
    res += f"{round(totals['Total Comp Factor'], 2)} & "

print(res)

In [97]:

# Average the per-dataset totals for every algorithm. The columns are selected
# with a list: selecting them with a bare tuple of keys is deprecated in
# pandas 1.x and rejected by pandas 2.
df2 = df_avg.groupby("Algorithm")[['Total Comp Factor', "Wkb Size"]].mean().reset_index()
df2 = df2[~df2['Algorithm'].isin(["WKB: BZIP2", "FPDE: Arbitrary Precision"])]
df2 = df2.rename(columns={"Total Comp Factor": "Compression Factor"})
display(df2)

# category_orders puts the two WKB bars first.
fig = px.bar(df2, x="Algorithm", y="Compression Factor", text_auto=".1f", category_orders={"Algorithm": ["WKB", "WKB: GZIP"]})
fig.update_annotations(font=dict(size=16))
fig.update_xaxes(tickfont=dict(size=16))
fig.update_traces(textfont=dict(size=16))
# A single y-axis update: the former first call (tick font size 16) was
# overridden by the second one and had no effect.
fig.update_yaxes(matches=None, showticklabels=True, tickfont=dict(size=20))
fig.update_layout(font=dict(size=18))
fig.show()



Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,Algorithm,Compression Factor,Wkb Size
0,FPD (Baseline),1.63,36735442.8
1,FPDE,2.29,36735442.8
3,FPDE: Entropy Encoding,2.41,36735442.8
4,FPDE: Size Optimized,4.26,36735442.8
5,WKB,1.0,36735442.8
7,WKB: GZIP,1.19,36735442.8
