## Test Bench
This notebook is used to evaluate the performance of a compression algorithm.

It is written such that a new compression algorithm can be added without altering the bench (just change the input in the config).

In [13]:
# ADD NEW ALGORITHMS HERE
from algos.alg_wkt import Wkt
from algos.alg_wkb import Wkb
from algos.alg_meta_wkb_comp import MetaWkbComp # Wkb with Metadat
from algos.alg_meta_wkt_comp import MetaWktComp # Wkt with Metadat
from algos.alg_wkb_comp import WkbComp
from algos.alg_wkt_comp import WktComp
from algos.alg_fpd import Fpd
from algos.alg_fpd_extended import FpdExtended
from algos.alg_fpd_predictor import FpdPredictor
from algos.fpd_extended_lib.entropy_coder import *


# CONFIG
# Algorithm instance under test; the bench calls its compress/decompress and query methods.
ALG = FpdExtended()
#DATASET_PATH = "data/lund_building_highway.json"
#DATASET_PATH = "data/sweden_processed_whole"
#DATASET_PATH = "data/world.json"
#DATASET_PATH = "data/world_7_dec.json"
#DATASET_PATH = "data/sweden.json"
#DATASET_PATH = "data/latest_export.json"
#DATASET_PATH = "data/latest_export" # Use folder (single files)
# Number of random indexes drawn for the unary and the binary queries.
# -1 means: skip sampling and draw 'max_idx' indexes instead.
NBR_ITER = 100000
# Number of times every measured operation is repeated; the logged time is the average.
N = 5
AVOID_DATASET_LENGTH_CHECK = True # Folder-of-JSON mode only: skip counting the files (slow for very large folders) and assume a fixed maximum index instead

# When True and DATASET_PATH does not end with '.shp', every '*.shp' file inside DATASET_PATH is loaded.
IS_SHP_DIR = True
#DATASET_PATH = "data/china-latest-free/gis_osm_waterways_free_1.shp"
DATASET_PATH = "data/sweden-latest-free"
#DATASET_PATH = 'data/sweden-latest-free/gis_osm_buildings_a_free_1.shp'
# Used sets: ["data/sweden-latest-free/gis_osm_buildings_a_free_1.shp", "data/sweden-latest-free/gis_osm_roads_free_1.shp", "data/china-latest-free/gis_osm_water_a_free_1.shp", "data/ne_10m_admin_1_states_provinces.shp"]

# Directory receiving one compressed file per geometry index (created and emptied on start-up).
COMPR_PATH = "data/testbench_compressed_single"
#COMPR_PATH = "data/testbench_compressed"

# Directory receiving the decompressed (WKB) counterpart of every compressed file (created and emptied on start-up).
DECOMPR_PATH = "data/testbench_decompressed_single"

In [14]:
import time
import os
from enum import Enum
import pandas as pd
from datetime import datetime
import filecmp
import random
import json
import tqdm
from shapely.geometry import shape
import glob
import timeit
import shapely
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd

# Metrics that are logged for every benchmarked geometry (EVAL_TYPE)
ET = Enum(
    'ET',
    [
        'COMP_TIME',
        'DECOMP_TIME',
        'COMP_SIZE',
        'DECOMP_SIZE',
        'VERTICES',
        'TYPE',
        'BOUNDING_BOX',
        'ADD_VERTEX',
        'IS_INTERSECTING',
        'INTERSECTION',
        'IS_BASELINE',
    ],
)

# Make sure both output directories exist and start out empty,
# so files from an earlier run cannot leak into this one.
for out_dir in (COMPR_PATH, DECOMPR_PATH):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for stale_file in glob.glob(out_dir + '/*'):
        os.remove(stale_file)

In [15]:
# Maps an evaluation type to the corresponding function in the algorithm.
# Note that ALG (global variable) can change, so a function is used.
from functools import reduce
import operator

def et_to_func():
    """
    Build the lookup table from evaluation type to the matching operation of the algorithm.

    The table is rebuilt on every call because the global ALG may be replaced between runs.
    """
    alg = ALG  # Resolved at call time, never cached
    return dict((
        (ET.VERTICES, alg.vertices),
        (ET.TYPE, alg.type),
        (ET.BOUNDING_BOX, alg.bounding_box),
        (ET.ADD_VERTEX, alg.add_vertex),
        (ET.IS_INTERSECTING, alg.is_intersecting),
        (ET.INTERSECTION, alg.intersection),
    ))

# Params: "et: EVAL_TYPE, args: argument forwarded to the operation, value_only: dont save entry in log"
def measure(et, args, value_only=False):
    """
    Run the operation belonging to an evaluation type N times and log its average execution time.

    Parameters:
        et: Evaluation type (ET member) selecting the operation via et_to_func().
        args: Single argument forwarded unchanged to the operation. The callers pass either
            one compressed binary or a tuple bundling several arguments.
        value_only: When True the operation is still executed, but no entry is added to
            the 'res' log.

    Returns:
        The value produced by the last execution of the operation (None if N is 0).

    Raises:
        ZeroDivisionError: If N is 0 and an entry is to be logged.
    """
    func = et_to_func()[et]  # The lookup does not change between repetitions
    total_time = 0
    value = None
    for _ in range(N):
        elapsed, value = func(args)
        total_time += elapsed

    # Honour 'value_only': previously a bogus 0.0 entry was logged in that case
    if not value_only:
        res[et].append(total_time / N)
    return value

# Fixed seed: repeated runs draw the same indexes in the same order. Use None for a fresh ordering.
SEED = 123
#SEED = random.randint(0, 9999999999999999)
random.seed(SEED)  # Initialise the random generator

# Measurement log ('results'): one list per index column and one per metric in ET
res = {column: [] for column in ['idx_unary', 'idx_binary', *ET]}


# ----- LOAD DATA FROM DATA SET ------
# Every branch below defines:
#   max_idx             - largest index which may be passed to get_shape_data
#   get_shape_data(idx) - returns the entry of the data set at position 'idx'
if DATASET_PATH.endswith('.json'):
    # Single GeoJSON file: extract the nested 'features' attribute containing the geometries
    with open(DATASET_PATH, 'r') as f:
        data = json.load(f)
    file_df: pd.DataFrame = pd.json_normalize(data, record_path=['features'])
    # Create a dataframe suitable for the WKT format for easy conversion to shapely objects
    df_dataset = pd.DataFrame(
        {'type': file_df['geometry.type'], 'coordinates': file_df['geometry.coordinates']})
    max_idx = len(df_dataset) - 1

    def get_shape_data(idx):
        """Return the row ('type', 'coordinates') at position idx of the loaded GeoJSON file."""
        return df_dataset.iloc[idx]
elif DATASET_PATH.endswith('.shp') or IS_SHP_DIR:
    # One shapefile, or every shapefile found directly inside the directory
    if DATASET_PATH.endswith('.shp'):
        files = [DATASET_PATH]
    else:
        files = glob.glob(DATASET_PATH + '/*.shp')

    geometry_parts = []
    for f in tqdm.tqdm(files, disable=True):
        file_df = gpd.read_file(f)
        print("Count of entries:", len(file_df))
        file_df = file_df[file_df.type != "Point"]  # Entries of type "Point" are not benchmarked
        print("---", DATASET_PATH, "---")
        print("Count of entries (no points):", len(file_df))
        if len(file_df) != 0:
            geometry_parts.append(file_df.geometry)

    if not geometry_parts:
        raise ValueError(f"No non-point geometries found in '{DATASET_PATH}'")

    # Concatenate once; re-concatenating inside the loop copies the accumulated data per file
    all_shapes = pd.concat(geometry_parts, ignore_index=True)
    if NBR_ITER != -1:
        all_shapes = all_shapes.sample(n=NBR_ITER, replace=True)

    # Serve the combined (and possibly sampled) geometries. Previously only the geometries
    # of the last non-empty file were served and 'all_shapes' was never used.
    max_idx = len(all_shapes) - 1

    def get_shape_data(idx):
        """Return the geometry at position idx of the combined shapefile geometries."""
        return all_shapes.iloc[idx]
else:
    # Directory containing one '<idx>.json' GeoJSON file per geometry
    if not AVOID_DATASET_LENGTH_CHECK:
        tree = os.walk(DATASET_PATH, topdown=True)
        max_idx = len(next(tree)[2]) - 1  # Number of files directly inside the directory
    else:
        max_idx = 5000000

    def get_shape_data(idx):
        """Read '<DATASET_PATH>/<idx>.json' and return the row ('type', 'coordinates') of its first feature."""
        with open(f'{DATASET_PATH}/{idx}.json', 'r') as f:
            data = json.load(f)

        file_df: pd.DataFrame = pd.json_normalize(data, record_path=['features'])
        # Create a dataframe suitable for the WKT format for easy conversion to shapely objects
        df_dataset = pd.DataFrame(
             {'type': file_df['geometry.type'], 'coordinates': file_df['geometry.coordinates']})
        return df_dataset.iloc[0]
# ----- /END/ LOAD DATA FROM DATA SET ------

#df

Count of entries: 1065926
--- data/sweden-latest-free ---
Count of entries (no points): 1065926



CRS not set for some of the concatenation inputs. Setting output's CRS as WGS 84 (the single non-null crs provided).



Count of entries: 230823
--- data/sweden-latest-free ---
Count of entries (no points): 0
Count of entries: 2970860
--- data/sweden-latest-free ---
Count of entries (no points): 2970860
Count of entries: 107150
--- data/sweden-latest-free ---
Count of entries (no points): 107150
Count of entries: 779
--- data/sweden-latest-free ---
Count of entries (no points): 0
Count of entries: 298738
--- data/sweden-latest-free ---
Count of entries (no points): 298738
Count of entries: 52222
--- data/sweden-latest-free ---
Count of entries (no points): 0
Count of entries: 1509
--- data/sweden-latest-free ---
Count of entries (no points): 1509
Count of entries: 3472
--- data/sweden-latest-free ---
Count of entries (no points): 3472
Count of entries: 66473
--- data/sweden-latest-free ---
Count of entries (no points): 0
Count of entries: 3761
--- data/sweden-latest-free ---
Count of entries (no points): 3761
Count of entries: 127325
--- data/sweden-latest-free ---
Count of entries (no points): 0
Count 

In [60]:
# Number of queries per kind: NBR_ITER, or max_idx when NBR_ITER is -1
nbr_queries = max_idx if NBR_ITER == -1 else NBR_ITER
# Indexes for the unary operations
unary_idxs = [random.randint(0, max_idx) for _ in range(nbr_queries)]
# Index pairs for the binary operations. # TODO: Better selection than random
binary_idxs = [tuple(random.randint(0, max_idx) for _ in range(2)) for _ in range(nbr_queries)]
random.seed(SEED)  # Reset random

def act_deact_baseline(activate=True):
    """
    Switch the baseline mode in the shared 'cfg' settings.

    Sets cfg.BASELINE_ON to 'activate'. The three DISABLE_OPTIMIZED_* flags are set to True
    when 'activate' is truthy and to False otherwise.
    """
    cfg.BASELINE_ON = activate
    disable_optimized = bool(activate)
    cfg.DISABLE_OPTIMIZED_INTERSECTION = disable_optimized
    cfg.DISABLE_OPTIMIZED_ADD_VERTEX = disable_optimized
    cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = disable_optimized

def compress(data, file_comp):
    """
    Compress one entry of the data set with ALG and write the result to disk.

    Parameters:
        data: Entry returned by get_shape_data. For shapefile input it is used as the geometry
            directly, otherwise it is first converted with shapely's shape().
        file_comp: Path of the file which receives the compressed binary.

    Returns:
        Tuple (t, compressed) exactly as returned by ALG.compress: the reported time and
        the compressed binary.
    """
    if IS_SHP_DIR or DATASET_PATH.endswith('.shp'):
        geometry = data
    else:
        geometry = shape(data)
    t, compressed = ALG.compress(geometry)

    # The context manager closes the file even if the write fails
    with open(file_comp, "wb") as out_file:
        out_file.write(compressed)
    return t, compressed

# Compress files, benchmark unaries
# Both lookups are built once up front:
#  - a set gives constant-time membership tests (scanning the list per element is quadratic)
#  - the index pairs are collapsed into one flat list
unary_idx_set = set(unary_idxs)
binary_idxs_flat = [flat_idx for pair in binary_idxs for flat_idx in pair]

# The full benchmark is executed twice: first with the baseline activated, then without it
for is_baseline in [True, False]:
    act_deact_baseline(is_baseline)
    for idx in tqdm.tqdm(unary_idxs): # List of single idxs
        res[ET.IS_BASELINE].append(is_baseline)
        file_comp = f"{COMPR_PATH}/{idx}"
        #t = timeit.timeit(lambda: compress(get_shape_data(idx), file_comp), number=50)
        t, comp_bin = compress(get_shape_data(idx), file_comp)
        res[ET.COMP_TIME].append(t) # Store delta time
        #res[ET.COMP_SIZE].append(os.stat(file_comp).st_size) # Store compressed file size
        res[ET.COMP_SIZE].append(len(comp_bin))
    for idx in tqdm.tqdm(binary_idxs_flat): # List of pairs collapsed
        # Geometries which are only part of a pair still need a compressed file on disk
        if idx not in unary_idx_set:
            compress(get_shape_data(idx), f"{COMPR_PATH}/{idx}")

    for idx in tqdm.tqdm(unary_idxs):
    # ------------ UNARY ------------ #
        file_comp = f"{COMPR_PATH}/{idx}"
        file_decomp = f"{DECOMPR_PATH}/{idx}"

        res["idx_unary"].append(idx)

        # Read binary into memory
        with open(file_comp, "rb") as f:
            comp_bin = f.read()

        # Decompress
        t, decomp = ALG.decompress(comp_bin)
        #decomp = shapely.to_wkt(decomp, rounding_precision=-1)
        decomp = shapely.to_wkb(decomp)
        # Write to disk
        with open(file_decomp, "wb") as f:
            f.write(decomp)
        res[ET.DECOMP_TIME].append(t) # Store delta time
        res[ET.DECOMP_SIZE].append(os.stat(file_decomp).st_size) # Store decompressed file size

        v = measure(ET.VERTICES, comp_bin)
        measure(ET.TYPE, comp_bin)
        measure(ET.BOUNDING_BOX, comp_bin)
        # Pick an insertion position and a point slightly offset from the vertex at that position.
        # NOTE(review): random.randint raises ValueError if 'v' holds fewer than two vertices.
        add_idx = random.randint(0, len(v) - 2)
        add_point = (round(v[add_idx][0] + random.randint(-25, 25) * 0.00001,7), round(v[add_idx][1] + random.randint(-25, 25) * 0.00001,7))
        if not (cfg.USE_ENTROPY or cfg.COMPRESS_CHUNK):
            measure(ET.ADD_VERTEX, (comp_bin, add_idx, add_point))
        else:
            # Keep the column aligned with the others; pd.DataFrame(res) raises
            # when the lists differ in length.
            res[ET.ADD_VERTEX].append(float("nan"))

    for pair in tqdm.tqdm(binary_idxs):
    # ------------ BINARY ------------ #
        l_idx, r_idx = pair

        res["idx_binary"].append(pair)

        # Read both binary into memory
        with open(f"{COMPR_PATH}/{l_idx}", "rb") as f:
            l_bin = f.read()

        with open(f"{COMPR_PATH}/{r_idx}", "rb") as f:
            r_bin = f.read()

        measure(ET.IS_INTERSECTING, (l_bin, r_bin))
        measure(ET.INTERSECTION, (l_bin, r_bin))
df = pd.DataFrame(res)

100%|██████████| 10000/10000 [00:07<00:00, 1416.56it/s]
100%|██████████| 20000/20000 [00:15<00:00, 1258.55it/s]
100%|██████████| 10000/10000 [00:44<00:00, 222.58it/s]
100%|██████████| 10000/10000 [00:17<00:00, 561.31it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1598.75it/s]
100%|██████████| 20000/20000 [00:16<00:00, 1199.94it/s]
100%|██████████| 10000/10000 [00:12<00:00, 796.99it/s]
100%|██████████| 10000/10000 [00:05<00:00, 1771.48it/s]


In [61]:
# Work on a copy so the raw measurements in 'df' stay untouched
df_relative = df.copy()


def calculate_ratio(x):
    """
    Divide the non-baseline rows of a group by the values of its baseline rows.

    Returns None when the group lacks either baseline or non-baseline rows.
    """
    baseline_flag = x[ET.IS_BASELINE]
    base = x[baseline_flag == True]
    non_base = x[baseline_flag == False]
    if base.empty or non_base.empty:
        return None
    return non_base.divide(base.values, axis=1)
# NOTE(review): presumably shifts the index so that index 0 does not end up as 0/0 in the ratio - confirm
df_relative["idx_unary"] = df_relative["idx_unary"] + 1

# Store the (not yet relative) measurements, tagged with the second path component of the data set
dataset_tag = DATASET_PATH.split("/")[1]
report_stamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
df_relative.to_csv(f"reports/_[{dataset_tag}_df]_{report_stamp}.csv")

# One ratio (non-baseline / baseline) per measured pair of indexes
df_relative = df_relative.groupby(["idx_unary", "idx_binary"]).apply(calculate_ratio)
df_relative = df_relative.reset_index()
if 'level_2' in df_relative.columns:
    df_relative = df_relative.drop(columns='level_2')
df_relative = df_relative.drop(columns=[ET.IS_BASELINE, ET.TYPE, ET.VERTICES])

# Human readable column names used by the reports and plots
column_names = {
    ET.IS_BASELINE: "is_baseline",
    ET.ADD_VERTEX: "Add Vertex",
    ET.IS_INTERSECTING: "Is Intersection",
    ET.INTERSECTION: "Intersection",
    ET.COMP_TIME: "Compress",
    ET.COMP_SIZE: "comp_size",
    ET.DECOMP_TIME: "Decompress",
    ET.DECOMP_SIZE: "decomp_size",
    ET.BOUNDING_BOX: "Bounding Box",
    ET.TYPE: "Type",
    ET.VERTICES: "Verticies",
}
df_relative = df_relative.rename(columns=column_names)


In [95]:

# Show the mean of every metric, split by baseline on/off, for each previously stored report
df1 = pd.DataFrame()
prevs = glob.glob('reports/*')
for prev in prevs:
    # The data set name is the part of the file name between '[' and ']'
    dataset = prev.split("[")[1].split("]")[0]
    prev_df = pd.read_csv(prev)
    prev_df.drop(columns=["idx_binary", "Unnamed: 0", "idx_unary"], inplace=True)
    display(prev_df.groupby(str(ET.IS_BASELINE)).mean())

    prev_df['Dataset'] = dataset
    df1 = pd.concat([prev_df, df1])
# NOTE(review): 'df1' is assembled above but 'df' is what gets displayed - confirm which one is intended
display(df)


Unnamed: 0_level_0,ET.COMP_TIME,ET.DECOMP_TIME,ET.COMP_SIZE,ET.DECOMP_SIZE,ET.VERTICES,ET.TYPE,ET.BOUNDING_BOX,ET.ADD_VERTEX,ET.IS_INTERSECTING,ET.INTERSECTION
ET.IS_BASELINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,0.000301,0.000104,222.628283,211.0072,6.1e-05,5.275209e-07,3e-06,2.7e-05,8e-06,1.9e-05
True,0.000299,0.000103,137.16695,211.0072,5.8e-05,5.572111e-07,8.7e-05,0.000574,0.000121,0.000131


Unnamed: 0_level_0,ET.COMP_TIME,ET.DECOMP_TIME,ET.COMP_SIZE,ET.DECOMP_SIZE,ET.VERTICES,ET.TYPE,ET.BOUNDING_BOX,ET.ADD_VERTEX,ET.IS_INTERSECTING,ET.INTERSECTION
ET.IS_BASELINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,0.000642,0.000234,600.3489,683.6272,0.000176,5.19528e-07,3e-06,3.5e-05,8e-06,1.8e-05
True,0.000562,0.000218,440.7425,683.6272,0.000165,5.150722e-07,0.0002,0.000962,0.000381,0.000394


Unnamed: 0_level_0,ET.COMP_TIME,ET.DECOMP_TIME,ET.COMP_SIZE,ET.DECOMP_SIZE,ET.VERTICES,ET.TYPE,ET.BOUNDING_BOX,ET.ADD_VERTEX,ET.IS_INTERSECTING,ET.INTERSECTION
ET.IS_BASELINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
False,0.003168,0.001236,4291.1279,4587.6042,0.000979,6.624584e-07,3e-06,6.1e-05,9e-06,2.4e-05
True,0.002866,0.001205,3371.8159,4587.6042,0.000964,6.873019e-07,0.001138,0.004237,0.002394,0.002349


Unnamed: 0,Dataset,Statistic,Seconds,Compress,Decompress,Bounding Box,Add Vertex,Is Intersection,Intersection
0,China Water,Add Vertex,4.581278,,,,,,
1,China Water,Bounding Box,3.378239,,,,,,
2,China Water,Compress,126.911883,,,,,,
3,China Water,Decompress,114.178360,,,,,,
4,China Water,Intersection,11.032883,,,,,,
...,...,...,...,...,...,...,...,...,...
9995,Country Borders,,,0.929239,0.923350,0.019999,0.070859,0.001162,0.002681
9996,Country Borders,,,2.127099,0.880218,0.031219,0.053038,0.009597,0.014263
9997,Country Borders,,,0.951946,1.128949,0.030739,0.091167,0.006172,0.022035
9998,Country Borders,,,1.143939,0.245358,0.019296,0.026636,0.000809,0.001690


### Plot the Results

In [99]:
import plotly.express as px

SHOW_PREVIOUS = True  # NOTE(review): not read anywhere in this cell
#dataset_name = os.path.splitext(os.path.basename(DATASET_PATH))[0]
#times = df_relative.drop(axis=1, labels=['idx_unary', 'idx_binary', "comp_size", "decomp_size"])
#times['Dataset'] = dataset_name

# Collect the timing columns of every exported report; the data set name is the
# part of the file name between '[' and ']'.
report_frames = []
for prev in glob.glob('reports2/*'):
    dataset = prev.split("[")[1].split("]")[0]
    prev_df = pd.read_csv(prev)
    prev_df['Dataset'] = dataset
    report_frames.append(
        prev_df.drop(axis=1, labels=['idx_unary', 'idx_binary', 'comp_size', 'decomp_size', 'Unnamed: 0']))
# Concatenate once instead of growing the frame inside the loop
times = pd.concat(report_frames) if report_frames else pd.DataFrame()

# Long format: one row per (Dataset, Statistic) observation
times = times.melt('Dataset', var_name="Statistic", value_name='Seconds')
times['Statistic'] = times['Statistic'].map(str)

# Average per data set and operation, expressed in percent
times = times.groupby(["Dataset", "Statistic"]).mean().reset_index()
times["Seconds"] *= 100

fig = px.bar(times, x="Statistic", y='Seconds', color="Dataset", barmode="group", log_y=False, text="Seconds",
            title="Relative Execution Time for Different Operations and Datasets",
            # The data set names must match the keys of the colour map; an entry which
            # matches no data set (previously "China Waters") has no effect on the ordering
            category_orders={"Statistic": ["Compress", "Decompress", "Intersection", "Is Intersection", "Add Vertex", "Bounding Box"],
                             "Dataset": ["Sweden All", "China Water", "Country Borders"]},
            color_discrete_map={
    'Sweden All': 'rgb(169,234,184)',
    'China Water': 'rgb(255,183,180)',
    'Country Borders': 'rgb(254,198,161)'
})

fig.update_yaxes(title_text="Relative Execution time (%)", tickfont=dict(size=18), title_font=dict(size=22))
fig.update_xaxes(title_text="Operation", title_font=dict(size=16), tickfont=dict(size=18))
#fig.update_annotations(font=dict(size=16))

fig.update_traces(texttemplate='%{text:.1f}',textfont=dict(size=17), textposition='outside')
fig.update_layout(legend=dict(
                            yanchor="top",
                            y=1.44,
                            xanchor="left",
                            x=0.83,
                            orientation="v",
                            font=dict(size=16),
                        ),
                  title={"font": {'size': 24}},
                  title_y=0.86,
                  yaxis_range=[0,160],
                  font_family="Arial")

fig.show()

### Export to CSV

In [64]:
# Export the relative results; the file name carries the algorithm class name, the data set tag and a timestamp
alg_name = ALG.__class__.__name__
export_stamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
df_relative.to_csv(f"reports2/{alg_name}_[Sweden_All2]_{export_stamp}.csv")