## Analysis of datasets + stats for compressing with FPDE
This notebook contains methods for passing a .shp-file (or directory) and obtaining stats for the FPDE properties of the compression. Stats include the number of chunks, number of deltas within each chunk, average number of vertices, and a distribution of the overhead within the format.

## Evaluating one shape-file
The methods take one .shp-file and return the stats. Multiple calls can be merged further done if a set of .shp-files are to be analysed.

In [60]:
import jsonlines
import glob
import osmnx as ox
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import folium
import tqdm
import shapely
import random
import os
from shapely.wkt import loads
from algos.fpd_extended_lib.cfg import *
import numpy as np
import re
import seaborn as sns
from algos.alg_fpd_extended import FpdExtended

In [66]:
#df = gpd.read_file('data/ne_10m_admin_1_states_provinces.shp')
#df = gpd.read_file('data/sweden-latest-free/gis_osm_buildings_a_free_1.shp')
#df = gpd.read_file('data/sweden-latest-free/gis_osm_natural_a_free_1.shp')
df = gpd.read_file('data/sweden-latest-free/gis_osm_railways_free_1.shp')

MAX_ITER = -1
ONLY_VERTEX_CNT = True # Can be used if only the amount of vertices in the dataset is required

df = df[df.type != "Point"]
print("Count of entries:", len(df))
shapes = df.geometry

vertex_cnt = []
if ONLY_VERTEX_CNT:
    for s in tqdm.tqdm(shapes[0:MAX_ITER]):
        vertex_cnt.append(shapely.get_num_coordinates(s))

    print("Vertex count (avg, min, max): " + str(round(np.average(vertex_cnt), 2)), np.min(vertex_cnt), np.max(vertex_cnt), sep=', ')

Count of entries: 25885


100%|██████████| 25884/25884 [00:00<00:00, 141248.96it/s]

Vertex count (avg, min, max): 13.72, 2, 448





In [67]:
stats = pd.DataFrame(columns=["No Comp Size", "Comp Size", "Comp Factor", "Vertex Cnt", "Chunk Cnt", "Avg Vertices in Chk", "Min Vertices in Chk", "Max Vertices in Chk"])
stats_distribution = pd.DataFrame()
stats_max_values = pd.DataFrame()
alg = FpdExtended()

for idx, s in enumerate(tqdm.tqdm(shapes[0:MAX_ITER])):
    bin = alg.compress(s)[1]
    wkb_len = len(shapely.to_wkb(s))
    bin_len = len(bin)
    coords_len = shapely.get_num_coordinates(s)

    chks, _, overhead_stats = alg.get_chunks(bin, include_ring_start=False, verbose=True)
    chk_cnt = len(chks)
    chk_lens = list(map(lambda x: len(x), chks))

    stats.loc[len(stats)] = [wkb_len, bin_len, wkb_len / bin_len, coords_len, chk_cnt, np.average(chk_lens), np.min(chk_lens), np.max(chk_lens)]

    # Analysis of space
    max_values, distrb = overhead_stats
    theoretical_size = sum(distrb.values())
    distrb = pd.DataFrame(distrb, index=[idx])
    stats_distribution = pd.concat([stats_distribution, distrb])
    stats_max_values = pd.concat([stats_max_values, pd.DataFrame(max_values, index=[idx])])

    DISPLAY_PER_SHAPE_STATS = False
    if DISPLAY_PER_SHAPE_STATS:
        display(distrb)
        print("Size (calculated, rounded to byte, real):", theoretical_size, f"({((theoretical_size + 7) & (-8))})", bin_len * 8)
        print("Max Values:", dict(max_values))

print("---- AVERAGE COMPRESSED SHAPE ----")
pd.set_option('display.precision', 2)
display(stats.mean())
print("Global: Min Vertices in Chk / Max Vertices in Chk", np.min(stats['Min Vertices in Chk']), np.max(stats['Max Vertices in Chk']))

print("---- DATA DISTRIBUTION ----")
final_distribution = stats_distribution.mean()
display(final_distribution)
display(final_distribution.apply(lambda x: 100 * x / sum(final_distribution)))

print("---- TO SET MANUAL PARAMS ----")
final_max_values = stats_max_values.max()
min_bits = final_max_values.apply(lambda x: required_bits(x))
min_bits = min_bits.add_prefix("Bits ")
display(pd.concat([final_max_values, min_bits]))

100%|██████████| 25884/25884 [01:18<00:00, 330.97it/s]

---- AVERAGE COMPRESSED SHAPE ----





No Comp Size           228.54
Comp Size              109.59
Comp Factor              1.64
Vertex Cnt              13.72
Chunk Cnt                1.66
Avg Vertices in Chk      6.67
Min Vertices in Chk      5.24
Max Vertices in Chk      7.85
dtype: float64

Global: Min Vertices in Chk / Max Vertices in Chk 1.0 16.0
---- DATA DISTRIBUTION ----


Global Header Bitsize       396.84
Chk Deltas Cnt Bitsize        8.31
Full Coordinates Bitsize    106.42
Deltas Bitsize              361.15
dtype: float64

Global Header Bitsize       45.47
Chk Deltas Cnt Bitsize       0.95
Full Coordinates Bitsize    12.19
Deltas Bitsize              41.38
dtype: float64

---- TO SET MANUAL PARAMS ----


Chk Deltas Cnt Max         15
Bits Chk Deltas Cnt Max     4
dtype: int64