## Analysis of datasets + stats for compressing with FPDE
This notebook contains methods for passing a .shp-file (or directory) and obtaining stats for the FPDE properties of the compression. Stats include the number of chunks, number of deltas within each chunk, average number of vertices, and a distribution of the overhead within the format.

## Evaluating one shape-file
The methods take one .shp-file and return the stats. Multiple calls can be merged further done if a set of .shp-files are to be analysed.

In [10]:
import jsonlines
import glob
import osmnx as ox
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import folium
import tqdm
import shapely
import random
import os
from shapely.wkt import loads
from algos.fpd_extended_lib.cfg import *
import numpy as np
import re
import seaborn as sns
from algos.alg_fpd_extended import FpdExtended

In [11]:
#df = gpd.read_file('data/ne_10m_admin_1_states_provinces.shp')
#df = gpd.read_file('data/sweden-latest-free/gis_osm_buildings_a_free_1.shp')
#df = gpd.read_file('data/sweden-latest-free/gis_osm_natural_a_free_1.shp')
MAX_ITER = -1

def load_shp(file_name, DO_VERTEX_CNT):
    df = gpd.read_file(file_name)
    print("Count of entries:", len(df))
    df = df[df.type != "Point"]
    print("---", file_name, "---")
    print("Count of entries (no points):", len(df))
    shapes = df.geometry

    vertex_cnt = []
    if DO_VERTEX_CNT:
        for s in tqdm.tqdm(shapes[0:MAX_ITER]):
            vertex_cnt.append(shapely.get_num_coordinates(s))

        vertex_cnt = np.array(vertex_cnt)
        if len(vertex_cnt) != 0:
            res = f"Total: {len(vertex_cnt)}, avg: {round(np.average(vertex_cnt), 2)}, std: {round(np.std(vertex_cnt), 2)}, median: {np.median(vertex_cnt)}, min: {np.min(vertex_cnt)}, max: {np.max(vertex_cnt)}, #>=25: {(vertex_cnt >= 25).sum()} ({round(100 * (vertex_cnt >= 25).sum() / len(vertex_cnt), 2)}), #>=100: {(vertex_cnt >= 100).sum()} ({round(100 * (vertex_cnt >= 100).sum() / len(vertex_cnt), 2)}), #>=200: {(vertex_cnt >= 200).sum()} ({round(100 * (vertex_cnt >= 200).sum() / len(vertex_cnt), 2)}), #>=500: {(vertex_cnt >= 500).sum()} ({round(100 * (vertex_cnt >= 500).sum() / len(vertex_cnt), 2)}), #>=1000: {(vertex_cnt >= 1000).sum()} ({round(100 * (vertex_cnt >= 1000).sum() / len(vertex_cnt), 2)})"
            print(res)
            with open('vertex_cnt_for_datasets.txt', 'a') as f:
                f.write(f"{file_name}\n{res}\n\n")
        else:
            print("No geometries (except Point(s)) in dataset.")
    return shapes

In [12]:
def fpde_analyze(shapes):
    stats = pd.DataFrame(columns=["No Comp Size", "Comp Size", "Comp Factor", "Vertex Cnt", "Chunk Cnt", "Avg Vertices in Chk", "Min Vertices in Chk", "Max Vertices in Chk"])
    stats_distribution = pd.DataFrame()
    stats_max_values = pd.DataFrame()
    alg = FpdExtended()

    for idx, s in enumerate(tqdm.tqdm(shapes[0:MAX_ITER])):
        bin = alg.compress(s)[1]
        wkb_len = len(shapely.to_wkb(s))
        bin_len = len(bin)
        coords_len = shapely.get_num_coordinates(s)

        chks, _, overhead_stats = alg.get_chunks(bin, include_ring_start=False, verbose=True)
        chk_cnt = len(chks)
        chk_lens = list(map(lambda x: len(x), chks))

        stats.loc[len(stats)] = [wkb_len, bin_len, wkb_len / bin_len, coords_len, chk_cnt, np.average(chk_lens), np.min(chk_lens), np.max(chk_lens)]

        # Analysis of space
        max_values, distrb = overhead_stats
        theoretical_size = sum(distrb.values())
        distrb = pd.DataFrame(distrb, index=[idx])
        stats_distribution = pd.concat([stats_distribution, distrb])
        stats_max_values = pd.concat([stats_max_values, pd.DataFrame(max_values, index=[idx])])

        DISPLAY_PER_SHAPE_STATS = False
        if DISPLAY_PER_SHAPE_STATS:
            display(distrb)
            print("Size (calculated, rounded to byte, real):", theoretical_size, f"({((theoretical_size + 7) & (-8))})", bin_len * 8)
            print("Max Values:", dict(max_values))

    print("---- AVERAGE COMPRESSED SHAPE ----")
    pd.set_option('display.precision', 2)
    display(stats.mean())
    print("Global: Min Vertices in Chk / Max Vertices in Chk", np.min(stats['Min Vertices in Chk']), np.max(stats['Max Vertices in Chk']))

    print("---- DATA DISTRIBUTION ----")
    final_distribution = stats_distribution.mean()
    display(final_distribution)
    display(final_distribution.apply(lambda x: 100 * x / sum(final_distribution)))

    print("---- TO SET MANUAL PARAMS ----")
    final_max_values = stats_max_values.max()
    min_bits = final_max_values.apply(lambda x: required_bits(x))
    min_bits = min_bits.add_prefix("Bits ")
    display(pd.concat([final_max_values, min_bits]))

In [18]:
DATASET = 'data/sweden-latest-free/gis_osm_railways_free_1.shp'
#DATASET = 'data/sweden-latest-free'
#DATASET = 'data/sweden-latest-free/gis_osm_natural_free_1.shp'
DATASET = "data/new-york-latest-free"
ONLY_VERTEX_CNT = True # Can be used if only the amount of vertices in the dataset is required

if DATASET.endswith(".shp"):
    files = [DATASET]
else:
    files = glob.glob(DATASET + '/*.shp')

for f in tqdm.tqdm(files):
    shapes = load_shp(f, ONLY_VERTEX_CNT)
    if not ONLY_VERTEX_CNT:
        fpde_analyze(shapes)

  0%|          | 0/18 [00:00<?, ?it/s]

Count of entries: 131910
--- data/new-york-latest-free/gis_osm_landuse_a_free_1.shp ---
Count of entries (no points): 131910


100%|██████████| 131909/131909 [00:01<00:00, 110769.80it/s]
  6%|▌         | 1/18 [00:28<08:06, 28.62s/it]

Total: 131909, avg: 31.9, std: 125.63, median: 16.0, min: 4, max: 16897, #>=25: 39826 (30.19), #>=100: 5861 (4.44), #>=200: 2073 (1.57), #>=500: 501 (0.38), #>=1000: 156 (0.12)
Count of entries: 57458
--- data/new-york-latest-free/gis_osm_natural_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 11%|█         | 2/18 [00:35<04:10, 15.68s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 4056806
--- data/new-york-latest-free/gis_osm_buildings_a_free_1.shp ---
Count of entries (no points): 4056806


100%|██████████| 4056805/4056805 [00:59<00:00, 67798.66it/s]


Total: 4056805, avg: 7.39, std: 5.15, median: 5.0, min: 4, max: 1031, #>=25: 38931 (0.96), #>=100: 596 (0.01), #>=200: 114 (0.0), #>=500: 20 (0.0), #>=1000: 13 (0.0)


 17%|█▋        | 3/18 [13:38<1:31:31, 366.08s/it]

Count of entries: 35680
--- data/new-york-latest-free/gis_osm_waterways_free_1.shp ---
Count of entries (no points): 35680


100%|██████████| 35679/35679 [00:00<00:00, 104525.89it/s]
 22%|██▏       | 4/18 [13:48<52:38, 225.60s/it]  

Total: 35679, avg: 33.33, std: 78.64, median: 9.0, min: 2, max: 1772, #>=25: 10254 (28.74), #>=100: 2776 (7.78), #>=200: 1082 (3.03), #>=500: 196 (0.55), #>=1000: 24 (0.07)
Count of entries: 4318
--- data/new-york-latest-free/gis_osm_pofw_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 28%|██▊       | 5/18 [13:49<31:17, 144.45s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 41358
--- data/new-york-latest-free/gis_osm_water_a_free_1.shp ---
Count of entries (no points): 41358


100%|██████████| 41357/41357 [00:00<00:00, 109553.50it/s]
 33%|███▎      | 6/18 [13:58<19:42, 98.51s/it] 

Total: 41357, avg: 51.97, std: 606.8, median: 20.0, min: 4, max: 90117, #>=25: 16453 (39.78), #>=100: 2971 (7.18), #>=200: 1246 (3.01), #>=500: 354 (0.86), #>=1000: 147 (0.36)
Count of entries: 14969
--- data/new-york-latest-free/gis_osm_transport_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 39%|███▉      | 7/18 [14:00<12:15, 66.87s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 1452
--- data/new-york-latest-free/gis_osm_places_a_free_1.shp ---
Count of entries (no points): 1452


100%|██████████| 1451/1451 [00:00<00:00, 91720.57it/s]
 44%|████▍     | 8/18 [14:00<07:37, 45.75s/it]

Total: 1451, avg: 177.83, std: 1077.76, median: 88.0, min: 4, max: 40109, #>=25: 1151 (79.32), #>=100: 667 (45.97), #>=200: 334 (23.02), #>=500: 61 (4.2), #>=1000: 12 (0.83)
Count of entries: 1064
--- data/new-york-latest-free/gis_osm_natural_a_free_1.shp ---
Count of entries (no points): 1064


100%|██████████| 1063/1063 [00:00<00:00, 91089.24it/s]
 50%|█████     | 9/18 [14:00<04:43, 31.51s/it]

Total: 1063, avg: 41.07, std: 68.04, median: 20.0, min: 4, max: 1247, #>=25: 458 (43.09), #>=100: 100 (9.41), #>=200: 30 (2.82), #>=500: 2 (0.19), #>=1000: 1 (0.09)
Count of entries: 6741
--- data/new-york-latest-free/gis_osm_places_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 56%|█████▌    | 10/18 [14:01<02:56, 22.04s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 4020
--- data/new-york-latest-free/gis_osm_pofw_a_free_1.shp ---
Count of entries (no points): 4020


100%|██████████| 4019/4019 [00:00<00:00, 109632.01it/s]
 61%|██████    | 11/18 [14:02<01:48, 15.51s/it]

Total: 4019, avg: 14.24, std: 10.59, median: 11.0, min: 5, max: 139, #>=25: 473 (11.77), #>=100: 4 (0.1), #>=200: 0 (0.0), #>=500: 0 (0.0), #>=1000: 0 (0.0)
Count of entries: 88722
--- data/new-york-latest-free/gis_osm_pois_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 67%|██████▋   | 12/18 [14:12<01:23, 13.88s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 42206
--- data/new-york-latest-free/gis_osm_traffic_a_free_1.shp ---
Count of entries (no points): 42206


100%|██████████| 42205/42205 [00:00<00:00, 110995.35it/s]
 72%|███████▏  | 13/18 [14:19<00:59, 11.88s/it]

Total: 42205, avg: 12.4, std: 25.64, median: 8.0, min: 4, max: 1815, #>=25: 3019 (7.15), #>=100: 283 (0.67), #>=200: 109 (0.26), #>=500: 21 (0.05), #>=1000: 2 (0.0)
Count of entries: 263036
--- data/new-york-latest-free/gis_osm_traffic_free_1.shp ---
Count of entries (no points): 0


0it [00:00, ?it/s]
 78%|███████▊  | 14/18 [14:51<01:11, 17.77s/it]

No geometries (except Point(s)) in dataset.
Count of entries: 21132
--- data/new-york-latest-free/gis_osm_railways_free_1.shp ---
Count of entries (no points): 21132


100%|██████████| 21131/21131 [00:00<00:00, 103764.39it/s]
 83%|████████▎ | 15/18 [14:54<00:40, 13.45s/it]

Total: 21131, avg: 8.91, std: 16.13, median: 4.0, min: 2, max: 526, #>=25: 1437 (6.8), #>=100: 111 (0.53), #>=200: 20 (0.09), #>=500: 1 (0.0), #>=1000: 0 (0.0)
Count of entries: 586
--- data/new-york-latest-free/gis_osm_transport_a_free_1.shp ---
Count of entries (no points): 586


100%|██████████| 585/585 [00:00<00:00, 92458.66it/s]
 89%|████████▉ | 16/18 [14:54<00:18,  9.44s/it]

Total: 585, avg: 22.29, std: 42.44, median: 11.0, min: 4, max: 492, #>=25: 128 (21.88), #>=100: 16 (2.74), #>=200: 6 (1.03), #>=500: 0 (0.0), #>=1000: 0 (0.0)
Count of entries: 120012
--- data/new-york-latest-free/gis_osm_pois_a_free_1.shp ---
Count of entries (no points): 120012


100%|██████████| 120011/120011 [00:01<00:00, 106622.53it/s]
 94%|█████████▍| 17/18 [15:19<00:13, 13.91s/it]

Total: 120011, avg: 12.64, std: 23.13, median: 9.0, min: 4, max: 4095, #>=25: 7525 (6.27), #>=100: 540 (0.45), #>=200: 131 (0.11), #>=500: 25 (0.02), #>=1000: 5 (0.0)
Count of entries: 1414963
--- data/new-york-latest-free/gis_osm_roads_free_1.shp ---
Count of entries (no points): 1414963


100%|██████████| 1414962/1414962 [00:11<00:00, 127227.06it/s]
100%|██████████| 18/18 [20:10<00:00, 67.26s/it]

Total: 1414962, avg: 8.82, std: 15.32, median: 5.0, min: 2, max: 1623, #>=25: 84106 (5.94), #>=100: 6420 (0.45), #>=200: 1043 (0.07), #>=500: 54 (0.0), #>=1000: 7 (0.0)



