#### Imports

In [1]:
import numpy as np
import shapely
import bisect
import shapely.wkt
import matplotlib.pyplot as plt
import math
import geopandas as gpd
import json
import pandas as pd
import seaborn as sns
from enum import Enum
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import intersection.first_bin_search
import intersection.chunk_bbox_intersection
from bench_utils import parse_intersection_data 
common_bbox = intersection.chunk_bbox_intersection.common_bbox
chunk_bbox_is_intersecting = intersection.chunk_bbox_intersection.is_intersecting
chunk_bbox_intersection = intersection.chunk_bbox_intersection.intersection



from intersection.plotting import *
import algos.fpd_extended_lib.cfg as cfg
from algos.alg_fpd_extended import FpdExtended


pd.options.mode.chained_assignment = None  # default='warn'

#### Import data

In [44]:
fpd = FpdExtended()

# Geometries
manual_data, _ = parse_intersection_data("manual")
special_cases, _ = parse_intersection_data("latest_export.json", strip_precision=True)
world_data, world_data_stats = parse_intersection_data("world.json",100, strip_precision=True)
lund_data, lund_data_stats = parse_intersection_data("lund.json",1000)
sweden_data, sweden_data_stats = parse_intersection_data("sweden-places-a.json",100)
new_york_data, new_york_data_stats = parse_intersection_data("new-york-natural.json",1000)
waterways_data, waterways_data_stats = parse_intersection_data("nord-est-waterways.json",1000)

datasets = [
    (manual_data, 3),
    (special_cases, 3),
    (world_data, 15),
    (lund_data, 3),
    (sweden_data, 15),
    (new_york_data, 10),
    (waterways_data, 3)
]


#### Constants

In [45]:
# Number of total iterations = NBR_ITER * N * number_of_datasets
NBR_ITER = -1 # Number of iterations for each dataset
N = 1 # Number of times to recalculate each geometry for reliable mean value

# Max DELTA_SIZE_VALUE in MAX_NUM_DELTAS Analysis
MAX_DELTA_SIZE = 100

#Geometry classification sizes
MEDIUM_THRESHOLD = 50
LARGE_THRESHOLD = 100


SAVE_DFS = True
USE_EXISTING_DF = True

## Functions Section

#### Config Setter Functions

In [46]:
def set_max_num_delta(val):
        cfg.MAX_NUM_DELTAS = val
        cfg.D_CNT_SIZE = cfg.required_bits(val)

def act_deact_baseline(activate=True):
        cfg.BASELINE_ON = activate
        cfg.DISABLE_OPTIMIZED_INTERSECTION = False if not activate else True
        cfg.DISABLE_OPTIMIZED_ADD_VERTEX = False if not activate else True
        cfg.DISABLE_OPTIMIZED_BOUNDING_BOX = False if not activate else True

#### Stats Collector Functions

In [47]:
def get_size_category(size):
    if size < LARGE_THRESHOLD and size >= MEDIUM_THRESHOLD:
            return "Medium", 1
    elif size >= LARGE_THRESHOLD:
         return "Large", 2
    else:
         return "Small", 0
    
def get_intersection_category(size1, size2):
    size1_cat, type1 = get_size_category(size1)
    size2_cat, type2 = get_size_category(size2)

    if type1 <= type2:
        return size1_cat + "/" + size2_cat
    return size2_cat + "/" + size1_cat


def get_context_category(g1, g2, bins=None):
    if bins == None:
        _, b1_fpde = fpd.compress(g1)
        _, b2_fpde = fpd.compress(g2) 
    else:
        b1_fpde, b2_fpde = bins

    bbox = common_bbox((b1_fpde, b2_fpde))
    intersects = shapely.intersects(g1, g2)
    contains = shapely.contains(g1, g2) or shapely.contains(g2, g1)
    if not bbox:
        return "No Overlap (FALSE)"
    elif not intersects:
        return "Overlap (FALSE)"
    elif not contains:
        return "Partial Overlap (TRUE)"
    else:
        return "Fully Inside Other (TRUE)"

def get_category_stat(g1, g2):
        s1, s2 = (shapely.get_num_coordinates(g1), shapely.get_num_coordinates(g2))
        size_category = get_intersection_category(s1, s2)
        context_category = get_context_category(g1, g2)
        return [size_category, context_category]


#For taking mean of all stats, not just total time
def get_time_stat(bins, predicate=False):
    alg = chunk_bbox_is_intersecting if predicate else chunk_bbox_intersection
    # 'decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time'
    mean_stat = [0, 0, 0, 0]
    for _ in range(N):
        stat, _ = alg(bins, get_stats=True)
        mean_stat = [sum(x) for x in zip(mean_stat, stat)]
    
    return [num / N for num in mean_stat]


#### Dataset Evaluation

In [48]:
def evaluate_dataset(data, stats_df, delta_size=None, not_predicate=False, add_size=False):
    for idx, (g1, g2) in enumerate(data):
        if delta_size != None:
            set_max_num_delta(delta_size)
        bins = (fpd.compress(g1)[1], fpd.compress(g2)[1])

        for is_predicate in [False] if not_predicate else [True, False]:
            stats = []
            stats.extend(get_time_stat(bins=bins, predicate=is_predicate))
            stats.extend(get_category_stat(g1, g2))
            stats.append(delta_size if delta_size != None else cfg.MAX_NUM_DELTAS)
            stats.append(is_predicate)
            stats.append(idx)
            stats.append(cfg.BASELINE_ON)
            if add_size:
                stats.append(len(bins[0]) + len(bins[1]))
            stats_df.loc[len(stats_df)] = stats

# Statistics Visualization

#### Best MAX_NUM_DELTAS per Dataset Analysis

In [49]:
from tqdm import tqdm

if USE_EXISTING_DF:
    max_deltas_df = pd.DataFrame(columns=['decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time', "sizes", "context", "delta_size", "predicate", "dataset_idx", "baseline", "sum_size"])
    for max_delta_size in tqdm(range(0,100)):
        for dataset in [val[0] for val in datasets]:
            act_deact_baseline(activate=False)        
            evaluate_dataset(data=dataset, stats_df=max_deltas_df, delta_size=max_delta_size, not_predicate=True, add_size=True) 
    
    #For baseline
    for dataset in [val[0] for val in datasets]:
        act_deact_baseline(activate=True)        
        evaluate_dataset(data=dataset, stats_df=max_deltas_df, delta_size=100000, not_predicate=True, add_size=True) 

    max_deltas_df.to_csv('max_deltas_df3.csv', index=False)
else:
    max_deltas_df = pd.read_csv('max_deltas_df2.csv')

  7%|▋         | 7/100 [02:16<30:19, 19.57s/it]


KeyboardInterrupt: 

In [34]:
max_deltas_filt_df = max_deltas_df[['total_time', 'sum_size', "sizes", "delta_size", "predicate", "baseline"]]

### Helper

In [35]:

import plotly.graph_objs as objs

def make_general_facet_title(fig, x_title, y_title, secondary=None, barplot=False, hbarplot=False):
    #Remove current axis titles
    for axis in fig.layout:
        if type(fig.layout[axis]) == objs.layout.XAxis:
            fig.layout[axis].title.text = ''

    for axis in fig.layout:
        if type(fig.layout[axis]) == objs.layout.YAxis:
            fig.layout[axis].title.text = '' 

    fig.add_annotation(
    dict(
        x=0.48 if not barplot else 0.5,
        y=-0.04 if not barplot else -0.15,  # adjust as needed
        showarrow=False,
        text=x_title,
        xref="paper",
        yref="paper",
        font=dict(size=20),
        xanchor="center", 
        yanchor="top"
    )
    )

    if secondary != None:
        fig.add_annotation(
        dict(
            x=1.01,  # adjust as needed
            y=0.5,
            showarrow=False,
            text=secondary,
            textangle=-90,  # to rotate the text for the vertical y-axis
            xref="paper",
            yref="paper",
            font=dict(size=20),
            xanchor="center",
            yanchor="middle"
        )
    )

    fig.add_annotation(
    dict(
        x=-0.08 if not barplot else -0.07,  # adjust as needed
        y=0.5,
        showarrow=False,
        text=y_title,
        textangle=-90,
        xref="paper",
        yref="paper",
        font=dict(size=20),
        xanchor="center",
        yanchor="middle"
    )
)

In [36]:
def plot_max_delta_size_speed(df, is_predicate=False, no_mixed=False, relative=False):
    if no_mixed:
        df = df[df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"])]
    specs_element = [{"secondary_y": True}] if no_mixed else [{"secondary_y": True}, {"secondary_y": True}]

    df = df[df['predicate'] == is_predicate]
    df = df.groupby(['delta_size', 'sizes', 'baseline']).mean()
    df.reset_index(inplace=True)
    
    sizes = dict(zip(list(df.sizes.unique()), range(len(df.sizes.unique()))))

    fig = make_subplots(rows=3, cols=1 if no_mixed else 2, 
                        subplot_titles=(list(map(lambda x: "Geometry Sizes=" + x.split('/')[0], sizes.keys()))), 
                        horizontal_spacing=0.08, 
                        vertical_spacing=0.07, 
                        #x_title="Max Chunk Size",  y_title="Mean Execution Time (s)", 
                        specs=[specs_element,
                              specs_element,
                              specs_element])
    
    #For not showing the same legend twice
    legend_showed = False

    for size in sizes.keys():
        curr_context_df = df[(df['sizes'] == size)]

        for is_baseline in [False] if relative else [True, False]:

            line = dict(color='blue' if is_baseline else 'red', dash='dash' if is_baseline else None)

            curr_df = curr_context_df[curr_context_df['baseline'] == is_baseline]

            if relative:
                baseline_formatting = "FPDE" if not is_baseline else "Baseline"
                baseline_value = curr_context_df[(curr_context_df['baseline'] == True) & (curr_context_df['delta_size'] == 100000)]
                curr_df['total_time'] = curr_df['total_time'].divide(baseline_value.total_time.values[0], axis=0)
                

            curr_df["total_time"] = curr_df.sort_values(by='delta_size')['total_time'].rolling(window=5, min_periods=1).mean()

            fig.add_trace(go.Scatter(x=curr_df.delta_size.values, y= curr_df.total_time.values,
                                    mode='lines',
                                    line=line,
                                    showlegend=not legend_showed,
                                    name=f"{baseline_formatting}"),col=sizes[size] // 3 + 1 if no_mixed else 1, row=sizes[size] % 3 + 1,
                                    secondary_y=False)
            
            if not is_baseline:
                fig.add_trace(go.Scatter(x=curr_df.delta_size.values, y=curr_df.sum_size.values,
                                        mode='lines',
                                        line=dict(color='green', dash='dash'),
                                        showlegend=not legend_showed,
                                        name=f"Average size (bits)"),col=sizes[size] // 3 + 1 if no_mixed else 1, row=sizes[size] % 3 + 1,
                                        secondary_y=True)
                legend_showed = True

                mix_delta_size = curr_df.sort_values(by='total_time').delta_size.values[0]

                fig.add_vrect(x0=mix_delta_size - 1, x1=mix_delta_size + 1, col=sizes[size] // 3 + 1 if no_mixed else 1, row=sizes[size] % 3 + 1,
                annotation_text=f"Min={mix_delta_size}", annotation_position="inside",
                fillcolor="green", opacity=0.25, line_width=0, annotation = dict(textangle=-90, font=dict(size=16)))
    
    #For changing facet size
    # for annotation in fig['layout']['annotations']:
    #     annotation['font']['size'] = 18  # Set your desired font size here

    intersection_format = "Intersection" if not is_predicate else "IsIntersection"
    baseline_format = "" if not is_baseline else "(Baseline)"
    fig.update_layout(
                      title_x=0.5,
                      width=1000, 
                      height=1000, 
                      legend=dict(
                            yanchor="top",
                            y=-0.037,
                            xanchor="left",
                            x=0.7,
                            orientation="h",
                            font=dict(size=15),
                        ),
                        title=dict(
                            text=f'Relative Execution Time for {intersection_format} by Max Chunk Size {baseline_format}',
                            x=0.5,
                            font=dict(size=22)
                            )
                        )

    #fig.update_annotations(textangle=-90, font=dict(size=14)) 
    fig.update_annotations(font=dict(size=16))
    fig.update_yaxes(secondary_y=False, type="log", tickfont=dict(size=16))
    fig.update_xaxes(tickfont=dict(size=16))

    make_general_facet_title(fig, "Max Chunk Size", "Relative Mean Execution Time (%)", secondary="Mean Size (bits)")
   

    fig.show()

In [37]:
plot_max_delta_size_speed(max_deltas_filt_df, is_predicate=False, no_mixed=True, relative=True)

#### Dataset Stats Collection

In [8]:
if USE_EXISTING_DF:
    stats_df = pd.DataFrame(columns=['decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time', "sizes", "context", "delta_size", "predicate", "dataset_idx", "baseline"])
    for baseline_on in [True, False]:
        act_deact_baseline(activate=baseline_on)
        for dataset in tqdm(datasets):
            evaluate_dataset(dataset[0], stats_df, delta_size=dataset[1]  if not baseline_on else 10000) # Second param: PLOT?
    stats_df.to_csv('stats_df.csv', index=False)

else:
    stats_df = pd.read_csv('stats_df.csv')

100%|██████████| 7/7 [01:13<00:00, 10.53s/it]
100%|██████████| 7/7 [00:45<00:00,  6.56s/it]


#### Total Execution Time by Context

##### Bars

In [12]:
def plot_total_time_bars(df, is_predicate=False, size_cat_excl=[], log_scale=True):
    df = df[(df['predicate'] == is_predicate)] #(df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"]))]

    df = df.groupby(['context', 'baseline', 'sizes']).mean()
    df.reset_index(inplace=True)
    baseline_df = df[df['baseline']]
    not_baseline_df = df[~df['baseline']]
    baseline_df.sort_values(by=['context', 'sizes'])
    not_baseline_df.sort_values(by=['context', 'sizes'])
    not_baseline_df['total_time_ratio'] = not_baseline_df['total_time'].values / baseline_df['total_time'].values
    df = not_baseline_df



    df.loc[~df['baseline'], 'decomp_ratio'] = df.loc[~df['baseline'], 'total_time_ratio'] *  df.loc[~df['baseline'], 'decomp'] / df.loc[~df['baseline'], 'total_time']    
    df['total_time_ratio'] = df.loc[~df['baseline'], 'total_time_ratio'] - df.loc[~df['baseline'], 'decomp_ratio']
    df = df[df['baseline'] == False]
    intersection_formatting = "Intersection" if not is_predicate else "Is_Intersection"
    df = pd.concat([df] * 2, ignore_index=True)

    new_column_values = (["Decompression"] * (len(df) // 2))
    new_column_values.extend((["Intersection"] * (len(df) // 2)))

    df['Stage'] = new_column_values
    df['time'] = df.apply(lambda x: x['decomp_ratio'] if x["Stage"] == "Decompression" else x['total_time_ratio'], axis=1)
    df = df[['time', 'context', 'sizes', 'Stage']]
    fig = px.bar(df, 
                       x="context", 
                       y="time", 
                       facet_col="sizes", 
                       color="Stage",
                       barmode='group',
                       log_y=log_scale, 
                       facet_col_spacing=0.1, 
                       #histfunc="avg", 
                       title=f"Execution Time for {intersection_formatting} in Different Contexts & Sizes",
                       labels=dict(sizes="Geometry Sizes", context="Context", baseline="Baseline", Stage="Stage: "),
                       facet_col_wrap=3,
                       facet_row_spacing=0.1,
                       height=1000,
                       width=1000,
                       pattern_shape='Stage',
                       #text="time"
                       )

    make_general_facet_title(fig, "Bounding Box Context", "Relative Mean Execution Time (%)", barplot=True)
    #fig.update_annotations(font=dict(size=14))
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    fig.update_yaxes(matches=None, showticklabels=True, visible=True,  tickfont=dict(size=14))
    fig.update_xaxes(tickfont=dict(size=14))
   # fig.update_traces(texttemplate='%{text:.2}', textposition='inside', textfont=dict(color='white'))

    fig.update_layout(barmode="relative",
                      title_x=0.5,
                      width=1000, 
                      height=1000, 
                      legend=dict(
                            yanchor="top",
                            y=-0.15,
                            xanchor="left",
                            x=0.7,
                            orientation="h",
                            font=dict(size=15),
                        ),
                        title=dict(
                            text=f'Relative Mean Execution Time for Intersection in Different Contexts',
                            x=0.5,
                            font=dict(size=22)
                            )
                        )
   
    fig.show()


In [13]:
total_time_df = stats_df[['context', 'total_time', 'sizes', 'baseline', 'predicate', 'decomp']]
plot_total_time_bars(total_time_df, is_predicate=False, size_cat_excl=[], log_scale=False) #

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_time,predicate,decomp
context,baseline,sizes,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fully Inside Other (TRUE),False,Small/Large,0.00275,0.0,0.001596
Fully Inside Other (TRUE),False,Small/Medium,0.000666,0.0,0.000175
Fully Inside Other (TRUE),False,Small/Small,0.000556,0.0,0.00013
Fully Inside Other (TRUE),True,Small/Large,0.004632,0.0,0.003661
Fully Inside Other (TRUE),True,Small/Medium,0.000919,0.0,0.000405
Fully Inside Other (TRUE),True,Small/Small,0.000641,0.0,0.000188
Overlap (FALSE),False,Large/Large,0.00052,0.0,0.000365
Overlap (FALSE),False,Medium/Large,0.000448,0.0,0.000254
Overlap (FALSE),False,Medium/Medium,0.000342,0.0,0.00016
Overlap (FALSE),False,Small/Large,0.000344,0.0,0.000241


#### Chunk Unfolding Analyzation

In [None]:
chunk_unfolded_df = stats_df[['nbr_recieved_chks', 'nbr_total_chks','predicate','baseline', 'context', 'sizes']]
chunk_unfolded_df["chk_fraction_unfolded"] = chunk_unfolded_df["nbr_recieved_chks"].div(chunk_unfolded_df.nbr_total_chks, axis=0) * 100

In [14]:
def plot_chunk_unfolded_frac(df, is_predicate=False, is_baseline=False):
    df["chk_fraction_unfolded"] = df["nbr_recieved_chks"].div(chunk_unfolded_df.nbr_total_chks, axis=0) * 100
    df = df[(df['predicate'] == is_predicate) & (df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"]))]
    df = df[df['baseline'] == is_baseline]
    df = df.groupby(['context', 'baseline', 'sizes']).mean()
    df.reset_index(inplace=True)



    intersection_format = "Intersection" if not is_predicate else "IsIntersection"

    fig = px.bar(df, 
                       x="chk_fraction_unfolded", 
                       y="context", 
                       facet_row="sizes", 
                       barmode='group',
                       facet_col_spacing=0.1, 
                       title=f"Fraction of Chunks Unfolded for {intersection_format}",
                       labels=dict(sizes="Geometry Sizes", context="Context", baseline="Baseline", Stage="Stage: "),
                       facet_col_wrap=1,
                       height=1000,
                       width=1000,
                       orientation="h",
                       text="chk_fraction_unfolded"
                       )
    
    fig.update_annotations(font=dict(size=16))
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    fig.update_yaxes(showticklabels=True, visible=True,  tickfont=dict(size=16))
    fig.update_xaxes(tickfont=dict(size=16))
    fig.update_traces(texttemplate='%{text:.2f}%' , textposition='inside',textfont=dict(size=14))
    fig.update_layout(barmode="relative",
                      title_x=0.5,
                      width=1000, 
                      height=800, 
                      legend=dict(
                            yanchor="top",
                            y=1,
                            xanchor="left",
                            x=0.5,
                            orientation="h",
                            font=dict(size=16)
                        ),
                        title=dict(
                            text=f'Fraction of Chunks Unfolded for Intersection in Different Contexts and Geometry Sizes',
                            x=0.5,
                            font=dict(size=22)
                            ),
                        )

    for axis in fig.layout:
        if type(fig.layout[axis]) == objs.layout.XAxis:
            fig.layout[axis].title.text = ''

    for axis in fig.layout:
        if type(fig.layout[axis]) == objs.layout.YAxis:
            fig.layout[axis].title.text = '' 

    fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1].split("/")[0]))
    #fig.for_each_trace(lambda t: t.update(name=t.name.split("=")[0]))
    fig.add_annotation(
    dict(
        x=0.5,
        y=-0.04,  # adjust as needed
        showarrow=False,
        text="Chunks Unfolded (%)",
        xref="paper",
        yref="paper",
        font=dict(size=20),
        xanchor="center", 
        yanchor="top"
    )
    )

    fig.add_annotation(
    dict(
        x=1.05,
        y=0.62,  # adjust as needed
        showarrow=False,
        text="Geometry Size",
        xref="paper",
        yref="paper",
        textangle=90,
        font=dict(size=20),
        xanchor="center", 
        yanchor="top"
    )
    )

    fig.add_annotation(
    dict(
        x=-0.35,  # adjust as needed
        y=0.5,
        showarrow=False,
        text="Bouning Box Context",
        textangle=-90,
        xref="paper",
        yref="paper",
        font=dict(size=20),
        xanchor="center",
        yanchor="middle"
    ))
   
    fig.show()

## Results

#### Total time Analysis

#### (%) Unfolding of chunks

In [15]:
plot_chunk_unfolded_frac(chunk_unfolded_df, is_predicate=False, is_baseline=False)

NameError: name 'chunk_unfolded_df' is not defined