#### Imports

In [1]:
%pip install dill


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import shapely
import bisect
import shapely.wkt
import matplotlib.pyplot as plt
import math
import geopandas as gpd
import json
import pandas as pd
import seaborn as sns
from enum import Enum
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import intersection.first_bin_search
import intersection.chunk_bbox_intersection
from bench_utils import parse_intersection_data 
common_bbox = intersection.chunk_bbox_intersection.common_bbox
chunk_bbox_is_intersecting = intersection.chunk_bbox_intersection.is_intersecting
chunk_bbox_intersection = intersection.chunk_bbox_intersection.intersection



from intersection.plotting import *
import algos.fpd_extended_lib.cfg as cfg
from algos.alg_fpd_extended import FpdExtended


pd.options.mode.chained_assignment = None  # default='warn'

#### Import data

In [3]:
# Compressor instance shared by the helper functions in this notebook
# (they call `fpd.compress`).
fpd = FpdExtended()

# Geometries
# Each call returns a (data, stats) pair. The `*_stats` values are not used by
# any cell visible in this notebook.
# NOTE(review): the meaning of the positional `40` is defined by
# parse_intersection_data, which is not shown here.
# manual_data, _ = parse_intersection_data("manual")
# special_cases, _ = parse_intersection_data("latest_export.json", strip_precision=True)
world_data, world_data_stats = parse_intersection_data("world.json",40, strip_precision=True)
lund_data, lund_data_stats = parse_intersection_data("lund.json")
# sweden_data, sweden_data_stats = parse_intersection_data("sweden-places-a.json",40)
# new_york_data, new_york_data_stats = parse_intersection_data("new-york-natural.json",40)
# waterways_data, waterways_data_stats = parse_intersection_data("nord-est-waterways.json",40)

#sweden_all_data, sweden_all_data_stats = parse_intersection_data("final/latest_export.pkl")
#country_borders_data, country_borders_data_stats = parse_intersection_data("final/country_borders.pkl")
#china_water_data, china_water_data_stats = parse_intersection_data("final/china_water_100k.pkl")

# Datasets to benchmark, as 3-tuples. The collection loops unpack each entry as
# `dataset, _, dataset_name`, so the middle value is ignored there.
# Every entry is currently commented out, so the list is empty and the
# collection loops only do work once entries are re-enabled.
datasets = [
    ##(china_water_data, 13, "China Water"),
    #(country_borders_data, 13, "Country Borders"),
    # (manual_data, 3, "manual_data"),
    # (special_cases, 3, "special_cases"),
    #(world_data, 15, "world_data"),
    #(lund_data, 3, "lund_data"),
    # (sweden_data, 15, "sweden_data"),
    # (new_york_data, 10, "new_york_data"),
    # (waterways_data, 3, "waterways")
]


#### Constants

In [4]:
# Number of total iterations = NBR_ITER * N * number_of_datasets
# NOTE(review): NBR_ITER is not referenced by any cell visible in this notebook.
NBR_ITER = -1 # Number of iterations for each dataset
N = 1 # Number of times to recalculate each geometry for reliable mean value
# Upper bound on the number of geometry pairs sampled per dataset in the
# MAX_NUM_DELTAS analysis (used as `min(K, len(dataset))`).
K = 5000

# Max DELTA_SIZE_VALUE in MAX_NUM_DELTAS Analysis
# Used as the exclusive upper bound of `range(0, MAX_DELTA_SIZE)`.
MAX_DELTA_SIZE = 100

#Geometry classification sizes
# size < MEDIUM_THRESHOLD -> "Small"; size >= LARGE_THRESHOLD -> "Large";
# anything in between -> "Medium" (see get_size_category).
MEDIUM_THRESHOLD = 30
LARGE_THRESHOLD = 100


# NOTE(review): SAVE_DFS is not read by the visible cells; the collection
# cells always write their CSV files.
SAVE_DFS = True
# When True the collection cells skip the benchmark and load previously
# written CSV files instead.
USE_EXISTING_DF = True

## Functions Section

#### Config Setter Functions

In [5]:
def set_max_num_delta(val):
    """Store `val` as `cfg.MAX_NUM_DELTAS` and update `cfg.D_CNT_SIZE` to match.

    `cfg.D_CNT_SIZE` is set to `cfg.required_bits(val)`.
    """
    cfg.MAX_NUM_DELTAS = val
    delta_count_bits = cfg.required_bits(val)
    cfg.D_CNT_SIZE = delta_count_bits

def act_deact_baseline(activate=True):
    """Switch the baseline configuration on or off.

    `cfg.BASELINE_ON` receives `activate` unchanged. The three
    `cfg.DISABLE_OPTIMIZED_*` flags are set to True when `activate` is truthy
    and to False otherwise.
    """
    optimizations_disabled = bool(activate)
    cfg.BASELINE_ON = activate
    for flag_name in (
        "DISABLE_OPTIMIZED_INTERSECTION",
        "DISABLE_OPTIMIZED_ADD_VERTEX",
        "DISABLE_OPTIMIZED_BOUNDING_BOX",
    ):
        setattr(cfg, flag_name, optimizations_disabled)

#### Stats Collector Functions

In [6]:
def get_size_category(size):
    """Classify a size against MEDIUM_THRESHOLD and LARGE_THRESHOLD.

    Returns:
        A `(label, rank)` tuple: ("Small", 0), ("Medium", 1) or ("Large", 2).
    """
    if size >= LARGE_THRESHOLD:
        return "Large", 2
    if size >= MEDIUM_THRESHOLD:
        return "Medium", 1
    return "Small", 0
    
def get_intersection_category(size1, size2):
    """Return the combined size label of a pair, e.g. "Small/Large".

    The two labels from `get_size_category` are joined with "/", the one with
    the lower rank first; on equal rank the label of `size1` comes first.
    """
    label1, rank1 = get_size_category(size1)
    label2, rank2 = get_size_category(size2)

    first, second = (label1, label2) if rank1 <= rank2 else (label2, label1)
    return first + "/" + second


def get_context_category(g1, g2, bins=None):
    """Classify how two geometries relate to each other.

    Args:
        g1, g2: Geometries, passed to `fpd.compress` (when `bins` is not
            given) and to the shapely predicates.
        bins: Optional pair `(b1, b2)` of already compressed geometries. When
            None, both geometries are compressed here.

    Returns:
        One of the strings
        "No Overlap (FALSE)"        - `common_bbox` returned a falsy value,
        "Overlap (FALSE)"           - `shapely.intersects` is false,
        "Fully Inside Other (TRUE)" - one geometry contains the other,
        "Partial Overlap (TRUE)"    - the geometries intersect otherwise.
    """
    # Identity check: `== None` would be evaluated element-wise (and fail)
    # if an array-like were ever passed as `bins`.
    if bins is None:
        _, b1_fpde = fpd.compress(g1)
        _, b2_fpde = fpd.compress(g2)
    else:
        b1_fpde, b2_fpde = bins

    # The predicates are evaluated lazily, in decision order, so the shapely
    # calls are skipped whenever an earlier check already fixes the category.
    if not common_bbox((b1_fpde, b2_fpde)):
        return "No Overlap (FALSE)"
    if not shapely.intersects(g1, g2):
        return "Overlap (FALSE)"
    if shapely.contains(g1, g2) or shapely.contains(g2, g1):
        return "Fully Inside Other (TRUE)"
    return "Partial Overlap (TRUE)"

def get_category_stat(g1, g2):
    """Return `[size_category, context_category]` for a pair of geometries.

    The size category is derived from the coordinate counts reported by
    `shapely.get_num_coordinates`; the context category comes from
    `get_context_category`.
    """
    coordinate_counts = [shapely.get_num_coordinates(geometry) for geometry in (g1, g2)]
    pair_sizes = get_intersection_category(*coordinate_counts)
    pair_context = get_context_category(g1, g2)
    return [pair_sizes, pair_context]


#For taking mean of all stats, not just total time
def get_time_stat(bins, predicate=False):
    """Run the intersection algorithm N times and average its statistics.

    Args:
        bins: Passed unchanged as the first argument of the algorithm.
        predicate: Use `chunk_bbox_is_intersecting` when True, otherwise
            `chunk_bbox_intersection`.

    Returns:
        A list with the element-wise mean of the first four statistics
        returned by the algorithm (called with `get_stats=True`).
    """
    if predicate:
        run_algorithm = chunk_bbox_is_intersecting
    else:
        run_algorithm = chunk_bbox_intersection

    # Running totals for: 'decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time'
    totals = [0, 0, 0, 0]
    for _run in range(N):
        run_stats, _result = run_algorithm(bins, get_stats=True)
        totals = [accumulated + value for accumulated, value in zip(totals, run_stats)]

    return [total / N for total in totals]


#### Dataset Evaluation

In [7]:
def evaluate_dataset(stats_df, delta_size=None, not_predicate=True, add_size=False, dataset_name="", args=None):
    """Benchmark one geometry pair and append the result row(s) to `stats_df`.

    Args:
        stats_df: DataFrame that receives the rows (modified in place). Its
            columns must match the row layout described below.
        delta_size: When not None, applied through `set_max_num_delta` before
            the geometries are compressed.
        not_predicate: When True only the intersection is measured; when False
            the predicate is measured first and the intersection second.
        add_size: When True the summed length of both compressed geometries is
            added to the row.
        dataset_name: Value stored in the last column of the row.
        args: Tuple `(idx, g1, g2)` with the pair's index and both geometries.

    Row layout: the values from `get_time_stat`, the two values from
    `get_category_stat`, the delta size, the predicate flag, `idx`,
    `cfg.BASELINE_ON`, optionally the summed size, and `dataset_name`.
    """
    idx, g1, g2 = args
    if delta_size is not None:
        set_max_num_delta(delta_size)
    bins = (fpd.compress(g1)[1], fpd.compress(g2)[1])

    for is_predicate in ([False] if not_predicate else [True, False]):
        # Timing is taken first, before the categorisation touches the
        # geometries again, exactly as in the row order below.
        row = []
        row.extend(get_time_stat(bins=bins, predicate=is_predicate))
        row.extend(get_category_stat(g1, g2))
        row.append(delta_size if delta_size is not None else cfg.MAX_NUM_DELTAS)
        row.append(is_predicate)
        row.append(idx)
        row.append(cfg.BASELINE_ON)
        if add_size:
            row.append(len(bins[0]) + len(bins[1]))
        row.append(dataset_name)
        stats_df.loc[len(stats_df)] = row

# Statistics Visualization

#### Best MAX_NUM_DELTAS per Dataset Analysis

In [120]:
from tqdm import tqdm

# Either run the MAX_NUM_DELTAS benchmark (slow) or load earlier results from CSV.
if not USE_EXISTING_DF:
    # One result frame per dataset; concatenated once all datasets are done.
    # NOTE(review): with an empty `datasets` list, pd.concat below receives an
    # empty list and raises ValueError.
    max_deltas_dfs = []
    for dataset in datasets:
        max_deltas_df = pd.DataFrame(columns=['decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time', "sizes", "context", "delta_size", "predicate", "dataset_idx", "baseline", "sum_size", "dataset"])

        # `dataset` is rebound from the 3-tuple to its first element (the pairs).
        dataset, _, dataset_name = dataset
        # Fixed seed so the same pairs are drawn on every run.
        np.random.seed(2021)
        # NOTE(review): np.random.choice samples WITH replacement by default, so
        # the same pair can be drawn more than once - confirm this is intended.
        indicies = np.random.choice(len(dataset), size= min(K, len(dataset))) 
        dataset = np.array(dataset)[indicies] 
        for idx, (g1, g2) in enumerate(tqdm(dataset)):
            
            # Pairs involving a Point are skipped.
            if g1.geom_type == "Point" or g2.geom_type == "Point":
                continue

            # Only pairs whose geometries fall in the same size class are measured.
            cat =  get_category_stat(g1, g2)[0]
            if cat != 'Small/Small' and cat != 'Medium/Medium' and cat != 'Large/Large':
                continue

            # FPDE runs: one measurement per delta size in [0, MAX_DELTA_SIZE).
            act_deact_baseline(activate=False)        
            for max_delta_size in range(0,MAX_DELTA_SIZE):
                evaluate_dataset(stats_df=max_deltas_df, delta_size=max_delta_size, not_predicate=True, add_size=True, dataset_name=dataset_name, args=(idx, g1, g2)) 

            # Baseline run, stored under delta_size=10000000; the plotting code
            # looks the baseline row up by that value.
            act_deact_baseline(activate=True)        
            evaluate_dataset(stats_df=max_deltas_df, delta_size=10000000, not_predicate=True, add_size=True, dataset_name=dataset_name, args=(idx, g1, g2)) 
        
        max_deltas_df.to_csv(f'max_deltas_{dataset_name}.csv', index=False)
        max_deltas_dfs.append(max_deltas_df)
    max_deltas_df = pd.concat(max_deltas_dfs)
    display(max_deltas_df)

else:
    # NOTE(review): these file names differ from the `max_deltas_{dataset_name}.csv`
    # pattern written above (extra "3" suffix) - presumably renamed by hand.
    country_max_deltas_df = pd.read_csv('max_deltas_Country Borders3.csv')
    # Rewrites delta_size 100000 to 10000000 so the baseline rows of this file
    # use the same value as the other files - presumably it was produced with
    # an older marker value; TODO confirm.
    country_max_deltas_df['delta_size'] = country_max_deltas_df['delta_size'].replace(100000, 10000000)
    china_max_deltas_df = pd.read_csv('max_deltas_China Water3.csv')
    sweden_max_deltas_df = pd.read_csv('max_deltas_Sweden All3.csv')
    lund_max_deltas_df = pd.read_csv('max_deltas_lund_data3.csv')

    max_deltas_df = pd.concat([china_max_deltas_df, 
                               sweden_max_deltas_df, 
                               lund_max_deltas_df, 
                               country_max_deltas_df
                               ])




In [121]:
# Keep only the columns that plot_max_delta_size_speed reads.
max_deltas_filt_df = max_deltas_df[['total_time', 'sum_size', "sizes", "delta_size", "predicate", "baseline"]]

### Helper

In [122]:

import plotly.graph_objs as objs

def make_general_facet_title(fig, x_title, y_title, secondary=None, barplot=False, hbarplot=False):
    """Replace the per-subplot axis titles with figure-wide titles.

    Every x- and y-axis title in `fig.layout` is blanked, then paper-anchored
    annotations are added: `x_title` below the figure, `secondary` (if given)
    rotated at the right edge, and `y_title` rotated at the left edge.

    Args:
        fig: Plotly figure, modified in place.
        x_title: Text of the shared x-axis title.
        y_title: Text of the shared y-axis title.
        secondary: Optional text of the shared secondary y-axis title.
        barplot: Selects the alternative label offsets used for bar plots.
        hbarplot: Not used by this function.
    """
    def _paper_label(text, x, y, textangle, yanchor):
        # Shared settings of all three labels; a fresh dict per call.
        return dict(
            x=x,
            y=y,
            showarrow=False,
            text=text,
            textangle=textangle,
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor=yanchor,
        )

    # Remove current axis titles
    axis_types = (objs.layout.XAxis, objs.layout.YAxis)
    for layout_key in fig.layout:
        if type(fig.layout[layout_key]) in axis_types:
            fig.layout[layout_key].title.text = ''

    # Offsets are hand-tuned; bar plots need slightly different ones.
    x_label_position = (0.5, -0.10) if barplot else (0.48, -0.03)
    y_label_x = -0.08 if barplot else -0.07

    fig.add_annotation(_paper_label(x_title, x_label_position[0], x_label_position[1], 0, "top"))

    if secondary is not None:
        fig.add_annotation(_paper_label(secondary, 1.02, 0.5, -90, "middle"))

    fig.add_annotation(_paper_label(y_title, y_label_x, 0.5, -90, "middle"))

In [130]:
def plot_max_delta_size_speed(df, is_predicate=False, no_mixed=False, relative=False):
    """Plot mean execution time and mean size against the max chunk size.

    One subplot is drawn per size category found in `df`. Each subplot shows
    the (3-point rolling mean of the) mean execution time on the primary
    y-axis, the mean `sum_size` on the secondary y-axis, and a green band
    around the delta size with the lowest smoothed time.

    Args:
        df: Frame with the columns 'total_time', 'sum_size', 'sizes',
            'delta_size', 'predicate' and 'baseline'.
        is_predicate: Only rows whose 'predicate' equals this value are used.
        no_mixed: When True only "Small/Small", "Medium/Medium" and
            "Large/Large" are plotted, in a single column; otherwise a 3x2
            grid is used.
        relative: When True only the FPDE curve is drawn, with time and size
            divided by the baseline row (baseline == True and
            delta_size == 10000000) of the same size category.

    Raises:
        IndexError: If `relative` is set and a size category has no baseline
            row, or if a size category has no FPDE rows.
    """
    # delta_size value under which the baseline measurement is stored.
    baseline_delta_size = 10000000
    n_rows = 3
    n_cols = 1 if no_mixed else 2

    # Keep a handle on the caller's frame: it is displayed next to the figure,
    # instead of reaching for a notebook-global variable.
    input_df = df

    if no_mixed:
        df = df[df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"])]

    df = df[df['predicate'] == is_predicate]
    # numeric_only keeps this working on pandas >= 2 when the frame still
    # carries string columns (e.g. 'dataset' or 'context').
    df = df.groupby(['delta_size', 'sizes', 'baseline']).mean(numeric_only=True).reset_index()

    # Size category -> running subplot number (0-based).
    size_positions = {size: position for position, size in enumerate(df.sizes.unique())}

    # Same-size pairs are titled by the single size; mixed pairs need both
    # parts, otherwise e.g. "Small/Small" and "Small/Large" get the same title.
    subplot_titles = [
        "Geometry Sizes=" + (size.split('/')[0] if no_mixed else size)
        for size in size_positions
    ]

    fig = make_subplots(rows=n_rows, cols=n_cols,
                        subplot_titles=subplot_titles,
                        horizontal_spacing=0.08,
                        vertical_spacing=0.07,
                        specs=[[{"secondary_y": True} for _ in range(n_cols)] for _ in range(n_rows)])

    #For not showing the same legend twice
    legend_showed = False

    for size, position in size_positions.items():
        # Row-major placement, the same order make_subplots uses for
        # `subplot_titles`, so every trace ends up under its own title.
        row = position // n_cols + 1
        col = position % n_cols + 1

        curr_context_df = df[df['sizes'] == size]

        if relative:
            baseline_row = curr_context_df[(curr_context_df['baseline'] == True)
                                           & (curr_context_df['delta_size'] == baseline_delta_size)]
            baseline_time = baseline_row.total_time.values[0]
            baseline_size = baseline_row.sum_size.values[0]

        for is_baseline in ([False] if relative else [True, False]):
            line = dict(color='blue' if is_baseline else 'red', dash='dash' if is_baseline else None)
            trace_name = "Baseline" if is_baseline else "FPDE"

            # Explicit copy: the columns are overwritten below.
            curr_df = curr_context_df[curr_context_df['baseline'] == is_baseline].copy()

            if relative:
                curr_df['total_time'] = curr_df['total_time'] / baseline_time
                curr_df['sum_size'] = curr_df['sum_size'] / baseline_size

            # Smooth the timing curve with a 3-point rolling mean over delta_size.
            curr_df["total_time"] = curr_df.sort_values(by='delta_size')['total_time'].rolling(window=3, min_periods=1).mean()

            fig.add_trace(go.Scatter(x=curr_df.delta_size.values, y=curr_df.total_time.values,
                                     mode='lines',
                                     line=line,
                                     showlegend=not legend_showed,
                                     name=trace_name),
                          row=row, col=col,
                          secondary_y=False)

            if not is_baseline:
                fig.add_trace(go.Scatter(x=curr_df.delta_size.values, y=curr_df.sum_size.values,
                                         mode='lines',
                                         line=dict(color='green', dash='dash'),
                                         showlegend=not legend_showed,
                                         name="Average size (bits)"),
                              row=row, col=col,
                              secondary_y=True)
                legend_showed = True

                # Highlight the delta size with the lowest smoothed time.
                min_delta_size = curr_df.sort_values(by='total_time').delta_size.values[0]

                fig.add_vrect(x0=min_delta_size - 1, x1=min_delta_size + 1, row=row, col=col,
                              annotation_text=f"Min={min_delta_size}", annotation_position="inside",
                              fillcolor="green", opacity=0.25, line_width=0,
                              annotation=dict(textangle=-90, font=dict(size=16)))

    intersection_format = "Intersection" if not is_predicate else "IsIntersection"
    # Labels only claim "relative" when the values really were normalised.
    time_title = "Relative Execution Time" if relative else "Execution Time"
    y_axis_title = "Relative Mean Execution Time (%)" if relative else "Mean Execution Time (s)"

    fig.update_layout(
                      title_x=0.5,
                      width=1000,
                      height=1000,
                      legend=dict(
                            yanchor="top",
                            y=-0.037,
                            xanchor="left",
                            x=0.7,
                            orientation="h",
                            font=dict(size=15),
                        ),
                        title=dict(
                            text=f'{time_title} for {intersection_format} by Max Chunk Size',
                            x=0.5,
                            font=dict(size=22)
                            )
                        )

    # Must run before the shared axis titles are added, which use a larger font.
    fig.update_annotations(font=dict(size=16))
    fig.update_yaxes(type="log", tickfont=dict(size=16))
    fig.update_xaxes(tickfont=dict(size=16))

    make_general_facet_title(fig, "Max Chunk Size", y_axis_title, secondary="Mean Size (bits)")

    display(input_df)
    fig.show()



In [131]:
# Intersection (not the predicate), same-size pairs only, FPDE values divided by the baseline.
plot_max_delta_size_speed(max_deltas_filt_df, is_predicate=False, no_mixed=True, relative=True)

Unnamed: 0,total_time,sum_size,sizes,delta_size,predicate,baseline
0,0.002065,4845,Medium/Medium,0,False,False
1,0.001002,3023,Medium/Medium,1,False,False
2,0.001081,2374,Medium/Medium,2,False,False
3,0.001098,2109,Medium/Medium,3,False,False
4,0.000834,1878,Medium/Medium,4,False,False
...,...,...,...,...,...,...
384401,0.000107,560,Small/Small,96,False,False
384402,0.000102,560,Small/Small,97,False,False
384403,0.000101,560,Small/Small,98,False,False
384404,0.000100,560,Small/Small,99,False,False


#### Dataset Stats Collection

In [142]:
# Either run the baseline-vs-FPDE benchmark (slow) or load earlier results from CSV.
# Relies on `tqdm` having been imported by an earlier cell.
if not USE_EXISTING_DF:
    # One result frame per dataset; concatenated once all datasets are done.
    # NOTE(review): with an empty `datasets` list, pd.concat below receives an
    # empty list and raises ValueError.
    stats_dfs = []

    for dataset in datasets:
        stats_df = pd.DataFrame(columns=['decomp', 'nbr_recieved_chks', 'nbr_total_chks', 'total_time', "sizes", "context", "delta_size", "predicate", "dataset_idx", "baseline", "dataset"])

        # `dataset` is rebound from the 3-tuple to its first element (the pairs).
        dataset, _, dataset_name = dataset
        # Fixed seed so the same pairs are drawn on every run.
        np.random.seed(2021)
        # 100000 draws with replacement, so pairs can repeat.
        indicies = np.random.choice(len(dataset), size=100000, replace=True)
        dataset = np.array(dataset)[indicies] 
        for idx, (g1, g2) in enumerate(tqdm(dataset)):
            # Pairs of equal geometries are skipped.
            if g1.equals(g2):
                continue
            
            # Pairs involving a Point are skipped.
            if g1.geom_type == "Point" or g2.geom_type == "Point":
                continue

            # Measure each pair twice: baseline first, then FPDE.
            for baseline_on in [True, False]:
                act_deact_baseline(activate=baseline_on)
                

                # FPDE runs use delta_size=15; baseline runs are stored under 10000000.
                evaluate_dataset(stats_df, delta_size=15 if not baseline_on else 10000000, dataset_name=dataset_name, args=(idx, g1, g2))
        stats_df.to_csv(f'stats_df_{dataset_name}.csv', index=False)
        stats_dfs.append(stats_df)
    stats_df = pd.concat(stats_dfs)
else:
    # NOTE(review): these file names differ from the `stats_df_{dataset_name}.csv`
    # pattern written above (numeric suffixes) - presumably renamed by hand.
    country_stats_df = pd.read_csv('stats_df_Country Borders4.csv')
    china_stats_df = pd.read_csv('stats_df_China Water4.csv')
    sweden_stats_df = pd.read_csv('stats_df_Sweden All3.csv')
    # Loaded but currently left out of the concatenation below.
    lund_stats_df = pd.read_csv('stats_df_lund_data3.csv')

    stats_df = pd.concat([
        china_stats_df, 
        sweden_stats_df, 
        #lund_stats_df, 
        country_stats_df
        ]
    )


### OLD VERSION For all mean relative execution time and fractions

In [143]:
# def plot_total_time_bars_relative(df, is_predicate=False, size_cat_excl=[], log_scale=True):
#     display(df[(df['predicate'] == True)])
#     df = df[(df['predicate'] == False)] #(df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"]))]

#     df.replace({"Fully Inside Other (TRUE)": "Contained", 
#             "Overlap (FALSE)": "Disjoint",
#             "Partial Overlap (TRUE)": "Crossing",
#             "Large/Large": "L/L",
#             "Small/Large": "S/L",
#             "Medium/Large": "M/L",
#             "Small/Medium": "S/M",
#             "Medium/Medium": "M/M",
#             "Small/Small": "S/S",
#             }, inplace=True)

#     #df.reset_index(inplace=True)
#     baseline_df = df[df['baseline']]
#     not_baseline_df = df[~df['baseline']]
 
#     baseline_df.sort_values(by=['dataset', 'dataset_idx'])
   
   
#     not_baseline_df.sort_values(by=['dataset', 'dataset_idx'])
#     not_baseline_df['total_time_ratio'] = not_baseline_df['total_time'].values / baseline_df['total_time'].values
#     df = not_baseline_df
#     df['frac'] =(df["decomp"].div(df.total_time, axis=0))
#     df = df.groupby(["baseline", "context", "sizes"])[['frac','total_time_ratio' ]].mean() 
    
#     baseline_df['frac'] =(baseline_df["decomp"].div(baseline_df.total_time, axis=0))
#     baseline_df_temp = baseline_df.groupby(["baseline", "context", "sizes"])[['frac']].mean() 
#     display(baseline_df_temp)
#     display(df)
#     df.reset_index(inplace=True)
#     df['decomp_ratio'] = (df["frac"].mul(df.total_time_ratio, axis=0))
#     df['intersection_ratio'] = (df["total_time_ratio"].sub(df.decomp_ratio, axis=0))
#     intersection_formatting = "Intersection" if not is_predicate else "Is_Intersection"
#     df = pd.concat([df] * 2, ignore_index=True)

#     new_column_values = (["Decompression"] * (len(df) // 2))
#     new_column_values.extend((["Intersection"] * (len(df) // 2)))

#     df['Stage'] = new_column_values
#     df['time'] = df.apply(lambda x: x['decomp_ratio'] * 100 if x["Stage"] == "Decompression" else x['intersection_ratio'] * 100, axis=1)
#     df = df[['time', 'context', 'sizes', 'Stage']]
#     fig = px.bar(df, 
#                        x="context", 
#                        y="time", 
#                        facet_col="sizes", 
#                        color="Stage",
#                        barmode='group',
#                        log_y=log_scale, 
#                        facet_col_spacing=0.1, 
#                        title=f"Execution Time for {intersection_formatting} in Different Contexts & Sizes",
#                        labels=dict(sizes="Geometry Sizes", context="Context", baseline="Baseline", Stage="Stage: "),
#                        facet_col_wrap=3,
#                        facet_row_spacing=0.1,
#                        height=1000,
#                        width=1000,
#                        pattern_shape='Stage',
#                        color_discrete_map={
#                             'Decompression': 'rgb(169,234,184)',
#                             'Intersection': 'rgb(255,183,180)',
#                             'Country Borders': 'rgb(254,198,161)'
#                         },
#                         category_orders={"context": ["Crossing", "Contained", "Disjoint"],}
#                        #text="time"
#                        )
#     fig.for_each_annotation(lambda a: a.update(text= "  " + a.text.split("=")[1]))

#     fig.update_annotations(font=dict(size=20))
#     make_general_facet_title(fig, "Intersection Context", "Relative Mean Execution Time (%)", barplot=True)
#     #fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
#     fig.update_yaxes(matches=None, showticklabels=True,  tickfont=dict(size=20), range=[0,105])
#     fig.update_xaxes(tickfont=dict(size=20))
#     #fig.update_traces(texttemplate='%{text:.2}%', textposition='inside', textfont=dict(color='white'))

#     fig.update_layout(barmode="relative",
#                       title_y=0.975,
#                     title_x=0.465,
#                       width=1000, 
#                       height=1000, 
#                       legend=dict(
#                             yanchor="top",
#                             y=1.15,
#                             xanchor="left",
#                             x=0.95,
#                            #orientation="h",
#                             font=dict(size=20),
#                         ),
#                         title=dict(
#                             text=f'Mean Relative Execution Time for Intersection',
                            
#                             font=dict(size=28)
#                             )
#                         )
#     fig.add_annotation(
#         dict(
#             x=0.5,
#             y=1.08,  # adjust as needed
#             showarrow=False,
#             text="Geometry Size",
#             xref="paper",
#             yref="paper",
#             font=dict(size=24),
#             xanchor="center", 
#             yanchor="top"
#         ))
#     for row in range(2):
#         for col in range(3):
#             fig.add_hline(y=100, line_dash="dot", row=row, col=col)

   
#     fig.show()


#### Relative Mean Execution time + Intersection, Decomp Frac

In [213]:
def plot_total_time_bars_relative(df, is_predicate=False, size_cat_excl=[], log_scale=True):
    """Plot FPDE execution time relative to the baseline, split by stage.

    For every (context, sizes) group the mean FPDE time is divided by the mean
    baseline time. That ratio is split into a decompression part and an
    intersection part using the mean FPDE decompression fraction
    (`decomp / total_time`), and shown as stacked bars in percent, one facet
    per size category. Three intermediate tables are printed and displayed.

    Args:
        df: Frame with the columns 'context', 'total_time', 'sizes',
            'baseline' (boolean), 'predicate' and 'decomp'. Not modified.
        is_predicate: Only rows whose 'predicate' equals this value are used.
        size_cat_excl: Not used by this function; kept for call compatibility.
        log_scale: Use a logarithmic y-axis.
    """
    short_labels = {
        "Fully Inside Other (TRUE)": "Contained",
        "Overlap (FALSE)": "Disjoint",
        "Partial Overlap (TRUE)": "Crossing",
        "Large/Large": "L/L",
        "Small/Large": "S/L",
        "Medium/Large": "M/L",
        "Small/Medium": "S/M",
        "Medium/Medium": "M/M",
        "Small/Small": "S/S",
    }
    group_cols = ["context", "sizes"]

    # Filtering and replace() both return new frames, so the caller's frame
    # stays untouched.
    df = df[df['predicate'] == is_predicate].replace(short_labels)

    # Share of the total time spent on decompression, per measurement.
    fracs = df[group_cols + ['baseline']].assign(decomp_frac=df["decomp"].div(df["total_time"]))
    baseline_fracs_df = fracs[fracs['baseline']].groupby(group_cols)[['decomp_frac']].mean()
    not_baseline_fracs_df = fracs[~fracs['baseline']].groupby(group_cols)[['decomp_frac']].mean()

    print("Baseline decompression fractions")
    display(baseline_fracs_df)
    print("FPDE decompression fractions")
    display(not_baseline_fracs_df)

    # numeric_only: the frame may carry string columns such as 'dataset',
    # which make a plain mean() raise on pandas >= 2.
    group_means = df.groupby(group_cols + ['baseline']).mean(numeric_only=True).reset_index()
    df_means = group_means[~group_means['baseline']].copy()

    # Look the baseline time and FPDE fraction up by (context, sizes) key
    # instead of relying on both tables having the same rows in the same
    # order; a group without a counterpart yields NaN rather than a
    # misaligned value.
    group_keys = pd.MultiIndex.from_frame(df_means[group_cols])
    baseline_times = group_means[group_means['baseline']].set_index(group_cols)['total_time']
    df_means['rel_tot_exec'] = df_means['total_time'].div(baseline_times.reindex(group_keys).values)
    print("FPDE Relative total execution time")
    display(df_means)

    fpde_decomp_frac = not_baseline_fracs_df['decomp_frac'].reindex(group_keys).values
    df_means['rel_decomp_exec'] = df_means['rel_tot_exec'].values * fpde_decomp_frac
    df_means['rel_intersection_exec'] = (1 - fpde_decomp_frac) * df_means['rel_tot_exec'].values

    # Long format: one row per (group, stage), values in percent.
    stage_keys = df_means[group_cols]
    df = pd.concat(
        [
            stage_keys.assign(Stage="Decompression", rel_exec_time=df_means['rel_decomp_exec'] * 100),
            stage_keys.assign(Stage="Intersection", rel_exec_time=df_means['rel_intersection_exec'] * 100),
        ],
        ignore_index=True,
    )[['rel_exec_time', 'context', 'sizes', 'Stage']]

    fig = px.bar(df,
                 x="context",
                 y="rel_exec_time",
                 facet_col="sizes",
                 color="Stage",
                 barmode='relative',
                 log_y=log_scale,
                 facet_col_spacing=0.1,
                 labels=dict(sizes="Geometry Sizes", context="Context", baseline="Baseline", Stage="Stage: "),
                 facet_col_wrap=3,
                 facet_row_spacing=0.1,
                 height=1000,
                 width=1000,
                 pattern_shape='Stage',
                 color_discrete_map={
                     'Decompression': 'rgb(169,234,184)',
                     'Intersection': 'rgb(255,183,180)',
                     'Country Borders': 'rgb(254,198,161)'
                 },
                 category_orders={"context": ["Crossing", "Contained", "Disjoint"],}
                 )
    # Facet titles arrive as "Geometry Sizes=<value>"; keep only the value.
    fig.for_each_annotation(lambda a: a.update(text="  " + a.text.split("=")[1]))

    fig.update_annotations(font=dict(size=20))
    make_general_facet_title(fig, "Intersection Context", "Relative Mean Execution Time (%)", barplot=True)

    yaxis_options = dict(matches=None, showticklabels=True, tickfont=dict(size=20))
    if not log_scale:
        # A log axis reads `range` as log10 exponents, so the fixed 0-105 %
        # window is only meaningful on a linear axis.
        yaxis_options['range'] = [0, 105]
    fig.update_yaxes(**yaxis_options)
    fig.update_xaxes(tickfont=dict(size=20))

    intersection_formatting = "Intersection" if not is_predicate else "Is_Intersection"
    fig.update_layout(title_y=0.975,
                      title_x=0.465,
                      width=1000,
                      height=1000,
                      legend=dict(
                          yanchor="top",
                          y=1.15,
                          xanchor="left",
                          x=0.95,
                          font=dict(size=20),
                      ),
                      title=dict(
                          text=f'Relative Mean Execution Time for {intersection_formatting}',
                          font=dict(size=28)
                      )
                      )
    fig.add_annotation(
        dict(
            x=0.5,
            y=1.08,  # adjust as needed
            showarrow=False,
            text="Geometry Size",
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor="top"
        ))

    # 100 % reference line in every facet. The default row="all"/col="all"
    # covers the whole grid; subplot indices in plotly are 1-based, so
    # addressing cells with range(...) values is not reliable.
    fig.add_hline(y=100, line_dash="dot")

    fig.show()


In [214]:
# Keep only the columns needed for the relative execution-time plot, then draw it on a linear y-axis.
total_time_df = stats_df[['context', 'total_time', 'sizes', 'baseline', 'predicate', 'decomp', 'dataset', 'dataset_idx']]
plot_total_time_bars_relative(total_time_df, is_predicate=False, size_cat_excl=[], log_scale=False) #

Baseline decompression fractions


Unnamed: 0_level_0,Unnamed: 1_level_0,decomp_frac
context,sizes,Unnamed: 2_level_1
Contained,L/L,0.594396
Contained,M/L,0.818214
Contained,M/M,0.538491
Contained,S/L,0.810741
Contained,S/M,0.520367
Contained,S/S,0.424762
Crossing,L/L,0.358745
Crossing,M/L,0.407025
Crossing,M/M,0.303516
Crossing,S/L,0.525387


FPDE decompression fractions


Unnamed: 0_level_0,Unnamed: 1_level_0,decomp_frac
context,sizes,Unnamed: 2_level_1
Contained,L/L,0.515842
Contained,M/L,0.650023
Contained,M/M,0.439622
Contained,S/L,0.612442
Contained,S/M,0.38586
Contained,S/S,0.364033
Crossing,L/L,0.168429
Crossing,M/L,0.194118
Crossing,M/M,0.177319
Crossing,S/L,0.25062


FPDE Relative total execution time


Unnamed: 0,context,sizes,baseline,total_time,predicate,decomp,dataset_idx,rel_tot_exec
0,Contained,L/L,False,0.019645,0.0,0.00647,47467.793103,0.392773
2,Contained,M/L,False,0.002743,0.0,0.001843,51288.654723,0.098335
4,Contained,M/M,False,0.002006,0.0,0.000677,51964.476636,0.860414
6,Contained,S/L,False,0.002183,0.0,0.00175,49871.579229,0.075337
8,Contained,S/M,False,0.000756,0.0,0.00029,50224.313058,0.608377
10,Contained,S/S,False,0.000689,0.0,0.000248,50375.105612,0.821023
12,Crossing,L/L,False,0.007491,0.0,0.001031,49948.150847,0.34344
14,Crossing,M/L,False,0.002827,0.0,0.000511,50280.592739,0.502412
16,Crossing,M/M,False,0.002106,0.0,0.000344,49976.410339,0.764157
18,Crossing,S/L,False,0.002409,0.0,0.000807,49826.937342,0.066236


#### Mean Execution time

In [82]:
def plot_total_time_bars(df, is_predicate=False, size_cat_excl=[], log_scale=True):
    """Plot the mean total execution time of Baseline vs. FPDE as grouped bars.

    Rows of ``df`` are filtered on ``predicate == is_predicate``, averaged per
    (context, baseline, sizes) group, relabelled with short names and drawn as
    one facet per geometry-size category. The aggregated table is shown with
    ``display`` before plotting; the figure is shown and also written to
    ``fig3.svg``.

    Args:
        df: DataFrame with at least the columns ``context``, ``baseline``,
            ``sizes``, ``predicate`` and ``total_time``. The whole frame goes
            through ``groupby(...).mean()``, so the remaining columns have to
            be averageable.
        is_predicate: Value of the ``predicate`` column to keep. Also selects
            the operation name ("Intersection" / "Is_Intersection") used in
            the titles.
        size_cat_excl: Kept for signature compatibility; this function does
            not apply it.
        log_scale: When true the y axes are logarithmic.

    Returns:
        None.
    """
    short_labels = {"Fully Inside Other (TRUE)": "Contained",
                    "Overlap (FALSE)": "Disjoint",
                    "Partial Overlap (TRUE)": "Crossing",
                    "Large/Large": "L/L",
                    "Small/Large": "S/L",
                    "Medium/Large": "M/L",
                    "Small/Medium": "S/M",
                    "Medium/Medium": "M/M",
                    "Small/Small": "S/S",
                    True: "Baseline",
                    False: "FPDE"}

    df = df[(df['predicate'] == is_predicate)]
    df = df.groupby(['context', 'baseline', 'sizes']).mean()
    display(df)
    # Non in-place so the aggregated frame shown above is not altered afterwards.
    df = df.reset_index().replace(short_labels)

    intersection_formatting = "Intersection" if not is_predicate else "Is_Intersection"
    fig = px.histogram(df,
                       x="context",
                       y="total_time",
                       facet_col="sizes",
                       color="baseline",
                       barmode='group',
                       # Previously hard-coded to False, which made the
                       # log_scale argument a no-op.
                       log_y=log_scale,
                       facet_col_spacing=0.1,
                       histfunc="avg",
                       title=f"Execution Time for {intersection_formatting} in Different Contexts & Sizes",
                       labels=dict(sizes="Geometry Sizes", context="Context", baseline="Algorithm", Stage="Stage: "),
                       facet_col_wrap=3,
                       facet_row_spacing=0.1,
                       height=1000,
                       width=1000,
                       color_discrete_map={
                            "Baseline": 'rgb(169,234,184)',
                            "FPDE": 'rgb(255,183,180)',
                        },
                       category_orders={"context": ["Crossing", "Contained", "Disjoint"],}
                       )

    # Facet titles have the form "<label>=<value>"; keep only the value. This
    # must run before the captions below are added, since those contain no "=".
    fig.for_each_annotation(lambda a: a.update(text= "  " + a.text.split("=")[1]))
    fig.update_annotations(font=dict(size=20))

    # Per-axis titles are blanked; the shared captions below replace them.
    # Each facet gets its own y range (matches=None) with visible tick labels.
    fig.update_xaxes(title_text='', tickfont=dict(size=20))
    fig.update_yaxes(title_text='', matches=None, showticklabels=True, tickfont=dict(size=20))

    # Shared captions, positioned in paper coordinates: below, left of and
    # above the facet grid.
    fig.add_annotation(
        dict(
            x=0.5,
            y=-0.1,  # adjust as needed
            showarrow=False,
            text="Intersection Context",
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor="top",
            textangle=0,
        )
    )
    fig.add_annotation(
        dict(
            x=-0.08,  # adjust as needed
            y=0.5,
            showarrow=False,
            text="Mean Execution Time (s)",
            textangle=-90,
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor="middle"
        )
    )
    fig.add_annotation(
        dict(
            x=0.5,
            y=1.08,  # adjust as needed
            showarrow=False,
            text="Geometry Size",
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor="top"
        )
    )

    fig.update_layout(barmode="group",
                      title_x=0.5,
                      title_y=0.975,
                      width=1000,
                      height=1000,
                      legend=dict(
                          yanchor="top",
                          y=1.15,
                          xanchor="left",
                          x=0.95,
                          font=dict(size=20),
                      ),
                      # This title replaces the one passed to px.histogram;
                      # it now names the operation selected by is_predicate.
                      title=dict(
                          text=f'Mean Execution Time for {intersection_formatting}',
                          font=dict(size=28)
                      )
                      )

    fig.show()
    fig.write_image("fig3.svg")


In [84]:
# Restrict the stats to the columns the mean-execution-time plot needs, then
# plot the non-predicate runs; the table printed below is the per-group mean
# that plot_total_time_bars displays before drawing.
total_time_df = stats_df[['context', 'total_time', 'sizes', 'baseline', 'predicate', 'decomp']]
plot_total_time_bars(total_time_df, is_predicate=False, size_cat_excl=[], log_scale=False) #

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_time,predicate,decomp
context,baseline,sizes,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fully Inside Other (TRUE),False,Large/Large,0.019645,0.0,0.00647
Fully Inside Other (TRUE),False,Medium/Large,0.002743,0.0,0.001843
Fully Inside Other (TRUE),False,Medium/Medium,0.002006,0.0,0.000677
Fully Inside Other (TRUE),False,Small/Large,0.002183,0.0,0.00175
Fully Inside Other (TRUE),False,Small/Medium,0.000756,0.0,0.00029
Fully Inside Other (TRUE),False,Small/Small,0.000689,0.0,0.000248
Fully Inside Other (TRUE),True,Large/Large,0.050015,0.0,0.017129
Fully Inside Other (TRUE),True,Medium/Large,0.027894,0.0,0.026159
Fully Inside Other (TRUE),True,Medium/Medium,0.002332,0.0,0.001
Fully Inside Other (TRUE),True,Small/Large,0.028982,0.0,0.027926


In [75]:
def get_percentage_cases(df, is_predicate=False):
    """Count the non-baseline rows per (context, sizes) group.

    Only rows whose ``predicate`` equals ``is_predicate`` and whose
    ``baseline`` equals ``False`` are considered.

    Returns:
        A tuple ``(counts, total)``: ``counts`` holds the per-group non-null
        count of every remaining column, indexed by (context, sizes);
        ``total`` is the sum of the ``dataset`` counts over all groups.
    """
    predicate_matches = df['predicate'].eq(is_predicate)
    not_baseline = df['baseline'].eq(False)
    selected = df.loc[predicate_matches & not_baseline]

    counts = selected.groupby(['context', 'sizes']).count()
    total = sum(counts['dataset'])
    return counts, total
# Per-(context, sizes) counts of the non-baseline, non-predicate rows, and
# the total number of such rows (sum of the 'dataset' counts).
df, cnt = get_percentage_cases(stats_df)
# Turn every count into a percentage of that total.
df = 100 * df / (cnt)
#display(df['dataset'])

# NOTE: stats_df is rebound here, so every later cell only sees the rows with
# predicate == False.
stats_df = stats_df[(stats_df['predicate'] == False)] #(df['sizes'].isin(["Small/Small", "Medium/Medium", "Large/Large"]))]

# 'frac' is decomp as a percentage of total_time, computed row by row.
stats_df['frac'] =(stats_df["decomp"].div(stats_df.total_time, axis=0) * 100)
#display(stats_df.groupby(["baseline", "context", "sizes"])['frac'].mean())
#display(stats_df.groupby(["baseline", "context", "sizes"])[['total_time', 'decomp']].mean())

#display(stats_df[(stats_df['context'] == "Fully Inside Other (TRUE)") & (stats_df['baseline'] == False) & (stats_df['sizes'] == "Large/Large")])


#dataset, _, dataset_name = datasets[2]
#np.random.seed(2021)
#indicies = np.random.choice(len(dataset), size=100000, replace=True)
#dataset = np.array(dataset)[indicies] 
#g1, g2 = dataset[65461]
#create_canvas()
#plot_geometry(g1)
#plot_geometry(g2)
#plt.show()





#### Chunk Unfolding Analysis

In [76]:
# Columns needed for the chunk-unfolding plot.
chunk_unfolded_df = stats_df[['nbr_recieved_chks', 'nbr_total_chks','predicate','baseline', 'context', 'sizes']]
# Received chunks as a percentage of the total number of chunks, per row.
chunk_unfolded_df["chk_fraction_unfolded"] = chunk_unfolded_df["nbr_recieved_chks"].div(chunk_unfolded_df.nbr_total_chks, axis=0) * 100

In [77]:
def plot_chunk_unfolded_frac(df, is_predicate=False, is_baseline=False):
    """Plot the mean percentage of unfolded chunks per context and size class.

    Rows are filtered on ``baseline == is_baseline``, relabelled with short
    names and averaged per (context, baseline, sizes) group. The result is
    drawn as horizontal bars with one facet row per geometry-size category.
    The figure is shown and also written to ``fig2.svg``.

    Args:
        df: DataFrame with at least the columns ``baseline``, ``context``,
            ``sizes`` and ``chk_fraction_unfolded``. The whole frame goes
            through ``groupby(...).mean()``, so the remaining columns have to
            be averageable. No filtering on ``predicate`` happens here.
        is_predicate: Only selects the operation name ("Intersection" /
            "IsIntersection") used in the titles.
        is_baseline: Value of the ``baseline`` column to keep.

    Returns:
        None.
    """
    short_labels = {"Fully Inside Other (TRUE)": "Contained",
                    "Overlap (FALSE)": "Disjoint",
                    "Partial Overlap (TRUE)": "Crossing",
                    "Large/Large": "L/L",
                    "Small/Large": "S/L",
                    "Medium/Large": "M/L",
                    "Small/Medium": "S/M",
                    "Medium/Medium": "M/M",
                    "Small/Small": "S/S",
                    }

    # The filter used to be hard-coded to False, ignoring is_baseline.
    df = df[df['baseline'] == is_baseline]
    # Non in-place: the filtered frame is derived from the caller's data.
    df = df.replace(short_labels)
    df = df.groupby(['context', 'baseline', 'sizes']).mean().reset_index()

    intersection_format = "Intersection" if not is_predicate else "IsIntersection"

    fig = px.bar(df,
                 x="chk_fraction_unfolded",
                 y="context",
                 facet_row="sizes",
                 barmode='group',
                 facet_col_spacing=0.1,
                 title=f"Fraction of Chunks Unfolded for {intersection_format}",
                 labels=dict(sizes="Geometry Sizes", context="Context", baseline="Baseline", Stage="Stage: "),
                 facet_col_wrap=1,
                 height=1000,
                 width=1300,
                 orientation="h",
                 text="chk_fraction_unfolded",
                 color_discrete_sequence=['rgb(255,183,180)']*len(df),
                 category_orders={"context": ["Crossing", "Contained", "Disjoint"],}
                 )

    fig.update_annotations(font=dict(size=24))
    # Per-axis titles are blanked; the shared captions added at the end
    # replace them.
    fig.update_yaxes(title_text='', showticklabels=True, visible=True, tickfont=dict(size=24))
    fig.update_xaxes(title_text='', tickfont=dict(size=24))
    fig.update_traces(texttemplate='%{text:.0f}%', textfont=dict(size=24))
    fig.update_layout(barmode="relative",
                      title_x=0.5,
                      width=1300,
                      height=1300,
                      legend=dict(
                          yanchor="top",
                          y=1,
                          xanchor="left",
                          x=0.5,
                          orientation="h",
                          font=dict(size=24)
                      ),
                      # This title replaces the one passed to px.bar; it now
                      # names the operation selected by is_predicate.
                      title=dict(
                          text=f'Fraction of Chunks Unfolded for {intersection_format}',
                          x=0.5,
                          font=dict(size=32)
                      ),
                      # Presumably resets the rotation of the facet labels
                      # that exist at this point; it has to stay ahead of the
                      # add_annotation calls below, which set their own
                      # angles. TODO confirm against the rendered figure.
                      annotations=[dict(
                          textangle=0
                      )
                      ],
                      margin=dict(
                          l=150,
                          r=200,
                          b=100,
                          t=100,
                          pad=5
                      )
                      )

    # Facet titles have the form "<label>=<value>"; keep only the value. This
    # must run before the captions below are added, since those contain no "=".
    fig.for_each_annotation(lambda a: a.update(text= "  " + a.text.split("=")[1]))

    # Shared captions, positioned in paper coordinates: below, right of and
    # left of the facet stack.
    fig.add_annotation(
        dict(
            x=0.5,
            y=-0.05,  # adjust as needed
            showarrow=False,
            text="Chunks Unfolded (%)",
            xref="paper",
            yref="paper",
            font=dict(size=24),
            xanchor="center",
            yanchor="top"
        )
    )
    fig.add_annotation(
        dict(
            x=1.08,
            y=0.62,  # adjust as needed
            showarrow=False,
            text="Geometry Size",
            xref="paper",
            yref="paper",
            textangle=-90,
            font=dict(size=28),
            xanchor="center",
            yanchor="top"
        )
    )
    fig.add_annotation(
        dict(
            x=-0.148,  # adjust as needed
            y=0.5,
            showarrow=False,
            text="Intersection Context",
            textangle=-90,
            xref="paper",
            yref="paper",
            font=dict(size=28),
            xanchor="center",
            yanchor="middle",
        )
    )

    fig.show()
    fig.write_image("fig2.svg")


## Results

#### Total time Analysis

#### (%) Unfolding of chunks

In [78]:
# Draw the chunk-unfolding figure; the function also writes it to fig2.svg.
plot_chunk_unfolded_frac(chunk_unfolded_df, is_predicate=False, is_baseline=False)

In [25]:
def get_max_min_relative_speed(df):
    """Find, per (context, sizes), the rows with the lowest and highest time ratio.

    The ratio is computed per (dataset_idx, dataset) pair as the
    ``total_time`` of the first non-baseline row divided by the ``total_time``
    of the first baseline row, considering only rows with
    ``predicate == False``. It is attached to the non-baseline rows as the
    column ``total_time_ratio``; pairs that lack either side get NaN.

    Args:
        df: DataFrame with the columns ``dataset_idx``, ``dataset``,
            ``baseline``, ``predicate``, ``total_time``, ``context`` and
            ``sizes``. It is not modified.

    Returns:
        A tuple ``(mins, maxes)`` of DataFrames holding, for every
        (context, sizes) group, the non-baseline row with the smallest and
        the largest ``total_time_ratio`` respectively.
    """
    pair_keys = ["dataset_idx", "dataset"]

    # Filter with the argument's own column (the global stats_df was used
    # before, which only worked when df happened to be that very frame).
    df = df[df['predicate'] == False]

    baseline_rows = df[df['baseline'] == True]
    # A ratio column left over from an earlier run would clash in the join.
    other_rows = df[df['baseline'] == False].drop(columns='total_time_ratio', errors='ignore')

    # First row of each pair on either side, indexed by the pair keys.
    baseline_time = baseline_rows.groupby(pair_keys).head(1).set_index(pair_keys)['total_time']
    other_time = other_rows.groupby(pair_keys).head(1).set_index(pair_keys)['total_time']

    # Index-aligned division: a pair present on one side only yields NaN.
    ratio = other_time.div(baseline_time).rename('total_time_ratio')

    # Attach by key instead of by position, so the result does not depend on
    # the row order of df or on every pair being complete.
    other_rows = other_rows.join(ratio, on=pair_keys)

    maxes = other_rows.sort_values(by='total_time_ratio', ascending=False).groupby(['context', 'sizes']).head(1)
    mins = other_rows.sort_values(by='total_time_ratio', ascending=True).groupby(['context', 'sizes']).head(1)
    return mins, maxes

# Rows with the lowest / highest FPDE-to-baseline total-time ratio for every
# (context, sizes) combination.
mins, maxes = get_max_min_relative_speed(stats_df)
# "\L" was an invalid escape that printed a literal backslash; the header is
# meant to break onto a new line like the one for the maxima below.
print("_______________________\nLowest relative speed by context and size")
display(mins)
print("_______________________\nHighest relative speed by context and size")

display(maxes)


KeyboardInterrupt: 