# Benchmark libSps with simulated data

This should give us a feeling for how everything scales with different data-space sizes, amounts of data, and numbers of dimensions.

## Basic imports and settings first

In [147]:
import os
os.chdir("../build_benchmark")
from sps import VERSION, make_sps_index, MemSimpleVector, CachedSimpleVector
os.chdir("../notebooks")
import random
import time
import os
import time
import pickle


from bokeh.plotting import figure
from bokeh.palettes import viridis
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, FactorRange, LabelSet
from bokeh.layouts import column, row, gridplot
output_notebook()

print("libSps version:", VERSION)

# Scale factors used to write the sizes below. Note that G is K * K, i.e. 1,000,000.
K = 1000
G = K * K

# N_QUERY: number of random queries sent to the index in one timed batch.
# REPEAT_QUERY: number of timed batches that are averaged.
N_QUERY = 10 * K
REPEAT_QUERY = 50


# Number of entries inserted into an index, and the short labels shown for
# them on the plot axes (FILL_NAMES[i] labels FILLS[i]).
FILLS = [10 * K, 1 * G]
FILL_NAMES = ["K", "G"]
# Total data-space sizes; the compute cell derives the per-dimension extent
# from these. AREA_NAMES[i] labels AREAS[i] on the plot axes.
AREAS = [10 * K, 1 * G]
AREA_NAMES = ["K", "G"]

# Benchmark grid: number of dimensions, point (False) vs. rectangle (True)
# data, and the storage name handed to make_sps_index.
DIMS = [2, 3]
RECTANGLES = [False, True]
STORAGE = ["Ram", "Cached"]

# Suffixes of the files that are deleted after each index benchmark
# (appended to the ".benchmark_index" prefix).
FILES = [".prefix_sums", ".coords", ".overlays", ".datasets"]

# Bar colors; the plotting cells index into this list.
COLOR_PALETTE = ["#0072B2", "#D55E00", "#009E73", "#E69F00", "#CC79A7", "#56B4E9", "#F0E442"]

# Value assigned to the output_backend attribute of every figure.
OUTPUT_BACKEND = "svg"

libSps version: D-.-4d5d8bc-2023-03-31-11:05:18


## Setup testing functions

In [148]:
def fill_index(n, index, dims, is_ort, area):
    """Fill `index` with `n` random entries, build it, and time both phases.

    The index is cleared first; clearing is not part of the timed region.

    Args:
        n: Number of entries to add.
        index: Object providing `clear()`, `add_point(...)` and
            `generate(verbosity=...)`.
        dims: Number of coordinates per entry.
        is_ort: If true, every entry is added with a start and an end
            position: the first two dimensions get a random interval
            (start <= end) and all further dimensions get the same value for
            start and end. If false, every entry is a single position.
        area: Exclusive upper bound for every random coordinate.

    Returns:
        A tuple `(fill_time, generate_time, dataset_id)`. Both times are
        `time.perf_counter()` differences, i.e. seconds (not milliseconds).
        `dataset_id` is whatever `index.generate` returned.
    """
    index.clear()
    fill_start = time.perf_counter()
    for _ in range(n):
        if is_ort:
            pos_s = []
            pos_e = []
            # The first two dimensions carry an interval.
            for _dim in range(2):
                x = random.randrange(area)
                y = random.randrange(area)
                pos_s.append(min(x, y))
                pos_e.append(max(x, y))
            # Remaining dimensions are points: start and end are identical.
            pos_s.extend(random.randrange(area) for _dim in range(2, dims))
            pos_e.extend(pos_s[2:])
            index.add_point(pos_s, pos_e)
        else:
            index.add_point([random.randrange(area) for _dim in range(dims)])
    fill_end = time.perf_counter()
    dataset_id = index.generate(verbosity=0)
    generate_end = time.perf_counter()
    fill_time = fill_end - fill_start
    generate_time = generate_end - fill_end
    # Times are returned in seconds.
    return fill_time, generate_time, dataset_id

def query_index(index, id, dims, genome_size, n, repeat=None):
    """Measure the query throughput of `index` with random range queries.

    For every repetition, `n` random queries are generated first (outside the
    timed region) and then passed to `index.count_multiple` in a single timed
    call. The mean duration over all repetitions is used.

    Args:
        index: Object providing `count_multiple(bins)`.
        id: Dataset id stored as the first element of every query tuple.
        dims: Number of dimensions of every query.
        genome_size: Exclusive upper bound for every random coordinate.
        n: Number of queries per timed batch.
        repeat: Number of timed batches. Defaults to the module-level
            `REPEAT_QUERY` when omitted or `None`.

    Returns:
        Queries per millisecond, as a float.

    Raises:
        ZeroDivisionError: If `repeat` is 0 (there is nothing to average).
    """
    if repeat is None:
        repeat = REPEAT_QUERY
    durations = []
    for _ in range(repeat):
        bins = []
        for _query in range(n):
            pos_s = []
            pos_e = []
            for _dim in range(dims):
                x = random.randrange(genome_size)
                y = random.randrange(genome_size)
                pos_s.append(min(x, y))
                pos_e.append(max(x, y))
            bins.append((id, pos_s, pos_e))
        # Only the batched lookup itself is timed.
        start = time.perf_counter()
        index.count_multiple(bins)
        stop = time.perf_counter()
        durations.append(stop - start)

    mean_duration = sum(durations) / len(durations)
    # n / seconds is queries per second; dividing by 1000 gives queries per ms.
    return (n / mean_duration) / 1000

def itr_order(*itr):
    """Yield the Cartesian product of the given iterables as tuples.

    The last iterable varies fastest, i.e. the order is the one of nested
    for-loops with the first argument as the outermost loop. With no
    arguments a single empty tuple is yielded.

    The arguments are read completely before the first tuple is produced, so
    one-shot iterators are handled correctly, but every argument has to be
    finite.

    Args:
        *itr: Any number of iterables.

    Yields:
        One tuple per combination, with one element from every iterable.
    """
    # Imported locally so this function does not depend on the import cell.
    from itertools import product

    yield from product(*itr)


## Compute data

In [149]:
# Benchmark results, keyed by tuples that end in the name of the measurement.
data = {}


def _int_root(value, degree):
    """Return the largest integer r with r ** degree <= value.

    A plain int(value ** (1 / degree)) truncates floating point error, e.g.
    int(1000000 ** (1 / 3)) is 99 instead of 100. The float result is
    therefore only used as a first guess and corrected with exact integer
    arithmetic.
    """
    root = round(value ** (1 / degree))
    while root ** degree > value:
        root -= 1
    while (root + 1) ** degree <= value:
        root += 1
    return root


# Run the index benchmark for every combination of dimensionality, data type
# (points / rectangles), storage, data-space size and fill level.
for dims in DIMS:
    for rectangles in RECTANGLES:
        # Rectangles are requested from make_sps_index as 2 orthotope dimensions.
        num_ort_dims = 2 if rectangles else 0
        for storage in STORAGE:
            index = make_sps_index(".benchmark_index", dims, num_ort_dims, storage)
            for area in AREAS:
                # Extent per dimension, chosen so that the extent raised to
                # the power of (dims + num_ort_dims) does not exceed `area`.
                area_size = _int_root(area, dims + num_ort_dims)
                for fill in FILLS:
                    print("dims:", dims, "rectangles:", rectangles, "storage:", storage, "area:", area, "fill:", fill)
                    key = (dims, rectangles, storage, area, fill)
                    fill_time, generate_time, idx = fill_index(fill, index, dims, rectangles, area_size)
                    data[key + ("fill_time",)] = fill_time
                    data[key + ("generate_time",)] = generate_time
                    queries_per_ms = query_index(index, idx, dims, area_size, N_QUERY)
                    data[key + ("queries_per_ms",)] = queries_per_ms
                    data[key + ("index_size",)] = index.get_size(idx) / 10**9 # in GB
                    index.clear()
            # Drop the index object before deleting the files of this benchmark run.
            del index
            for file_suff in FILES:
                if os.path.isfile(".benchmark_index" + file_suff):
                    os.remove(".benchmark_index" + file_suff)

# Baseline: time batched lookups in a plain vector holding `area` random values.
for area in AREAS:
    value_range = range(area)
    for storage in STORAGE:
        print("storage:", storage, "area:", area, "vector")
        vector_cls = MemSimpleVector if storage == "Ram" else CachedSimpleVector
        vec_d = vector_cls("benchmark_index_c", True)
        for _ in value_range:
            vec_d.add(random.choice(value_range))
        durations = []
        for _ in range(REPEAT_QUERY):
            # The query positions are drawn outside of the timed region.
            bins = [random.choice(value_range) for _ in range(N_QUERY)]
            start = time.perf_counter()
            vec_d.get_multiple(bins)
            stop = time.perf_counter()
            durations.append(stop - start)
        mean_duration = sum(durations) / len(durations)
        # N_QUERY / seconds is queries per second; / 1000 gives queries per ms.
        data[("simple_vec", area, storage, "queries_per_ms")] = (N_QUERY / mean_duration) / 1000

        del vec_d
        # Only the cached vector's ".vals" file is removed here.
        if storage == "Cached":
            os.remove("benchmark_index_c.vals")

# Store all results so that they can be loaded again later.
with open("benchmark.pickle", "wb") as out_file:
    pickle.dump(data, out_file)

dims: 2 rectangles: False storage: Ram area: 10000 fill: 10000
dims: 2 rectangles: False storage: Ram area: 10000 fill: 1000000
dims: 2 rectangles: False storage: Ram area: 1000000 fill: 10000
dims: 2 rectangles: False storage: Ram area: 1000000 fill: 1000000
dims: 2 rectangles: False storage: Cached area: 10000 fill: 10000
dims: 2 rectangles: False storage: Cached area: 10000 fill: 1000000
dims: 2 rectangles: False storage: Cached area: 1000000 fill: 10000
dims: 2 rectangles: False storage: Cached area: 1000000 fill: 1000000
dims: 2 rectangles: True storage: Ram area: 10000 fill: 10000
dims: 2 rectangles: True storage: Ram area: 10000 fill: 1000000
dims: 2 rectangles: True storage: Ram area: 1000000 fill: 10000
dims: 2 rectangles: True storage: Ram area: 1000000 fill: 1000000
dims: 2 rectangles: True storage: Cached area: 10000 fill: 10000
dims: 2 rectangles: True storage: Cached area: 10000 fill: 1000000
dims: 2 rectangles: True storage: Cached area: 1000000 fill: 10000
dims: 2 recta

## Checkpoint

In [150]:
# Load the benchmark results from "benchmark.pickle".
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open("benchmark.pickle", "rb") as checkpoint_file:
    data = pickle.loads(checkpoint_file.read())

## Plot query speed

In [151]:
# Height shared by all figures of this plot.
height = 300
# Zero-width figure with a log-scaled y range. The panels created below reuse
# its y_range (fs[0].y_range) and hide their own y axes, so this figure
# carries the y axis and its label for the whole row.
f = figure(width=0, height=height, y_range=[300, 100000], y_axis_type="log")
f.output_backend = OUTPUT_BACKEND
f.xaxis.visible = False
f.yaxis.axis_label = "[Queries / ms]"
# Draws one "x" marker at (0, 0) -- presumably only so that the figure
# contains a glyph; TODO confirm.
f.x(0,0)
f.toolbar_location = None
f.outline_line_color = None

# idx selects the bar color of the next panel; fs collects the figures of the row.
idx = 0
fs = [f]

# Parallel lists: order_names provides the labels for the categorical x axis,
# order provides the matching values used as keys into `data`.
order_names = [STORAGE, AREA_NAMES, FILL_NAMES]
order = [STORAGE, AREAS, FILLS]

# One panel per combination of dimensionality and data type (point / rectangle).
for dims in DIMS:
    for data_type in RECTANGLES:
        # Three-level categorical factors (storage, area name, fill name). They
        # are generated in the same order as `tops` below, because order_names
        # and order are parallel lists.
        cat_axis = [("RAM" if storage == "Ram" else "CACHE", area, amount) for storage, area, amount in itr_order(*order_names)]

        # Shares the y range of the first figure in fs; the own y axis is hidden.
        f = figure(x_range=FactorRange(*cat_axis), y_range=fs[0].y_range, width=125, height=height, y_axis_type="log")
        f.xgrid.grid_line_color = None
        f.toolbar_location = None
        f.xaxis.axis_label = ("Rectangle" if data_type else "Point") + " - " + str(dims) + "D"
        f.yaxis.visible = False
        # Measured queries per ms, one value per factor of cat_axis.
        tops = [data[(dims, data_type, storage, area, amount, "queries_per_ms")] for storage, area, amount in itr_order(*order)]
        # bottom is a small positive value instead of 0 -- presumably because
        # the y axis is log-scaled; the color cycles through the first four
        # palette entries.
        f.vbar(x=cat_axis, 
                top=tops,
                bottom=0.1,
                width=0.8,
                color=COLOR_PALETTE[idx % 4])
        # Rotated text label at the top of every bar showing the rounded value
        # with thousands separators.
        f.add_layout(LabelSet(x="x", y="y", text="text", y_offset=5,
                              source=ColumnDataSource(data={"x": cat_axis, 
                                                            "y": tops, 
                                                            "text": [f'{int(round(x, 0)):,}' for x in tops]}),
                              angle=90, angle_units="deg", text_baseline="middle", text_color="black",
                              text_font_size="10px"))
        f.output_backend = OUTPUT_BACKEND
        f.outline_line_color = None
        fs.append(f)
        idx += 1

# Additional, half-width panel for the plain vector baseline ("simple_vec"
# entries of `data`): one bar per storage and area. The third factor level is
# a blank string.
cat_axis = [("R" if storage == "Ram" else "C", area, " ") for storage in STORAGE for area in AREA_NAMES]
f = figure(x_range=FactorRange(*cat_axis), y_range=fs[0].y_range, width=125 // 2, height=height, y_axis_type="log")
f.xgrid.grid_line_color = None
f.toolbar_location = None
f.xaxis.axis_label = "Array"
f.yaxis.visible = False
# Same iteration order as cat_axis (storage outer, area inner), so the values
# line up with the factors.
tops = [data[("simple_vec", area, storage, "queries_per_ms")] for storage in STORAGE for area in AREAS]
f.vbar(x=cat_axis, 
        top=tops,
        bottom=0.1,
        width=1.2,
        color=COLOR_PALETTE[4])
# Rotated text label at the top of every bar showing the rounded value.
f.add_layout(LabelSet(x="x", y="y", text="text", y_offset=5,
                        source=ColumnDataSource(data={"x": cat_axis, 
                                                    "y": tops, 
                                                    "text": [f'{int(round(x, 0)):,}' for x in tops]}),
                        angle=90, angle_units="deg", text_baseline="middle", text_color="black",
                        text_font_size="10px"))
f.output_backend = OUTPUT_BACKEND
f.outline_line_color = None
fs.append(f)
idx += 1
# All figures collected in fs are shown side by side in a single row.
show(gridplot([fs]))

## Plot index size and build times

In [152]:
# Height shared by all figures of this plot.
height = 300
# Zero-width figure for the upper row (build time). The panels of that row
# reuse its y_range (fs[0].y_range).
f = figure(width=0, height=height, y_range=[1, 100000], y_axis_type="log")
f.output_backend = OUTPUT_BACKEND
f.xaxis.visible = False
f.yaxis.axis_label = "Build Time [ms]"
# Draws one "x" marker at (0, 0) -- presumably only so that the figure
# contains a glyph; TODO confirm.
f.x(0,0)
f.toolbar_location = None
f.outline_line_color = None

# idx selects the bar color; fs collects the figures of the upper row.
idx = 0
fs = [f]

# Zero-width figure for the lower row (index size). Its y range is given from
# the large to the small value (100 down to 0.01).
f2 = figure(width=0, height=height, y_range=[100, 0.01], y_axis_type="log")
f2.output_backend = OUTPUT_BACKEND
f2.xaxis.visible = False
f2.yaxis.axis_label = "Size [MB]"
f2.x(0,0)
f2.toolbar_location = None
f2.outline_line_color = None
# fs2 collects the figures of the lower row.
fs2 = [f2]

# Parallel lists: order_names provides the labels for the categorical x axis,
# order provides the matching values used as keys into `data`.
order_names = [FILL_NAMES, DIMS, STORAGE]
order = [FILLS, DIMS, STORAGE]

# One column per combination of data-space size and data type. Every column
# consists of a build-time panel (appended to fs) and an index-size panel
# (appended to fs2).
for area, area_name in zip(AREAS, AREA_NAMES):
    for data_type in RECTANGLES:
        # Three-level categorical factors (fill name, dimensionality, storage),
        # generated in the same order as the `tops` lists below.
        cat_axis = [(amount_name, str(dims) + "D", "R" if storage == "Ram" else "C") for amount_name, dims, storage in itr_order(*order_names)]

        # Upper panel: build time.
        f = figure(x_range=FactorRange(*cat_axis), y_range=fs[0].y_range, width=125, height=height, y_axis_type="log")
        f.xgrid.grid_line_color = None
        f.toolbar_location = None
        f.xaxis.axis_label = area_name + " - " + ("Rectangle" if data_type else "Point")
        # Hides all axes of this panel, the x axis included.
        f.axis.visible = False
        # generate_time is a perf_counter difference (seconds); * 1000 gives ms.
        tops = [data[(dims, data_type, storage, area, amount, "generate_time")] * 1000 for amount, dims, storage in itr_order(*order)]
        f.vbar(x=cat_axis, 
                top=tops,
                bottom=0.001,
                width=0.8,
                color=COLOR_PALETTE[idx % 4])
        # Rotated text label per bar showing the rounded value.
        f.add_layout(LabelSet(x="x", y="y", text="text", y_offset=5,
                              source=ColumnDataSource(data={"x": cat_axis, 
                                                            "y": tops, 
                                                            "text": [f'{int(round(x, 0)):,}' for x in tops]}),
                              angle=90, angle_units="deg", text_baseline="middle", text_color="black",
                              text_font_size="10px"))
        f.output_backend = OUTPUT_BACKEND
        f.outline_line_color = None
        fs.append(f)

        # Lower panel: index size, with a visible x axis.
        f = figure(x_range=FactorRange(*cat_axis), y_range=fs2[0].y_range, width=125, height=height, y_axis_type="log")
        f.xgrid.grid_line_color = None
        f.toolbar_location = None
        f.xaxis.axis_label = area_name + " - " + ("Rect" if data_type else "Point")
        f.yaxis.visible = False
        # index_size is stored in GB by the compute cell; * 1000 gives MB.
        tops = [data[(dims, data_type, storage, area, amount, "index_size")] * 1000 for amount, dims, storage in itr_order(*order)]
        f.vbar(x=cat_axis, 
                top=tops,
                bottom=0.001,
                width=0.8,
                color=COLOR_PALETTE[idx % 4])
        # Labels use a negative offset and right alignment here, and show the
        # value rounded to three decimals.
        f.add_layout(LabelSet(x="x", y="y", text="text", y_offset=-5, text_align="right",
                              source=ColumnDataSource(data={"x": cat_axis, 
                                                            "y": tops, 
                                                            "text": [f'{round(x, 3):,}' for x in tops]}),
                              angle=90, angle_units="deg", text_baseline="middle", text_color="black",
                              text_font_size="10px"))
        f.output_backend = OUTPUT_BACKEND
        f.outline_line_color = None
        fs2.append(f)

        # Both panels of a column use the same color; advance for the next column.
        idx += 1

# NOTE(review): idx is not read again in the visible source after this line.
idx += 1
# Two rows: build times on top, index sizes below.
show(gridplot([fs, fs2]))