In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import matplotlib.ticker as mticker
import itertools
import copy
import sys
import functools
from scipy import special
from sklearn import linear_model
import matplotlib.transforms as mtransforms
from sklearn.metrics import mean_squared_error
from cycler import cycler as cy
from collections import defaultdict
%run colors.ipynb

### Loading results of memory benchmarks and some munging

In [None]:
# Main benchmark results: two CSV files stacked into one frame.
# (DataFrame.append no longer exists in pandas 2.x, hence pd.concat.)
results = pd.concat(
    [
        pd.read_csv("old server/memResults.csv"),
        pd.read_csv("old server/memResultsAdditional.csv"),
    ],
    ignore_index=True,
)
# Relabel every "Scala" occurrence in these rows as "ScalaV2" so that the rows
# from memResultsOldScala.csv (added next) can keep the plain "Scala" names.
results = results.replace("Scala", "ScalaV2", regex=True)
extend = pd.read_csv("old server/memResultsOldScala.csv")
results = pd.concat(
    [results, extend[extend.name.str.contains("Scala")]],
    ignore_index=True,
)

# Human-readable names for one structure and for the four test kinds.
results = results.replace({"name": {
    'ClojureRrbMap': 'ClojureVectorMap'
}, "test": {
    "lin": "sequential",
    "lincumu": "sequentialCumulative",
    "rand": "random",
    "randcumu": "randomCumulative",
}})
# The 64 branching factor implementation was bugged, so those rows are dropped.
# NOTE(review): IntHamt16Java is dropped here as well; the reason is not recorded - confirm.
results = results[~results.name.isin(['IntChamp64Java', 'IntHamt16Java', 'IntHamt64Java'])]

# results[(results.test == "lincumu") & (results.name.str.contains("Scala") & results.name.str.contains("Rrb"))]

### Function for comparing memory usage differences

In [None]:
def cmp(testa, namea, testb, nameb):
    """Return the per-amount size difference between two benchmark series.

    Each series is the ``size`` column of the module-level ``results`` frame
    for one (test, name) pair, indexed by ``amount``. The result is the first
    series minus the second, aligned on that index.
    """
    def sizes_for(test, name):
        # Rows of one structure under one test, keyed by the amount column.
        selected = results[(results.test == test) & (results.name == name)]
        return selected.set_index("amount")["size"]

    return sizes_for(testa, namea) - sizes_for(testb, nameb)

In [None]:
# Same test, two builds: the "ScalaRrbMap" rows (loaded from memResultsOldScala.csv)
# minus the relabelled "ScalaV2RrbMap" rows.
cmp(
    testa="sequentialCumulative", namea="ScalaRrbMap",
    testb="sequentialCumulative", nameb="ScalaV2RrbMap",
)

In [None]:
# Same structure, two tests: random sizes minus sequential sizes for ScalaRrbMap.
cmp(
    testa="random", namea="ScalaRrbMap",
    testb="sequential", nameb="ScalaRrbMap",
)

### Variables used later on

In [None]:
# Test kinds in order of first appearance; visualise() uses this order to
# decide which panel each test is drawn in.
tests = results["test"].unique()
print(tests)

# Distinct collection sizes, ascending.
amounts = np.sort(results["amount"].unique())
print(amounts)

# Distinct structure names, alphabetical.
names = np.sort(results["name"].unique())
print(names)

### Function for comparing memory usage between different benchmarks

In [None]:
def compare(a, b):
    """Plot, per data structure, how memory use under test ``b`` relates to test ``a``.

    For every structure in the module-level ``names`` (minus a fixed exclusion
    list) one line is drawn showing ``size_b / size_a * 100 - 100`` against
    ``amount``, with both series taken from the module-level ``results`` frame.
    The figure is written to ``./graphs/cmp-{a}-vs-{b}.pdf`` and then closed.

    Parameters
    ----------
    a, b : str
        Values of the ``test`` column to compare (``a`` is the denominator).

    Relies on the globals ``results``, ``names`` and ``get_style``; the
    ``./graphs`` directory must already exist.
    """
    excluded = {
        "ScalaRrbMap", "ScalaTreeMap", "ScalaIntMap", "IntChamp32Kotlin", "IntHamt32Kotlin", "ArrayMap", "SdkMap"
    }
    # Lines are drawn (and listed in the legend) in alphabetical order.
    selected = sorted(n for n in names if n not in excluded)

    fig, ax = plt.subplots(figsize=(20,20))
    for name in selected:
        sizes_a = results[(results.test == a) & (results.name == name)].set_index("amount")["size"]
        sizes_b = results[(results.test == b) & (results.name == name)].set_index("amount")["size"]

        # Relative change of b against a, in percent; aligned on amount.
        change = (sizes_b / sizes_a) * 100 - 100
        change.plot(ax=ax, logx=True, logy=False, title=f"{a} vs {b}", label=name, **get_style(name))

    # NOTE(review): positive values mean test b used MORE memory than test a,
    # which reads oddly under a "savings" label - confirm the intended wording.
    ax.set(xlabel="size", ylabel="savings (in %)")
    ax.legend(loc='upper left')

    fig.savefig(f'./graphs/cmp-{a}-vs-{b}.pdf', format='pdf')
    # Close rather than clear, so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# One PDF per pair of tests: (baseline test, test compared against it).
for test_a, test_b in [
    ("sequential", "random"),
    ("sequentialCumulative", "randomCumulative"),
    ("sequential", "sequentialCumulative"),
    ("random", "randomCumulative"),
]:
    compare(test_a, test_b)

### Add cache sizes

In [None]:
# Add three constant-size pseudo-structures so that cache capacities show up
# as horizontal reference lines in every test panel.
# NOTE(review): presumably the cache sizes of the benchmark machine - confirm.
results2 = pd.concat(
    [
        results,
        pd.DataFrame([
            {'test': test, 'name': cache_name, 'amount': amount, 'size': cache_bytes}
            for cache_name, cache_bytes in [
                ('L1 cache', 32768),     # 32 KiB
                ('L2 cache', 262144),    # 256 KiB
                ('L3 cache', 39321600),  # 37.5 MiB
            ]
            for test in tests
            for amount in amounts
        ]),
    ],
    ignore_index=True,
)

### Main visualisation function

In [None]:
def visualise(results2, normalizeTo = False, filename = "memresults", render = True, logx = True):
    """Draw one panel per benchmark test, with one line per data structure.

    Parameters
    ----------
    results2 : pandas.DataFrame
        Rows with ``test``, ``name``, ``amount`` and ``size`` columns.
    normalizeTo : str, False or None
        Name of the baseline structure. When given, every line shows
        ``baseline_size / own_size * 100 - 100`` on a linear y axis; when
        False/None, raw sizes are shown on a logarithmic y axis.
    filename : str, False or None
        Stem of the PDF written to ``./graphs/``; False/None skips saving.
    render : bool
        Show the figure when true, otherwise close it without displaying.
    logx : bool
        Use a logarithmic x axis.

    Layout: a 2x3 grid in which the first two tests fill the top-left panels,
    the next two the bottom-left panels, and the right-hand column is removed
    to make room for a shared legend. Panels for tests 0 and 2 share y limits,
    as do panels for tests 1 and 3; the cache reference lines are left out when
    those limits are computed.

    Relies on the globals ``tests`` (at most four entries) and ``get_style``;
    the ``./graphs`` directory must already exist when saving.
    """
    print(f"visualising {filename}")
    # False keeps its original meaning; None is accepted as "not given" too.
    normalise = normalizeTo is not None and normalizeTo != False
    save = filename is not None and filename != False
    ignore_scale = ["L1 cache", "L2 cache", "L3 cache"]
    if normalise:
        logy = False
        ylabel = "savings (in %)"
    else:
        logy = True
        ylabel = "bytes"

    fig, grid = plt.subplots(2, 3, figsize=(15,10))
    axs = grid.flatten()
    maxs = [0, 0, 0, 0]
    mins = [sys.maxsize, sys.maxsize, sys.maxsize, sys.maxsize]
    ax = None
    for n, test in enumerate(tests):
        # Skip grid slots 2 and 5 (right-hand column): tests 2 and 3 go to slots 3 and 4.
        ax = axs[n] if n < 2 else axs[n + 1]
        print(test)
        iterate = results2[results2.test == test].drop(columns="test")

        if normalise:
            d = iterate[iterate.name == normalizeTo]["size"]
        for name, group in iterate.groupby("name"):
            if normalise:
                # The baseline is matched to this group by row position, not by
                # amount, so both are assumed to list the same amounts in the
                # same order. Rows beyond the baseline's length become NaN.
                dd = d.iloc[:len(group.index)].copy()
                dd.index = group.index[:len(dd.index)]
                group = group.assign(size=dd.div(group["size"]) * 100 - 100)
            if name not in ignore_scale:
                maxs[n] = max(maxs[n], np.max(group["size"]))
                mins[n] = min(mins[n], np.min(group["size"]))
            p = group.plot(x='amount', y='size', ax=ax, logx=logx, logy=logy, label=name, title=test, **get_style(name))
            p.set(xlabel="size", ylabel=ylabel)
        # Per-panel legends are replaced by a single figure-level legend below.
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    padper = 0.05

    def shared_ylim(first, second):
        # Common y range of two tests, widened by padper on each side.
        hi = max(maxs[first], maxs[second])
        lo = min(mins[first], mins[second])
        if logy and 0 < lo <= hi:
            # On a log axis pad by a ratio; subtracting a linear pad would give
            # a non-positive lower bound, which matplotlib discards.
            factor = (hi / lo) ** padper
            return lo / factor, hi * factor
        pad = padper * (hi - lo)
        return lo - pad, hi + pad

    # (grid slots to update, tests whose extremes define the range)
    for slots, stats in (((0, 3), (0, 2)), ((1, 4), (1, 3))):
        limits = shared_ylim(*stats)
        for i in slots:
            axs[i].set_ylim(*limits)

    plt.tight_layout()
    # Remember where the top-right panel sat; the legend is anchored there.
    pos = axs[2].get_position()
    fig.delaxes(axs[2])
    fig.delaxes(axs[5])
    if ax is not None:
        handles, labels = ax.get_legend_handles_labels()
        # Stable sort: cache reference lines first, everything else in plot order.
        entries = sorted(zip(handles, labels), key=lambda entry: "cache" not in entry[1])
        if entries:
            handles2, labels2 = zip(*entries)
            fig.legend(handles2, labels2, bbox_to_anchor=pos, loc='upper left')
    if save:
        fig.savefig(f'./graphs/{filename}.pdf', format='pdf')
    if render:
        plt.show()
    else:
        # Close rather than clear, so batch runs do not accumulate open figures.
        plt.close(fig)

### Visualising dummy lines for understanding how the graph behaves

In [None]:
# Synthetic data with a known answer: sizes of 1x, 2x and 0.5x the amount.
# Normalised to the 1x series, visualise() computes baseline / size * 100 - 100,
# so the three lines should sit at 0 %, -50 % and +100 % respectively.
df = pd.DataFrame([
    {'test': test, 'name': name, 'amount': amount, 'size': amount * factor}
    for name, factor in [
        ('ClojureTreeMap', 1),
        ('PaguroTreeMap', 2),
        ('ScalaV2TreeMap', 0.5),
    ]
    for test in tests
    for amount in amounts
])
visualise(df, normalizeTo = "ClojureTreeMap", filename = False)

### Playground for trying different visualisations

In [None]:
# Scratch cell: choose a family of structures and a baseline, then re-run.
# Selections that have been tried in place of the isin(...) line below:
#   results2.name.isin(["IntHamt32Java", "RadixTreeRedux", "ScalaV2IntMap"])
#   results2.name.isin(["ScalaRrbMap", "ScalaV2RrbMap"])
#   results2.name.isin(["PaguroVectorMap", "PaguroRrbMap"])
#   results2.name.isin(["PaguroVectorMap", "ClojureVectorMap"])
#   results2.name.isin(["RadixTree", "RadixTreeRedux"])
#   results2.name.str.contains("Scala")
#   results2.name.isin(["ClojureTreeMap", "PaguroTreeMap", "ScalaV2TreeMap"])
# Baselines that have been tried: "PaguroRrbMap", "ClojureVectorMap",
# "RadixTreeRedux", "ClojureTreeMap".
# Optional extras: filename = False (do not save), logx = False (linear x axis).
visualise(
    results2[
        results2.name.isin(["ScalaV2RrbMap", "PaguroRrbMap", "PaguroVectorMap", "ClojureVectorMap"])
        | results2.name.str.contains("cache")
    ],
    normalizeTo="ScalaV2RrbMap",
)

### The main routine for generating visualisations

In [None]:
# The cache reference lines are added to every figure.
cache_rows = results2.name.str.contains("cache")

# One entry per output PDF: (file stem, baseline structure, row mask).
figure_specs = [
    (
        "memresults_own",
        "IntHamt32Java",
        results2.name.isin([
            'IntChamp32Kotlin', 'IntChamp32Java', 'IntHamt32Kotlin', 'IntHamt32Java',
            'IntImplicitKeyHamtKotlin', 'RadixTree', 'RadixTreeRedux',
        ]),
    ),
    (
        "memresults_generic",
        "ScalaHashMap",
        results2.name.isin([
            "ClojureHashMap", "ClojureTreeMap", "ScalaHashMap", "ScalaV2TreeMap",
            "ScalaV2HashMap", "PaguroHashMap", "PaguroTreeMap",
        ]),
    ),
    (
        "memresults_scala",
        "ScalaHashMap",
        results2.name.str.contains("Scala"),
    ),
    (
        "memresults_lib_vectors",
        "ScalaV2RrbMap",
        results2.name.isin([
            "ScalaV2RrbMap", "PaguroRrbMap", "PaguroVectorMap", "ClojureVectorMap",
        ]),
    ),
    (
        "memresults_specialized",
        "RadixTreeRedux",
        results2.name.isin([
            "ScalaV2RrbMap", 'PaguroVectorMap', "ScalaV2IntMap", 'IntHamt32Java', 'RadixTreeRedux',
        ]),
    ),
    (
        "memresults_best",
        "ScalaV2TreeMap",
        results2.name.isin([
            "ScalaV2RrbMap", 'PaguroVectorMap', "ScalaV2TreeMap", "ScalaHashMap",
        ]),
    ),
    (
        "memresults_perspective",
        "ScalaV2RrbMap",
        results2.name.isin([
            "ScalaV2RrbMap", "PaguroVectorMap", "ScalaHashMap", "ScalaV2HashMap", "ScalaV2TreeMap",
            "ArrayMap", "SdkMap",
        ]),
    ),
]

# Write each figure to ./graphs/<file stem>.pdf without displaying it.
for filename, normalizeTo, res in figure_specs:
    visualise(
        results2[res | cache_rows],
        normalizeTo=normalizeTo,
        filename=filename,
        render=False,
    )