In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.patches as patches
import sys
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
from collections import defaultdict
%run colors.ipynb

### Takes JMH output and does some mangling on it

In [None]:
def fromJMH(df):
    """Reshape a raw JMH results frame into the layout used by this notebook.

    Steps, in order:

    1. Rename the JMH columns ``Param: density``, ``Param: size`` and
       ``Score Error (99.9%)`` to ``Density``, ``Size`` and ``Error``.
    2. Split the ``Benchmark`` string with the regex ``.*_(.*)\\.(.*)`` into a
       ``Structure`` part and a ``Benchmark`` (method) part, e.g.
       ``"x.BenchmarkGet_ScalaHashMap.hittingGet"`` becomes
       ``("ScalaHashMap", "hittingGet")``. Any occurrence of the structure
       name inside the method name is stripped.
    3. Drop all rows whose structure is ``Trie1j64``.
    4. Replace internal structure and benchmark names with the display names
       used in the figures.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw JMH output; must contain a string ``Benchmark`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) that keeps the original row
        index labels, with the ``Structure`` column added and ``Benchmark``
        reduced to the method name.
    """
    structure_names = {
        'Trie1': 'IntChamp32Kotlin',
        'Trie1j': 'IntChamp32Java',
        'Trie2': 'IntHamt32Kotlin',
        'Trie2j': 'IntHamt32Java',
        'Trie2j16': 'IntHamt16Java',
        'Trie2j64': 'IntHamt64Java',
        'Trie3': 'IntImplicitKeyHamtKotlin',
        'ClojurePersistentHashMap': 'ClojureHashMap',
        'ClojurePersistentTreeMap': 'ClojureTreeMap',
        'PaguroPersistentTreeMap': 'PaguroTreeMap',
        'PaguroPersistentHashMap': 'PaguroHashMap',
        'RadixBalancedTreeRedux': 'RadixTreeRedux',
        'RadixBalancedTree': 'RadixTree',
        'ClojureRrbMap': 'ClojureVectorMap'
    }
    benchmark_names = {
        'missingGet': 'missingAccess',
        'hittingGet': 'hittingAccess',
        'hittingGetLinear': 'hittingAccessSequential',
        'iterateLinear': 'iterateSequential',
        'insertLinear': 'insertSequential',
    }

    df = df.rename(columns={
        "Param: density": "Density",
        "Param: size": "Size",
        "Score Error (99.9%)": "Error"
    })

    # Raw string: "\." is a regex escape, not a Python string escape.
    parts = df.Benchmark.str.extract(r".*_(.*)\.(.*)", expand=True)
    parts.columns = ['Structure', 'Benchmark']
    # Some method names embed the structure name; remove it so the same
    # benchmark gets the same label for every structure.
    parts['Benchmark'] = [
        method.replace(structure, '')
        for structure, method in zip(parts.Structure, parts.Benchmark)
    ]

    # `columns=` keyword: the positional axis argument was removed in pandas 2.0.
    df2 = df.drop(columns="Benchmark").join(parts)
    df2 = df2[~df2.Structure.isin(['Trie1j64'])]
    df2 = df2.replace({"Structure": structure_names, "Benchmark": benchmark_names})
    return df2

### Comparison of the two servers the tests were run on

In [None]:
# Old-server results, without the IntHamt64Java rows.
results1 = fromJMH(pd.read_csv("old server/results.csv"))
results1 = results1[~results1.Structure.isin(['IntHamt64Java'])]

# The new-server results are spread over three files; stack them into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
a = pd.read_csv("new server/results1.csv")
b = pd.read_csv("new server/results2.csv")
c = pd.read_csv("new server/results3.csv")
newresults = fromJMH(pd.concat([a, b, c]).reset_index(drop=True))

bench = "hittingAccess"

# NOTE: `a` and `b` are rebound here from data frames to structure names.
a = "PaguroTreeMap"
b = "PaguroHashMap"


def _bench_rows(frame, benchmark, structure):
    """Rows of `frame` for one benchmark/structure pair, without the JMH
    bookkeeping columns and re-indexed from 0 so two selections line up
    row by row."""
    rows = frame[(frame.Benchmark == benchmark) & (frame.Structure == structure)]
    return (
        rows
        .drop(columns=["Mode", "Threads", "Samples", "Unit", "Benchmark", "Structure"])
        .reset_index(drop=True)
    )


# Only the density-0.5 rows of the old-server results are compared.
l = results1.Density == 0.5
ptm1 = _bench_rows(results1[l], bench, a)
phm1 = _bench_rows(results1[l], bench, b)

# Tree-map score relative to hash-map score on the old server.
ptm1["Score"] = ptm1.Score / phm1.Score

# NOTE(review): the new-server rows are not filtered by density. The divisions
# below pair rows by position, so this presumably relies on both result sets
# listing the same sizes in the same order - confirm.
ptm2 = _bench_rows(newresults, bench, a)
phm2 = _bench_rows(newresults, bench, b)

# Tree-map score relative to hash-map score on the new server.
ptm2["Score"] = ptm2.Score / phm2.Score

fig, ax = plt.subplots(figsize=(15,15))
ptm1.plot(x="Size", y="Score", logx=True, ax=ax, label="old")
ptm2.plot(x="Size", y="Score", logx=True, ax=ax, label="new")

# Ratio of the two relative scores (old / new), drawn on a figure of its own.
ptm1["Score"] = ptm1.Score / ptm2.Score

ptm1.plot(x="Size", y="Score", logx=True)


# plt.plot(ptm1["Size"], ptm1["Score"]/phm1["Score"])
# plt.plot(ptm1["Size"], ptm2["Score"]/phm2["Score"])

# l = results1.Density.isnull() | (results1.Density == 0.5)
# s = "IntHamt64Java"
# a = results1[(results1.Benchmark == "hittingAccess") & results1.Structure.isin([s]) & l].drop(["Mode", "Threads", "Samples", "Unit"], 1)
# b = results2[(results2.Benchmark == "hitting") & results2.Structure.isin([s])].drop(["Mode", "Threads", "Samples", "Unit"], 1)
# plt.plot(b["Score"].reset_index(drop=True) / a["Score"].reset_index(drop=True))

### Loading the results of benchmarks. Some extra munging had to be done as they weren't done neatly in one go.

In [None]:
results = pd.read_csv("old server/results.csv")
extended = pd.read_csv("old server/resultsAdditionalAndUpdatedScala.csv")
preciseResults = fromJMH(pd.read_csv("old server/resultsPrecise.csv"))

# Every "Scala" in the extended file becomes "ScalaV2" (regex replace over the
# whole frame), which keeps those rows distinct from the Scala rows in `results`.
extended = extended.replace("Scala", "ScalaV2", regex=True)

# Rows of `extended` for structures that get appended as additional rows;
# all remaining rows are treated as updated measurements (see `up`).
is_additional = (
    extended.Benchmark.str.contains("ScalaV2") |
    extended.Benchmark.str.contains("RadixBalancedTree") |
    extended.Benchmark.str.contains("PaguroVectorMap")
)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
results = pd.concat([results, extended[is_additional]]).reset_index(drop=True)
up = extended[~is_additional]

def key(row):
    """Lookup key for one measurement: benchmark name + size + density."""
    return row['Benchmark'] + str(row['Param: size']) + str(row['Param: density'])

# key -> (score, error) for every updated measurement.
m = {
    key(row): (row['Score'], row['Score Error (99.9%)'])
    for _, row in up.iterrows()
}

def _with_updated_score(row):
    """Copy of `row` as a dict; score and error come from `m` when an updated
    measurement exists for the same key, otherwise from the row itself."""
    score, score_error = m.get(key(row), (row['Score'], row['Score Error (99.9%)']))
    return {
        'Benchmark': row['Benchmark'],
        'Mode': row['Mode'],
        'Threads': row['Threads'],
        'Samples': row['Samples'],
        'Score': score,
        'Score Error (99.9%)': score_error,
        'Unit': row['Unit'],
        'Param: density': row['Param: density'],
        'Param: size': row['Param: size'],
    }

results = results.apply(_with_updated_score, axis = 1, result_type = "expand")
results2 = fromJMH(results)
results2 = results2[~results2.Structure.isin(['IntHamt16Java', 'IntHamt64Java'])]

# Sanity check: show the non-V2 Scala rows of the hitting "Get" benchmarks.
# The mask is built on `results` because `results2` no longer has the full
# benchmark string. It is aligned to `results2`'s index explicitly, since
# `results2` has lost rows and pandas would otherwise reindex it with a warning.
old_scala_hitting_get = (
    results.Benchmark.str.contains("Scala")
    & ~results.Benchmark.str.contains("V2")
    & results.Benchmark.str.contains("BenchmarkGet_")
    & results.Benchmark.str.contains("hitting")
)
results2[old_scala_hitting_get.loc[results2.index]].head(100)

### Variables used later on

In [None]:
# Alphabetically sorted array of every benchmark name present in the results.
benchmarks = np.sort(results2["Benchmark"].unique())
benchmarks

In [None]:
# Fixed plotting order: the position in this list is the panel index used by
# `visualise`, which lays the benchmarks out row by row on a 3x3 grid.
benchmarks = [
    'hittingAccess',
    'insert',
    'iterate',
    'hittingAccessSequential',
    'insertSequential',
    'iterateSequential',
    'missingAccess',
]

In [None]:
# Alphabetically sorted array of every data-structure name present in the results.
structures = np.sort(results2["Structure"].unique())
structures

### List data structures ordered by how line-like their results are on a log-log scale.

In [None]:
# For every (benchmark, structure) pair, fit a straight line to log(Score)
# against log(Size) and record the mean squared error of that fit.
# Records are collected in a list and turned into a frame once:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
line_errors = []
for benchmark in benchmarks:
    iterate = results2[results2.Benchmark == benchmark].drop(columns="Benchmark")
    if "Sequential" not in benchmark:
        iterate = iterate[iterate.Density == 0.5]
    iterate = iterate.drop(columns="Density")
    # Errors of the "Access" benchmarks are counted double.
    weight = 2 if "Access" in benchmark else 1
    for name, group in iterate.groupby("Structure"):
        x = np.log(group.Size.values.reshape(-1, 1))
        y = np.log(group.Score.values.reshape(-1, 1))
        regr = linear_model.LinearRegression()
        regr.fit(x, y)
        # No held-out test set on purpose: the goal is not to generalise but
        # to measure how "line like" the log-scaled data itself is.
        pred = regr.predict(x)
        line_errors.append({
            "Benchmark": benchmark,
            "Structure": name,
            "Line Error": weight * mean_squared_error(y, pred),
        })
df = pd.DataFrame(line_errors, columns=["Benchmark", "Structure", "Line Error"])

# Combine the per-benchmark errors of each structure into one number
# (Euclidean norm of its errors) and list the structures best-first.
df2 = pd.DataFrame(
    [
        {"Structure": name, "Error": np.sqrt((group["Line Error"] ** 2).sum())}
        for name, group in df.groupby("Structure")
    ],
    columns=["Structure", "Error"],
)
df2.sort_values("Error")

### The main visualisation function

In [None]:
def visualise(results3, normalizeTo = False, error = False, filename = "results", render = True, logx = True, logy = None, densities = [0.5], stats = False):
    """Plot one panel per entry of the global ``benchmarks`` list and save the figure as a PDF.

    The panels are laid out row by row on a 3x3 grid; the last two grid cells
    are removed and a shared legend is placed where cell 7 was. Each structure
    in ``results3`` becomes one line (Score over Size) per panel. The figure
    is written to ``./graphs/{filename}.pdf``.

    Relies on names defined outside this function: the global ``benchmarks``
    list (expected to hold 7 names, since the y-limit grouping below addresses
    panels 0..6) and ``get_style(name)``, which supplies the per-structure
    plot keyword arguments (presumably provided by ``colors.ipynb``).

    Parameters
    ----------
    results3 : pandas.DataFrame
        Frame with ``Benchmark``, ``Structure``, ``Density``, ``Size``,
        ``Score`` and ``Error`` columns.
    normalizeTo : str or False
        Name of the baseline structure. When given, scores are plotted as the
        percentage difference to the baseline's score at density 0.5;
        when ``False`` raw scores (ops/s) are plotted.
    error : bool
        Shade the band ``Score +/- Error`` around every line.
    filename : str
        Base name of the PDF written to ``./graphs``.
    render : bool
        Show the figure when true; otherwise close it after saving.
    logx : bool
        Logarithmic x axis.
    logy : bool or None
        Logarithmic y axis. ``None`` means: logarithmic only when not
        normalising.
    densities : list of float
        Densities plotted for benchmarks without "Sequential" in their name.
        Sequential benchmarks are plotted once, without density filtering.
    stats : bool
        When normalising, draw horizontal lines at the size-weighted mean and
        mean +/- one standard deviation of every non-baseline structure.
    """
    # Kept from the original: globally silences pandas' chained-assignment warning.
    pd.options.mode.chained_assignment = None
    print(f"visualising {filename}")
    normalized = normalizeTo != False
    if logy is None:
        logy = not normalized
    ylabel = "savings (in %)" if normalized else "ops/s"

    fig, ax = plt.subplots(3, 3, sharex=False, sharey=False, figsize=(15,15))
    axs = ax.flatten()
    # Per-panel score range, used below to give related panels the same y-limits.
    maxs = [0] * 7
    mins = [sys.maxsize] * 7
    for n, benchmark in enumerate(benchmarks):
        print(benchmark)
        ax = axs[n]
        has_density = "Sequential" not in benchmark
        ds = densities if has_density else [0.5]
        iterate = results3[results3.Benchmark == benchmark].drop(columns="Benchmark")
        for density in ds:
            if normalized:
                # Baseline scores: the reference structure, always taken at
                # density 0.5 for benchmarks that have a density.
                baseline_rows = iterate.Structure == normalizeTo
                if has_density:
                    baseline_rows = baseline_rows & (iterate.Density == 0.5)
                d = iterate[baseline_rows].Score
            iterate2 = iterate
            if has_density:
                iterate2 = iterate2[iterate2.Density == density]
            iterate2 = iterate2.drop(columns="Density")

            for name, group in iterate2.groupby("Structure"):
                # Work on a private copy so the assignments below never touch results3.
                group = group.copy()
                if normalized:
                    # Pair baseline and group rows by position; this raises
                    # ValueError if the two do not have the same number of rows.
                    baseline = pd.Series(d.values, index=group.index)
                    group["Score"] = group.Score.div(baseline, axis=0)
                    group["Error"] = group.Error.div(baseline, axis=0)
                    if name != normalizeTo and stats:
                        # If we calculated normal mean and variance they would correspond to log scaled ones.
                        # So instead we calculate weighted ones based on the size:
                        # each point is weighted by its distance to the previous size.
                        rolld = np.roll(group.Size, 1)
                        rolld[0] = 0
                        weights = group.Size - rolld
                        ssum = np.sum(weights)
                        mean = np.sum(weights * group.Score) / ssum
                        var = np.sum(weights * (group.Score - mean) ** 2) / ssum
                        ax.axhline(mean * 100 - 100)
                        ax.axhline((mean + np.sqrt(var)) * 100 - 100, ls='--')
                        ax.axhline((mean - np.sqrt(var)) * 100 - 100, ls='--')
                    # Ratio -> percentage difference to the baseline.
                    group["Score"] = group.Score * 100 - 100
                    # NOTE(review): the error is scaled by 50, not 100 - presumably
                    # to draw half of the reported error; confirm the intent.
                    group["Error"] = group.Error * 50
                maxs[n] = max(maxs[n], np.max(group["Score"]))
                mins[n] = min(mins[n], np.min(group["Score"]))
                p = group.plot(
                    x='Size',
                    y='Score',
                    logx=logx,
                    logy=logy,
                    ax=ax,
                    # Fixed: the suffix is the density being plotted, not the
                    # baseline Series `d` (undefined when not normalising).
                    label=name + (str(density) if len(ds) > 1 else ""),
                    title=benchmark,
                    zorder=1,
                    **get_style(name)
                )
                p.set(xlabel="size", ylabel=ylabel)
                style = get_style(name).copy()
                style['lw'] = None
                style['linestyle'] = '-'
                if error:
                    p.fill_between(group.Size, group.Score-group.Error, group.Score+group.Error, alpha=0.4, zorder=0, **style)
                if normalized:
                    p.yaxis.set_major_formatter(mticker.ScalarFormatter())
                    p.yaxis.get_major_formatter().set_scientific(False)
                # TODO: Include indicator for the point where a data structure overflows different caches
        # The per-panel legend is replaced by one shared legend below. A panel
        # with no data has no legend, so guard against None.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

    # Panels in the same grid column share y-limits (padded by 5% of the range).
    padper = 0.05
    for column in ([0, 3, 6], [1, 4], [2, 5]):
        mx = max(maxs[i] for i in column)
        mn = min(mins[i] for i in column)
        pad = padper * (mx - mn)
        for i in column:
            axs[i].set_ylim(mn - pad, mx + pad)

    plt.tight_layout()
    # Free the two unused grid cells and put the shared legend where cell 7 was.
    pos = axs[7].get_position()
    fig.delaxes(axs[7])
    fig.delaxes(axs[8])
    # `ax` is the last panel drawn; its handles stand in for all panels.
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, bbox_to_anchor=pos, loc='upper left')
    plt.savefig(f'./graphs/{filename}.pdf', format='pdf')
    if render:
        plt.show()
    else:
        # Close instead of clear: plt.clf() keeps the figure open, so figures
        # pile up when visualise is called many times in a row.
        plt.close(fig)

### Visualisations to compare between different precisions of size parameter.

In [None]:
# Two figures with identical settings, both relative to RadixTree: one from the
# precise-size run, one from the regular run restricted to the two radix trees.
# (Pass logx=False to look at a linear size axis instead.)
for radix_frame, radix_filename in [
    (preciseResults, "results_radix_precise"),
    (results2[results2.Structure.isin(["RadixTree", "RadixTreeRedux"])], "results_radix"),
]:
    visualise(
        radix_frame,
        normalizeTo="RadixTree",
        filename=radix_filename,
        error=True,
        stats=True,
        render=False,
    )

### Visualisations to demonstrate effects of log scale

In [None]:
# Same data and settings twice; only the x-axis scale (and the file name) differs.
for log_scaled, demo_filename in [
    (True, "results_scala_int_tree"),
    (False, "results_scala_int_tree_nonlog"),
]:
    visualise(
        results2[results2.Structure.isin(["ScalaV2IntMap", "ScalaV2TreeMap"])],
        normalizeTo="ScalaV2TreeMap",
        error=False,
        render=False,
        stats=True,
        logx=log_scaled,
        filename=demo_filename,
    )

### Playground for trying different visualisations

In [None]:
visualise(
    # Row selections tried so far (swap one in for the active mask below):
    #   :
    #   results2.Structure.isin(["RadixTree", "RadixTreeRedux"])
    #   results2.Structure.isin(["ScalaHashMap"])#, "ScalaV2HashMap"])
    #   results2.Structure.isin(["ScalaHashMap", "ScalaV2HashMap", "IntHamt32Java", "IntChamp32Java"])
    #   results2.Structure.str.contains("ScalaV2") == False
    #   results2.Structure.str.contains("Scala") == True
    #   results2.Structure.isin(["ScalaHashMap", "ScalaV2HashMap", "PaguroHashMap"])
    #   results2.Structure.isin(["ScalaV2RrbMap", 'PaguroVectorMap', "ScalaHashMap", "ScalaV2HashMap", "ScalaV2TreeMap", "PaguroHashMap"])
    #   results2.Structure.isin(["ScalaV2IntMap", "ScalaV2TreeMap"])
    #   results2.Structure.isin(['IntHamt32Java', 'IntHamt32Kotlin'])
    results2[~results2.Structure.isin(["ArrayMap", "SdkMap"])],
    # Baselines tried so far: "RadixTreeRedux", "IntHamt32Java", "ScalaV2TreeMap",
    # "ScalaIntMap", "RadixTree"
    normalizeTo="ScalaHashMap",
    error=False,
    render=False,
    # Further options to experiment with:
    #   stats=True, logx=False, densities=[0.25, 0.5, 0.75]
)

### The main routine for generating visualisations

In [None]:
# One figure per entry: (source frame, output file name, baseline structure, row mask).
# NOTE: the loop variable `results` rebinds the module-level `results` frame.
pd.options.mode.chained_assignment = None
for results, filename, normalizeTo, res in (
    (
        newresults, "new_server_results_own_hamt", "IntHamt32Java",
        newresults.Structure.isin(['IntHamt16Java', 'IntHamt32Java', 'IntHamt64Java']),
    ),
    (
        results2, "results_own_lang", "IntHamt32Java",
        results2.Structure.isin(['IntHamt32Java', 'IntHamt32Kotlin', 'IntChamp32Kotlin','IntChamp32Java']),
    ),
    (
        results2, "results_own", "IntHamt32Java",
        results2.Structure.isin(['IntHamt32Java', 'IntImplicitKeyHamtKotlin', 'RadixTree', 'RadixTreeRedux']),
    ),
    (
        results2, "results_generic", "ScalaHashMap",
        results2.Structure.isin([
            "ClojureHashMap", "ClojureTreeMap", "ScalaHashMap", "ScalaV2TreeMap",
            "ScalaV2HashMap", "PaguroHashMap", "PaguroTreeMap",
        ]),
    ),
    (
        results2, "results_scala", "ScalaHashMap",
        results2.Structure.str.contains("Scala") == True,
    ),
    (
        results2, "results_lib_vectors", "ScalaV2RrbMap",
        results2.Structure.isin(["ScalaV2RrbMap", "PaguroRrbMap", "PaguroVectorMap", "ClojureVectorMap"]),
    ),
    (
        results2, "results_specialized", "RadixTreeRedux",
        results2.Structure.isin([
            "ScalaV2RrbMap", 'PaguroVectorMap', "ScalaV2IntMap", 'IntHamt32Java', 'RadixTreeRedux',
        ]),
    ),
    (
        results2, "results_best", "ScalaV2TreeMap",
        results2.Structure.isin(["ScalaV2RrbMap", 'PaguroVectorMap', "ScalaV2TreeMap", "ScalaHashMap"]),
    ),
    (
        results2, "results_perspective", "ScalaV2TreeMap",
        results2.Structure.isin([
            "ScalaV2RrbMap", "PaguroVectorMap", "ScalaHashMap", "ScalaV2HashMap", "ScalaV2TreeMap",
            "PaguroHashMap", "ArrayMap", "SdkMap",
        ]),
    ),
):
    # Pass error=True as well to shade the error band.
    visualise(results[res], normalizeTo=normalizeTo, filename=filename, render=False, logx=True)