In [None]:
%%html
<script>
  // Visibility state of the notebook's code input areas. Declared explicitly
  // so it is not created as an implicit global on first assignment.
  var code_shown = false;

  // Show or hide all code input areas and update the button caption to name
  // the action the next click will perform.
  function code_toggle() {
    if (code_shown) {
      // The duration must be a number: jQuery only accepts numbers or the
      // keywords 'fast'/'slow' and falls back to its default speed otherwise.
      $('div.input').hide(500);
      $('#toggleButton').val('Show Code');
    } else {
      $('div.input').show(500);
      $('#toggleButton').val('Hide Code');
    }
    code_shown = !code_shown;
  }

  // Start with the code hidden as soon as the page is ready.
  $(document).ready(function () {
    code_shown = false;
    $('div.input').hide();
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>

In [None]:
%config InlineBackend.figure_format = "svg"
from pathlib import Path
import os

from IPython.display import display
import numpy as np

from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode

from hpc_inspect.inspector import *
from hpc_inspect.report_helper import *

In [None]:
# Load the machine description and bind the kernel source to it.
machine = MachineModel(path_to_yaml='machine.yml')
kernel = KernelCode(Path('kernel.c').read_text(), machine)

# Identify the machine this report was generated for.
print('model name:', machine['model name'])
print('frequency:', machine['clock'])
# Working directory relative to the last "jobs/" path component
# (the full path is printed if there is no such component).
print(os.getcwd().rpartition('jobs/')[2])

In [None]:
# All results of this job; the cells below only query this frame.
data = load_pickled_dataframe()

# Distinct values of the columns that span the report's tab/subplot grids.
compilers, incore_models, cache_predictors = (
    get_unique(data, column)
    for column in ('compiler', 'incore_model', 'cache_predictor'))

## Stencil Properties

In [None]:
# TODO present origin and property of kernel code

# Later cells use this value to convert between cycles per cache line of work
# and iterations per second.
iterations_per_cacheline = get_iterations_per_cacheline(data)
print("Iterations per cacheline (unrolling and SIMD considered): "
      + str(iterations_per_cacheline))

## Kernel Source Code
The C representation of the kernel, as it was passed to Kerncraft for analysis.

In [None]:
# Render the kernel source attached to the first record of the results.
Code(
    data.iloc[0]['job'].workload.kernel.get_code(),
    language='C',
)

### In-Core Analysis
For each compiler and in-core analysis model/application, the output is presented below. The analyzed assembly is the same per compiler, but outputs may differ, as IACA analyzes bytecode and prints in Intel syntax, whereas OSACA and LLVM-MCA use the original assembly for output.
Also note that the resulting cycle counts need to be scaled according to the high-level iterations. The compiler may have unrolled or vectorized the code and thus folded multiple high-level iterations into this single assembly block.

In [None]:
# Two-level tab view: one outer tab per compiler, one inner tab per in-core
# model, each showing the raw output of that model for that compiler.
cc_tab_children = []
for cc in compilers:
    icm_tab_children = []
    for icm in incore_models:
        outputs = data.query(
            "compiler == @cc and incore_model == @icm")['in-core model output'].dropna()
        # Only the first available output is shown; missing entries are
        # skipped so that html.escape() always receives a string.
        model_output = str(outputs.iloc[0]) if not outputs.empty else ''
        icm_tab_children.append(
            widgets.HTML(value='<pre style="line-height: 1; white-space: pre !important;">{}</pre>'.format(html.escape(model_output))))
    icm_tab = widgets.Tab(children=icm_tab_children)
    # Titles are assigned after the children: newer ipywidgets releases size
    # the title list by the number of children and reject indices beyond it.
    for j, icm in enumerate(incore_models):
        icm_tab.set_title(j, icm)
    cc_tab_children.append(icm_tab)
cc_tab = widgets.Tab(children=cc_tab_children)
for i, cc in enumerate(compilers):
    cc_tab.set_title(i, cc)
display(cc_tab)

## Layer Conditions
A general explanation of Layer Conditions (LC) can be found at https://rrze-hpc.github.io/layer-condition. These conditions assume inclusive least-recently-used caches, ignoring associativity effects.

Each row is one LC. If the condition holds, one may assume the listed number of hits and misses on that cache level, as well as the listed number of evicts going out of it. If the condition is simply "True", it is the fallback streaming case, where almost no reuse can be served out of this cache.

In [None]:
# TODO present size of N for condition to be fulfilled

# Collect the layer conditions of every cache level into one table. They are
# stored in columns named "<cache> LCs"; the first non-missing entry of such a
# column is a list of dicts, one per condition.
lc_data = []
for col in data.columns:
    if not col.endswith(' LCs'):
        continue
    cache = col.split(' ')[0]
    lcs_per_row = data[col].dropna()
    if lcs_per_row.empty:
        # no layer conditions recorded for this cache level
        continue
    # Positional access: after dropna() the label 0 may no longer exist.
    for lc in lcs_per_row.iloc[0]:
        # Copy instead of mutating the dict that is stored inside `data`.
        lc_data.append(dict(lc, cache=cache))
lc_df = pd.DataFrame(lc_data)
if not lc_df.empty:
    # Rows are addressed by cache level and the condition's "tail" entry.
    idx = pd.MultiIndex.from_frame(lc_df[['cache', 'tail']])
    lc_df = pd.DataFrame(lc_data, columns=['condition', 'hits', 'misses', 'evicts'], index=idx)
    display(lc_df)
    display(HTML("<p>hits, misses and evicts are given in number of elements</p>"))

## Single Core Grid Scaling

### Model Prediction vs Performance
Comparing ECM and Roofline model predictions with measured performance data, in relation to the dimension size. The right-hand axis shows iterations per second (scaled to giga, mega or kilo as appropriate); it is the inverse of cycles per iteration, so its scale is reciprocal rather than linear. The x-axis is the dimension length for all dimension variables in the code (e.g., if a code has an array `a[3][N][2*M]`, a dimension length of 1024 will result in `a[3][1024][2048]`).

In [None]:
# TODO find good metric (or name) for x-axis
# TODO highlight T_comp

def _ecm_transfer_names(overlapping):
    """Return the ECM transfer names whose overlap flag matches `overlapping`.

    Each transfer is named "T_<inner><outer>" after the two adjacent levels of
    the machine model's memory hierarchy, with "Reg" as the level in front of
    the first one. The flag is read from the outer level's
    'transfers overlap' entry.
    """
    hierarchy = machine['memory hierarchy']
    inner_levels = [{'level': 'Reg'}] + hierarchy[:-1]
    return ['T_{}{}'.format(inner['level'], outer['level'])
            for inner, outer in zip(inner_levels, hierarchy)
            if bool(outer['transfers overlap']) == overlapping]


# Extract overlapping and non-overlapping data-transfer components from
# machine model definition
overlapping_ecm_transfers = _ecm_transfer_names(True)
nonoverlapping_ecm_transfers = _ecm_transfer_names(False)
plt.ioff()  # figures are embedded as SVG below instead of being shown directly


def _cycles_to_iterations_per_second(cycles):
    """Convert cycles per cache line of work into iterations per second."""
    return divide(float(machine['clock']), cycles) * iterations_per_cacheline


# One tab per cache predictor, each holding a grid of subplots with one row
# per compiler and one column per in-core model.
cp_tab_children = []
for cp in cache_predictors:
    data_defines = data.sort_values(by=['define'])
    if not (compilers and incore_models):
        # Without compilers or in-core models there is no subplot grid to
        # draw. Keep an empty page so every cache predictor still has a tab.
        cp_tab_children.append(widgets.HTML(value=''))
        continue
    fig, axs = plt.subplots(len(compilers), len(incore_models),
                            squeeze=False,
                            figsize=(4*len(incore_models),3*len(compilers)),
                            sharey=True,
                            sharex=True)
    fig.subplots_adjust(hspace=0.3)

    # Shared upper y-limit: the largest of the mean Roofline and mean ECM
    # prediction over the top 10% of dimension lengths (single core, current
    # cache predictor) and the maximum single-core measurement.
    ylimit = np.nanmax([
        data_defines.query(
            'pmodel=="RooflineIACA" and cores==1 and define > define.max()*0.9 and '
            'cache_predictor == @cp')['performance [cy/CL]'].mean(skipna=True),
        data_defines.query(
            'pmodel=="ECM" and cores==1 and define > define.max()*0.9 and '
            'cache_predictor == @cp')['performance [cy/CL]'].mean(skipna=True),
        data_defines.query(
            'pmodel=="Benchmark" and cores==1')['performance [cy/CL]'].max(skipna=True)])

    for i_cc, cc in enumerate(compilers):
        for i_icm, icm in enumerate(incore_models):
            ax = axs[i_cc, i_icm]
            ax.set_title("{} {}".format(cc, icm))
            ax.set_xscale("log")
            if np.isfinite(ylimit):
                # NaN (no matching data at all) is not a valid axis limit.
                ax.set_ylim(0, ylimit*1.05)
            ax.xaxis.set_tick_params(labelbottom=True)
            #ax.yaxis.set_tick_params(labelleft=True)
            ax.grid()
            ax.set_axisbelow(True)  # places gridlines behind everything else

            # ECM: non-overlapping transfers are stacked, T_comp and the
            # overlapping transfers are drawn as individual lines.
            ecm_data = data_defines.query(
                'pmodel=="ECM" and cores==1 and incore_model == @icm and '
                'compiler == @cc and cache_predictor == @cp')
            ax.stackplot(
                ecm_data['define'],
                *[ecm_data[t] for t in nonoverlapping_ecm_transfers],
                labels=list(nonoverlapping_ecm_transfers))
            ax.plot(ecm_data['define'], ecm_data.T_comp, label='T_comp')
            for t in overlapping_ecm_transfers:
                ax.plot(ecm_data['define'], ecm_data[t], label=t)

            # Benchmark
            bench_data = data_defines.query('pmodel=="Benchmark" and cores==1 and compiler == @cc')
            ax.plot(bench_data.define, bench_data['performance [cy/CL]'], '+', label='Measured')

            # RooflineIACA
            roof_data = data_defines.query('pmodel=="RooflineIACA" and incore_model == @icm and '
                                           'compiler == @cc and cache_predictor == @cp and cores==1')
            ax.plot(roof_data.define, roof_data['performance [cy/CL]'], label='RL pred.')

            ax.set_ylim(0)
            if i_icm == len(incore_models) - 1:
                # Secondary axis on the right-most column: same tick positions
                # as the left axis, labelled in iterations per second.
                ax_right = ax.twinx()
                yticks = ax.get_yticks().tolist()
                ax_right.set_ylim(ax.get_ylim())
                ax_right.yaxis.set_major_locator(mticker.FixedLocator(yticks))
                ax.yaxis.set_major_locator(mticker.FixedLocator(yticks))
                with np.errstate(divide='ignore'):
                    yticks_its = [_cycles_to_iterations_per_second(t) for t in yticks]
                # Choose the unit prefix from the largest finite tick value;
                # default=0 covers the case that no tick is finite.
                max_its = max((its for its in yticks_its if its < float('inf')), default=0)
                if max_its > 5*1e8:
                    its_factor, its_factor_name = 1e9, "giga"
                elif max_its > 5*1e5:
                    its_factor, its_factor_name = 1e6, "mega"
                elif max_its > 5*1e2:
                    its_factor, its_factor_name = 1e3, "kilo"
                else:
                    its_factor, its_factor_name = 1, ""
                ax_right.set_yticklabels(["{:.2f}".format(t/its_factor) for t in yticks_its])
                # strip() avoids a leading blank when no prefix applies
                ax_right.set_ylabel("{} iterations per second".format(its_factor_name).strip())
            if i_icm == 0 and i_cc == 0:
                ax.legend(bbox_to_anchor=(0.5, 0.0), ncol=len(incore_models)*3,
                          loc='lower center', bbox_transform=fig.transFigure)
            if i_icm == 0:
                ax.set_ylabel("cycle per {} iterations".format(iterations_per_cacheline))
            if i_cc == len(compilers) - 1:
                ax.set_xlabel("dimension length")
    if len(incore_models) == 1:  # only one column
        fig.subplots_adjust(right=0.85)  # otherwise twinx will be partially hidden
    svg_buffer = io.BytesIO()
    fig.savefig(svg_buffer, format="svg")
    plt.close(fig)

    # The HTML widget holds text; decode explicitly instead of relying on an
    # implicit ASCII-only conversion of the bytes.
    cp_tab_children.append(widgets.HTML(value=svg_buffer.getvalue().decode('utf-8')))
cp_tab = widgets.Tab(children=cp_tab_children)
# Titles are assigned after the children: newer ipywidgets releases size the
# title list by the number of children and reject indices beyond it.
for i, cp in enumerate(cache_predictors):
    cp_tab.set_title(i, cp)
plt.ion()
display(cp_tab)

### Data Transfers

In [None]:
# Placeholder for the data-transfer section: the per-predictor selection below
# is computed but its result is discarded, so this cell displays nothing yet.
for cp in cache_predictors:
    data.query('cache_predictor==@cp')

# TODO include predicted information into pandas
# TODO include measured information into pandas (inspector.py:574)

## Multi-core Thread Scaling
Scaling of OpenMP parallelized kernel, with tight placement (e.g., NUMA domains are filled up one after another).

In [None]:
#display(get_scaling_tabs(data, machine))

plt.ioff()  # figures are embedded as SVG below instead of being shown directly


def _iterations_per_second_to_cycles(its_per_second):
    """Convert iterations per second into cycles per cache line of work."""
    return iterations_per_cacheline*divide(float(machine['clock']), its_per_second)


# One tab per cache predictor, each holding a grid of subplots with one row
# per compiler and one column per in-core model.
cp_tab_children = []
for cp in cache_predictors:
    # Thread scaling is evaluated at the largest dimension length only.
    max_define = data.define.max()
    if not (compilers and incore_models):
        # Without compilers or in-core models there is no subplot grid to
        # draw. Keep an empty page so every cache predictor still has a tab.
        cp_tab_children.append(widgets.HTML(value=''))
        continue
    fig, axs = plt.subplots(len(compilers), len(incore_models), squeeze=False,
                            figsize=(4*len(incore_models),3*len(compilers)),
                            sharey=True,
                            sharex=True)
    fig.subplots_adjust(hspace=0.3)
    # The x-axis always spans the full range of core counts in the data.
    min_cores, max_cores = int(data.cores.min()), int(data.cores.max())
    for i_cc, cc in enumerate(compilers):
        for i_icm, icm in enumerate(incore_models):
            ax = axs[i_cc, i_icm]
            ax.set_title("{} {}".format(cc, icm))
            ax.xaxis.set_tick_params(labelbottom=True)
            ax.grid()

            bench_data = data.query(
                'pmodel=="Benchmark" and define==@max_define and compiler == @cc'
            ).sort_values(by=['cores'])
            ax.plot(bench_data.cores, bench_data['performance [It/s]']/1e9, label='Measured')
            ecm_data = data.query(
                'pmodel=="ECM" and define==@max_define and incore_model==@icm and '
                'compiler == @cc and cache_predictor == @cp'
            ).sort_values(by=['cores'])
            ax.plot(ecm_data.cores, ecm_data['performance [It/s]']/1e9, label='ECM pred.')
            roof_data = data.query(
                'pmodel=="RooflineIACA" and define==@max_define and incore_model==@icm and '
                'compiler == @cc and cache_predictor == @cp'
            ).sort_values(by=['cores'])
            ax.plot(roof_data.cores, roof_data['performance [It/s]']/1e9, label='RL pred.')

            ax.set_ylim(0)
            ax.set_axisbelow(True)  # places gridlines behind everything else
            if i_icm != 0:
                ax.yaxis.set_tick_params(labelleft=False)
            if i_icm == len(incore_models) - 1:
                # Secondary axis on the right-most column: same tick positions
                # as the left axis, labelled in cycles.
                ax_right = ax.twinx()
                ax_right.set_ylim(ax.get_ylim())
                yticks = ax.get_yticks().tolist()
                ax.yaxis.set_major_locator(mticker.FixedLocator(yticks))
                ax_right.yaxis.set_major_locator(mticker.FixedLocator(yticks))
                with np.errstate(divide='ignore'):
                    # left-axis ticks are in giga iterations per second
                    ax_right.set_yticklabels(
                        ["{:.2f}".format(_iterations_per_second_to_cycles(t*1e9))
                         for t in yticks])
                ax_right.set_ylabel("cycle per {} iterations".format(iterations_per_cacheline))
            # use default ticks, but always start at min_cores and end at max_cores
            ax.set_xticks([t for t in ax.get_xticks()
                           if min_cores < t < max_cores] +
                          [min_cores, max_cores])
            #ax.minorticks_on()
            if i_icm == 0 and i_cc == 0:
                ax.legend(bbox_to_anchor=(0.5, 0.0), ncol=len(incore_models)*3,
                          loc='lower center', bbox_transform=fig.transFigure)
            if i_icm == 0:
                ax.set_ylabel("giga iterations per second")
            if i_cc == len(compilers) - 1:
                ax.set_xlabel("cores")
    if len(incore_models) == 1:  # only one column
        fig.subplots_adjust(right=0.85)  # otherwise twinx will be partially hidden
    svg_buffer = io.BytesIO()
    fig.savefig(svg_buffer, format="svg")
    plt.close(fig)

    # The HTML widget holds text; decode explicitly instead of relying on an
    # implicit ASCII-only conversion of the bytes.
    cp_tab_children.append(widgets.HTML(
        value=svg_buffer.getvalue().decode('utf-8')
        + "<p>with dimension length = {}</p>".format(int(max_define))))
cp_tab = widgets.Tab(children=cp_tab_children)
# Titles are assigned after the children: newer ipywidgets releases size the
# title list by the number of children and reject indices beyond it.
for i, cp in enumerate(cache_predictors):
    cp_tab.set_title(i, cp)
plt.ion()
display(cp_tab)

## System Information

In [None]:
# Best effort: render the recorded machine state, but never let a problem with
# it abort the report; the traceback is printed instead.
try:
    # JSON is UTF-8 by specification; do not depend on the locale's encoding.
    with open('machinestate.json', encoding='utf-8') as f:
        ms_data = json.load(f)
    # The file is closed before the (potentially slow) rendering starts.
    ms = machinestate.MachineState.from_dict(ms_data)
    display(HTML(ms.get_html()))
except Exception:
    import traceback
    print("Unable to load or process machinestate. Usually a version discrepancy.")
    traceback.print_exc()
# TODO replace by more elegant solution