In [None]:
%config InlineBackend.figure_format = "svg"
from pathlib import Path
import os

from IPython.display import display

from kerncraft.machinemodel import MachineModel
from kerncraft.kernel import KernelCode

from hpc_inspect.inspector import *
from hpc_inspect.report_helper import *

In [None]:
# Machine description (machine.yml) and kernel source (kernel.c) are both read
# relative to the current working directory.
machine = MachineModel(path_to_yaml='machine.yml')
kernel_source = Path('kernel.c').read_text()
kernel = KernelCode(kernel_source, machine)

# Print the two machine-file entries that identify the target system.
for caption, yaml_key in (('model name:', 'model name'), ('frequency:', 'clock')):
    print(caption, machine[yaml_key])

# Print whatever follows the last 'jobs/' in the working directory
# (the whole path when 'jobs/' does not occur in it).
print(os.getcwd().rpartition('jobs/')[2])

In [None]:
# Load the pickled results, then collect the distinct values of the three
# fields that the later cells use for filtering and for building tabs.
data = load_pickled_dataframe()
compilers, incore_models, cache_predictors = (
    get_unique(data, column)
    for column in ('compiler', 'incore_model', 'cache_predictor')
)

## Stencil Properties

In [None]:
# TODO present origin and property of kernel code
# NOTE(review): the value is only stored here; none of the visible cells
# displays or uses it yet.
iterations_per_cacheline = get_iterations_per_cacheline(data)

## Kernel Source Code
The C representation of the kernel, as it was passed to Kerncraft for analysis.

In [None]:
# The kernel source is taken from the job stored in the first row of `data`.
# The Code(...) object is the cell's last expression and therefore its output.
first_job = data.iloc[0].job
Code(first_job.workload.kernel.get_code(), language='C')

### In-Core Analysis
For each compiler and in-core analysis model/application, the output is presented below. The analyzed assembly is the same per compiler, but outputs may differ, as IACA analyzes bytecode and prints in Intel syntax, whereas OSACA and LLVM-MCA use the original assembly for output.
Also note that the resulting cycle counts need to be scaled according to the high-level iterations. The compiler may have unrolled or vectorized the code and thus folded multiple high-level iterations into this single assembly block.

In [None]:
# Show the in-core analysis output for the given compilers and in-core models.
# NOTE(review): presumably the helper returns a tabbed widget (going by its
# name) — confirm in hpc_inspect.
display(get_incore_analysis_tabs(data, compilers, incore_models))

## Layer Conditions
A general explanation of Layer Conditions (LC) can be found at https://rrze-hpc.github.io/layer-condition. These conditions assume inclusive least-recently-used caches, ignoring associativity effects.

Each row is one LC. If the condition holds, one may assume the number of hits and misses observed on the cache level and the evicts going out of this level. If the condition is simply "True", it is the fallback streaming case, where almost no reuse can be served out of this cache.

In [None]:
# Layer-condition analysis for the loaded data.
# NOTE(review): the call is not wrapped in display(), so the helper presumably
# renders its output itself — confirm in hpc_inspect.
display_lc_analysis(data)
# TODO present size of N for condition to be fulfilled

## Single Core Grid Scaling

### Model Prediction vs Performance
Comparing ECM and Roofline model predictions with measured performance data, in relation to the dimension size. The right-hand axis presenting "giga iterations per second" is an inverse of cycles per iteration and therefore a logarithmic scale. The x-axis is the dimension length for all dimension variables in the code (e.g., if a code has an array `a[3][N][2*M]`, a dimension length of 1024 will result in `a[3][1024][2048]`).

In [None]:
# Show the model-prediction vs. measurement comparison for this machine.
# NOTE(review): the helper's return type is not visible here; presumably a
# tabbed widget (going by its name) — confirm in hpc_inspect.
display(get_model_analysis_tabs(data, machine))
# TODO find good metric (or name) for x-axis
# TODO highlight T_comp

### Data Transfers

In [None]:
# NOTE(review): placeholder — the result of each query is neither stored nor
# displayed, so this cell currently produces no output.
for cp in cache_predictors:
    # '@cp' refers to the Python loop variable `cp` inside the query string.
    data.query('cache_predictor==@cp')

# TODO include predicted information into pandas
# TODO include measured information into pandas (inspector.py:574)

## Multi-core Thread Scaling
Scaling of OpenMP parallelized kernel, with tight placement (e.g., NUMA domains are filled up one after another).

In [None]:
# Show the thread-scaling results for this machine.
# NOTE(review): presumably the helper returns a tabbed widget (going by its
# name) — confirm in hpc_inspect.
display(get_scaling_tabs(data, machine))

## System Information

In [None]:
# Show the system information built from 'machinestate.json', a path relative
# to the current working directory.
display(get_machinestate_html('machinestate.json'))
# TODO replace by more elegant solution

# Notebook things to look at:
 * https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Styling.html
