# Analyse [`HiBench`](https://github.com/Intel-bigdata/HiBench) Measurements on [Gilgamesh](https://kb.hlrs.de/platforms/index.php/Urika_GX) (Cray URIKA GX)

## Prepare

### Get access to Mesos monitor

**Note**: To track the usage of Gilgamesh nodes, open http://127.0.0.1:5050/ in the [Gilgamesh profile](about:profiles). Authenticate with your username and password: `less /security/secrets/$USER.mesos`.

In [4]:
!ssh gilgamesch 'echo -e " login:$(whoami)\npasswd:$(cat /security/secrets/$USER.mesos)"' 2> /dev/null

Clean up old measurements

In [69]:
!ssh gilgamesch 'rm -rf report/hibench.report' #2> /dev/null

bind: Address already in use
channel_setup_fwd_listener_tcpip: cannot listen to port: 8080
Could not request local forwarding.


### Initialize setup variables

In [1]:
import os
# Local folder that the HiBench report files are copied into and later read from.
measurements_loacal_folder = os.path.join('.', 'data_x')
# Metric to plot: 'throughput' or 'elapsed_time' (selects the matching
# "<metric>_plotting_function" in the plotting cell).
plotting_metric='throughput' # 'elapsed_time'
# HTML file the plots are written to; set to None to embed the plots in the
# notebook instead.
save_plots_to_file = "hibench-gilgamesh.html" # None # if none embed in the notebook

## Get Measurements

### Data Acquisition & Wrangling: Get data from multiple reports on cluster

Copy results to local folder

In [2]:
%%bash -s "$measurements_loacal_folder"
# rm -rf ./data/*
mkdir -p $1
scp gilgamesch:~/proj/hidalgo/wp3/soft/HiBench/report/summary/hibench*.report $1

Read and wrangle the data

In [3]:
import re
import io
import pandas

def read_hibench_report(filename, scale, ncores):
    """Load one HiBench summary report into a normalised DataFrame.

    The report is parsed as a whitespace-separated table. The HiBench column
    headers are renamed to short identifiers, ``scale`` and ``ncores`` columns
    are inserted right after the first column, and the byte-based columns are
    divided by 1024**2.

    Args:
        filename: Path (or file-like object) accepted by ``pandas.read_csv``.
        scale: Value stored in the ``scale`` column of every row.
        ncores: Value stored in the ``ncores`` column of every row.

    Returns:
        pandas.DataFrame with the columns ``name``, ``scale``, ``ncores``,
        ``data_size`` (MB), ``duration``, ``throughput`` (MB/s),
        ``node_throughput`` and any other columns present in the report.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    report = pandas.read_csv(filename, sep=r"\s+")
    report = report.rename(columns={
        "Type": "name",
        "Input_data_size": "data_size",
        "Duration(s)": "duration",
        "Throughput(bytes/s)": "throughput",
        "Throughput/node": "node_throughput",
    })
    report.insert(1, 'scale', scale)
    report.insert(2, 'ncores', ncores)

    bytes_per_mb = 1024**2
    report['throughput'] = report['throughput'] / bytes_per_mb  # B/s -> MB/s
    report['data_size'] = report['data_size'] / bytes_per_mb    # B -> MB
    return report

# Report files are named "hibench-<scale>-<ncores>.report".
# Raw string so that "\." reaches the regex engine as an escaped dot.
re_filename = re.compile(r"hibench-(?P<scale>.+)-(?P<ncores>[0-9]+)\.report")

# Parse every report in the local folder; files with other names are skipped.
reports = []
for file in os.listdir(measurements_loacal_folder):
    # fullmatch: the whole file name must follow the pattern, so e.g.
    # "hibench-x-36.report.bak" is not mistaken for a report.
    match_filename = re_filename.fullmatch(file)
    if match_filename is None:
        continue
    reports.append(read_hibench_report(os.path.join(measurements_loacal_folder, file),
                                       match_filename.group('scale'),
                                       int(match_filename.group('ncores'))))

# DataFrame.append was removed in pandas 2.0; one concat over all reports
# gives the same result. `measurements` stays None when no report was found.
measurements = pandas.concat(reports, ignore_index=True) if reports else None
# measurements = measurements.groupby(['scale','name','ncores'], as_index=False).agg({'duration':'mean', 'data_size':'mean', 'throughput':'mean', 'node_throughput':'mean'})
# measurements[measurements.scale=='gigantic'].sort_values(by=['name']).head(1)

## Plot results

### Plot elapsed time and speedup (in [`bokeh`](https://docs.bokeh.org/en/latest/))

Define metrics

In [4]:
# Cores per cluster node: core counts are divided by this value to obtain
# node counts.
cluster_ppn = 36


def elapsedtime_metric(df):
    """Return the 'duration' column of *df*."""
    return df['duration']


def memory_metric(df):
    """Return the 'ram' column of *df*."""
    return df['ram']


def io_metric(df):
    """Return the sum of the 'io_in' and 'io_out' columns of *df*."""
    return df['io_in']+df['io_out']
def speedup_metric(df):
    """Semi-speedup relative to the run with the fewest cores.

    The baseline is the row with the smallest 'ncores'; its
    duration * ncores product is divided by every row's 'duration'.
    """
    baseline = df.loc[df['ncores'].idxmin()]
    baseline_core_seconds = baseline['duration']*baseline['ncores']
    return baseline_core_seconds/df['duration']
def node_speedup_metric(df):
    """Semi-speedup relative to the first run with 'ncores' == cluster_ppn.

    The baseline row's duration * ncores product is divided by every row's
    'duration'. Raises IndexError when no row has 'ncores' == cluster_ppn.
    """
    baseline = df[df['ncores']==cluster_ppn].iloc[0]
    baseline_core_seconds = baseline['duration']*baseline['ncores']
    return baseline_core_seconds/df['duration']
def efficiency_metric(df):
    """Semi-efficiency relative to the run with the fewest cores.

    The baseline is the row with the smallest 'ncores'; its
    duration * ncores product is divided by every row's 'duration' and then
    by that row's 'ncores'.
    """
    baseline = df.loc[df['ncores'].idxmin()]
    baseline_core_seconds = baseline['duration']*baseline['ncores']
    return baseline_core_seconds/df['duration']/df['ncores']
def node_efficiency_metric(df):
    """Semi-efficiency relative to the first run with 'ncores' == cluster_ppn.

    The baseline row's duration * ncores product is divided by every row's
    'duration' and then by that row's 'ncores'. Raises IndexError when no
    row has 'ncores' == cluster_ppn.
    """
    baseline = df[df['ncores']==cluster_ppn].iloc[0]
    baseline_core_seconds = baseline['duration']*baseline['ncores']
    return baseline_core_seconds/df['duration']/df['ncores']

Plotting

In [5]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import Range1d, axes
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import Row, Column, gridplot
# output_notebook() # redundant in Jupyter Lab

from bokeh.palettes import Dark2_5 as palette
import itertools 

if not save_plots_to_file: output_notebook() # no output file configured: render the plots inline in the notebook

def scaling_plot(measurements, axis_type='linear', y_axis_data = ('throughput', 'Throughput, MB/s')): #'log', ('duration', 'Duration, s')
    """Build a bokeh figure plotting one metric against the number of cores.

    One line (with circle markers) is drawn per distinct 'scale' value. The
    bottom x axis shows cores; an additional axis above the plot shows the
    corresponding node counts (cores / cluster_ppn).

    Args:
        measurements: DataFrame with the columns 'name', 'scale', 'ncores'
            and the column named by ``y_axis_data[0]``. The title is taken
            from the 'name' of the first row.
        axis_type: bokeh axis type used for both the x and the y axis.
        y_axis_data: ``(column, axis label)`` pair for the y axis.

    Returns:
        The configured bokeh figure.
    """
    y_axis_field, y_axis_label = y_axis_data

    title = 'HiBench.{name} test'.format(name=measurements.iloc[0]['name'])
    fig = figure(title=title, sizing_mode='scale_width', y_axis_type=axis_type, x_axis_type=axis_type)

    fig.grid.grid_line_alpha = 0.75
    fig.ygrid.band_fill_color = "olive"
    fig.ygrid.band_fill_alpha = 0.1

    # Bottom x axis: cores, padded by one node's worth of cores on the right.
    max_ncores = measurements['ncores'].max()
    fig.xaxis.axis_label = '# of cores'
    fig.xaxis.ticker = measurements.ncores
    fig.x_range = Range1d(0, max_ncores+cluster_ppn)

    # Top x axis: the same span expressed in cluster nodes.
    fig.extra_x_ranges = {"ClusterNodes": Range1d(start=0, end=max_ncores/cluster_ppn+1)}
    fig.add_layout(axes.LinearAxis(x_range_name="ClusterNodes", axis_label="# of nodes", ticker = measurements.ncores/cluster_ppn), 'above')

    fig.yaxis.axis_label = y_axis_label

    # One colour per scale; the groupby key doubles as the legend entry.
    # NOTE(review): newer bokeh releases spell this keyword `legend_label`;
    # `legend=` is kept to match the bokeh version the notebook was written for.
    colors = itertools.cycle(palette)
    for scale, measurements_scale in measurements.groupby('scale'):
        color = next(colors)
        fig.line(measurements_scale['ncores'], measurements_scale[y_axis_field], color=color, legend=scale)
        fig.circle(measurements_scale['ncores'], measurements_scale[y_axis_field], color=color, fill_color='white', size=6, legend=scale)

    fig.legend.location = "bottom_right"
    return fig

# Plotting functions for popular metrics
def elapsed_time_plotting_function(measurements):
    """Log-log plot of the run duration against the number of cores."""
    return scaling_plot(measurements, 'log', ('duration', 'Duration, s'))


# speedup_plotting_function = lambda measurements: scaling_plot(measurements, 'linear', ('speedup', 'Speedup, s'))


def throughput_plotting_function(measurements):
    """Linear plot of the throughput against the number of cores."""
    return scaling_plot(measurements, 'linear', ('throughput', 'Throughput, MB/s'))


# Explicit dispatch table instead of eval() on a constructed name; register
# additional plotting functions here.
plotting_functions = {
    'elapsed_time': elapsed_time_plotting_function,
    'throughput': throughput_plotting_function,
}
if plotting_metric not in plotting_functions:
    raise ValueError("Unknown plotting_metric {!r}; expected one of {}".format(
        plotting_metric, sorted(plotting_functions)))
plotting_function = plotting_functions[plotting_metric]

# Average repeated runs, then build one figure per benchmark name.
# The column selection must be a list: indexing a GroupBy with a bare tuple of
# column names is no longer supported by current pandas.
plots_table = measurements\
            .groupby(['scale','name','ncores'], as_index=False).agg({'duration':'mean', 'data_size':'mean', 'throughput':'mean', 'node_throughput':'mean'})\
            .groupby(['name'])[['scale', 'name', 'ncores', 'duration', 'throughput']].apply(plotting_function)

names = ['LinearRegression', 'LogisticRegression', 'PCA', 'SVD', 'ScalaSparkAggregation', 'ScalaSparkJoin', 'ScalaSparkSort', 'ScalaSparkTerasort']
# names = measurements['name'].unique() # for all plots uncomment this line

# Arrange the selected figures in a grid with two plots per row.
from toolz import partition_all
L = plots_table.loc[names].values.tolist()
grid = list(partition_all(2, L))

if save_plots_to_file: output_file(save_plots_to_file, title="HiBench results on Gilgamesh")
show(gridplot(grid, plot_width=400, plot_height=400))

### Save profiles in CSV/Org-tables

In [None]:
def fix_column_name(name):
    """Reduce a stringified two-element column label to its second element.

    A label that starts with ``('<alphanumeric>', `` followed by either a run
    of digits or a quoted alphabetic word with its closing parenthesis is
    replaced by that second part, exactly as matched. Any other label is
    returned unchanged.
    """
    found = re.match(r"\(\'([a-zA-Z0-9]+)\', (\'([a-zA-Z]+)\'\)|[0-9]+)", name)
    if found is None:
        return name
    return found.group(2)

def to_profile_tables(df, filed='duration'):
    """Pivot measurements into one row per benchmark, one column per core count.

    Args:
        df: DataFrame with the columns 'name', 'ncores' and the column named
            by *filed*.
        filed: Name of the value column to tabulate. It was previously
            ignored in favour of a hard-coded 'duration'; the default keeps
            that behaviour. The parameter keeps its historical spelling so
            existing keyword callers continue to work.

    Returns:
        pandas.DataFrame with a 'name' column followed by one column per
        'ncores' value, with column labels passed through fix_column_name.
    """
    pivoted = df.pivot_table(index=['name'],  # alternative: ['name', 'data_size']
                             columns='ncores', values=[filed])
    df_profile = pandas.DataFrame(pivoted.to_records())
    # Materialise the new labels as a list instead of handing pandas a lazy map object.
    df_profile.columns = [fix_column_name(column) for column in df_profile.columns]
    return df_profile

# Collapse repeated runs: one row per (scale, name, ncores) holding the mean
# of every metric.
measurements = (
    measurements
    .groupby(['scale', 'name', 'ncores'], as_index=False)
    .agg({'duration': 'mean', 'data_size': 'mean', 'throughput': 'mean', 'node_throughput': 'mean'})
)

import os
from IPython.display import display, HTML
# Write one '|'-separated table per scale; every line is terminated by '|'.
for scale in measurements['scale'].unique():
    df_profile = to_profile_tables(
        pandas.DataFrame(measurements.loc[measurements['scale'] == scale]))
    # print(display(HTML(df_profile.to_html())))
    df_profile.to_csv(
        '/home/hpcgogol/proj/hidalgo/doc/D3.3/figs/%s.org' % scale,
        sep='|',
        header=True,
        index_label=False,
        float_format='%.3f',
        line_terminator='|' + os.linesep,
    )