# Analysis of grid hyperparameter search

In [None]:
import json
import pathlib
import pandas as pd
import plotly.express as px

import vaep.io
import vaep.pandas
import vaep.utils

# Notebook-wide pandas display settings: show up to 30 columns and repeat
# every MultiIndex label instead of sparsifying repeated outer levels.
pd.set_option('display.max_columns', 30)
pd.set_option('display.multi_sparse', False)

### Papermill parameters

papermill parameters:

In [None]:
# Parameters cell: the placeholder paths below are the defaults for the two
# input files (presumably replaced by papermill when the notebook is executed).
metrics_json:str = "path/to/all_metrics.json" # file path to metrics json
configs_json:str = "path/to/all_configs.json" # file path to configs json ("meta data")

## Load metrics

In [None]:
# Turn both parameter strings into Path objects; all outputs of this notebook
# are written next to the metrics file.
path_metrics_json, path_configs_json = map(pathlib.Path, (metrics_json, configs_json))
FOLDER = path_metrics_json.parent

# Load the metrics first, then the configs, with the project JSON helper.
metrics_dict, configs_dict = (
    vaep.io.load_json(json_path)
    for json_path in (path_metrics_json, path_configs_json)
)

Metric schema of one randomly sampled run (all runs should share the same schema)

In [None]:
# Pick one run key via the project sampling helper and build the key map of
# that run's metrics (shown as cell output).
sampled_keys = vaep.utils.sample_iterable(metrics_dict, 1)
key_sampled = sampled_keys[0]
metrics_of_sampled_run = metrics_dict[key_sampled]
key_map = vaep.pandas.key_map(metrics_of_sampled_run)
key_map

Metrics as a `pandas.DataFrame`:

In [None]:
# One row per run: flatten each run's nested metrics with the project helper
# and collect the results into a DataFrame indexed by run key.
metrics = pd.DataFrame.from_dict(
    {
        run_key: vaep.pandas.flatten_dict_of_dicts(nested_metrics)
        for run_key, nested_metrics in metrics_dict.items()
    },
    orient='index',
)
# Name the three column levels.
metrics.columns = metrics.columns.set_names(['data', 'model', 'metric_name'])
metrics

## Metadata

Experiment metadata from configs

In [None]:
# Read the configs JSON and transpose it, then preview the first rows.
configs_frame = pd.read_json(path_configs_json)
meta = configs_frame.transpose()
meta.head(5)

The batch size for collab models is scaled by a factor, as the data in long format has roughly N samples * M features entries.

In [None]:
# Number of hidden layers = length of each run's `hidden_layers` entry.
meta['n_hidden_layers'] = meta['hidden_layers'].map(len)

In [None]:
# ToDo: Load from config
# Collab batch size: the configured batch size scaled by a fixed factor of 8.
meta['bs_collab'] = meta['batch_size'].mul(8)
meta.head(5)

### Plot Top 10 for collab validation data
- for plotting options, see [`Line2D`](https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html#matplotlib.lines.Line2D)

In [None]:
# The ten runs with the lowest validation MSE of the collab model.
top10_collab = metrics["valid_collab"]['collab'].sort_values('MSE').head(10)

# Markers only (no connecting line), one tick per run, rotated tick labels.
top10_plot_kwargs = dict(
    rot=90,
    x_compat=True,
    xticks=list(range(10)),
    marker='o',
    linestyle='',
    title='Top 10 results for hyperparameters',
)
_ = top10_collab.plot(**top10_plot_kwargs)

## Colorcoded metrics

In [None]:
# Replace the run-key index of `metrics` by the hyperparameters of each run.
# `meta` is looked up by the run keys in `metrics.index`, so every metrics row
# gets the hyperparameters of its own run even if the two JSON files list the
# runs in a different order (a missing run raises a KeyError instead of being
# silently mislabelled).
hyperparameter_cols = ['latent_dim', 'n_hidden_layers', 'batch_size']
hyperparameter_index = pd.MultiIndex.from_frame(
    meta.loc[metrics.index, hyperparameter_cols]
)
metrics = metrics.set_index(hyperparameter_index)

# Colour-code every column with the 'viridis' colormap
# (other matplotlib colormaps, e.g. 'Greys' or 'BuPu', work as well).
metrics_styled = metrics.sort_index().style.background_gradient('viridis')
metrics_styled

In [None]:
# Save the colour-coded table next to the metrics JSON.
styled_excel_path = FOLDER / 'metrics_styled.xlsx'
metrics_styled.to_excel(styled_excel_path)

### Plot Top 10 for collab validation data with numeric x-axis labels

In [None]:
# Same top-10 selection as before, now on the hyperparameter-indexed metrics
# and without pandas' x-axis compatibility mode.
top10_by_mse = metrics["valid_collab"]['collab'].sort_values('MSE').head(10)
numeric_axis_kwargs = dict(
    rot=90,
    x_compat=False,
    xticks=list(range(10)),
    marker='o',
    linestyle='',
)
_ = top10_by_mse.plot(**numeric_axis_kwargs)

## Collection of Performance plots 

- similar to hyperparameter performance plots in Tensorboard

In [None]:
# Move the hyperparameter index levels back into ordinary (leading) columns.
metrics = metrics.reset_index(drop=False)

In [None]:
# Preview: first three rows, first seven columns.
metrics.head(3).iloc[:, :7]

### Parallel coordinates

- similar to Tensorboard visualization of a set of hyperparameters

In [None]:
# Column position of the metric to visualise; columns 0-2 hold the
# hyperparameters moved out of the index.
pos_metric = 3
title = metrics.columns[pos_metric]
metric_sel = metrics.iloc[:, [0, 1, 2, pos_metric]]

# Flatten the column tuples to plain labels: hyperparameter names with
# underscores replaced by spaces, plus the last level of the metric column.
hyperparameter_labels = [
    column[0].replace('_', ' ') for column in metric_sel.columns[:-1]
]
metric_sel.columns = hyperparameter_labels + [title[-1]]
metric_sel.head(2)

In [None]:
# Parallel-categories view of the hyperparameters, coloured by the selected
# metric. The colour column is taken from `metric_sel` (its last column) rather
# than hard-coded, so the plot keeps working when a different metric column
# position is selected.
metric_column = metric_sel.columns[-1]
fig = px.parallel_categories(
    metric_sel,
    dimensions=metric_sel.columns[:-1],
    color=metric_column,
    color_continuous_scale=px.colors.sequential.Inferno,
    # First level of the selected metric column, underscores shown as spaces.
    title=' '.join(title[0].split('_')),
)
fig.show()

### Plotting without Multi-Index

In [None]:
# Flatten the nested metrics of every run into space-separated column names,
# keeping the run key in an 'index' field that becomes the row index.
run_records = [
    {'index': run_key, **run_metrics}
    for run_key, run_metrics in metrics_dict.items()
]
metrics = pd.json_normalize(run_records, meta='index', sep=' ').set_index('index')

# Attach the metrics to the experiment metadata (joined on the row index).
metrics = meta.join(metrics)
metrics

In [None]:
# Axis/facet labels shown in the plotly figures, keyed by column name.
labels_dict = {
    'valid_collab collab MSE': 'MSE',
    'bs_collab': 'bs',
    'n_hidden_layers': 'No. of hidden layers',
    'latent_dim': 'hidden layer dimension',
}

#### Single model metric

In [None]:
# Metric column to plot on the y-axis.
col = "valid_collab collab MSE"

# One panel per (bs_collab, n_hidden_layers) combination.
# Note: colouring by hidden layers would need the data in long format.
fig = px.scatter(
    data_frame=metrics,
    x="latent_dim",
    y=col,
    facet_row="bs_collab",
    facet_col="n_hidden_layers",
    labels=labels_dict,
    title='Performance on validation data for collaborative filtering model',
)
fig.show()

### Plotting from long format

To use colors meaningfully, the long format of the data is needed.

In [None]:
# Stack all column levels of the styled table's underlying data so that each
# row holds a single metric value, then index the result by the 'data' level.
column_levels = metrics_styled.columns.names
metrics_long = (
    metrics_styled.data
    .stack(column_levels)
    .to_frame('metric_value')
    .reset_index()
    .set_index('data')
)
metrics_long

#### All model metrics for one subset of data

In [None]:
# Data split whose metrics are plotted.
dataset = 'test_fake_na'

# Shared labels plus the two labels specific to the long format.
long_format_labels = dict(labels_dict, metric_value='', metric_name='metric')

# One row of panels per metric, one column per number of hidden layers,
# coloured by model; batch size is shown on hover (along other variables).
fig = px.scatter(
    data_frame=metrics_long.loc[dataset],
    x="latent_dim",
    y='metric_value',
    color="model",
    facet_row="metric_name",
    facet_col="n_hidden_layers",
    hover_data=['batch_size'],
    labels=long_format_labels,
    title='Performance on test data for fake missing values',
)
fig.show()