# Analysis of grid hyperparameter search

In [None]:
import json
import pathlib
import pandas as pd
import plotly.express as px

import vaep.io
import vaep.pandas
import vaep.utils

# Widen the DataFrame rendering limits for this notebook and repeat the
# outer MultiIndex labels on every row instead of sparsifying them.
pd.set_option('display.max_columns', 45)
pd.set_option('display.max_rows', 100)
pd.set_option('display.multi_sparse', False)

### Papermill parameters

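Parameters that papermill can override. If the default paths do not exist, the next cell falls back to the Snakemake inputs: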

In [None]:
metrics_json: str = "path/to/all_metrics.json"  # file path to metrics json
configs_json: str = "path/to/all_configs.json"  # file path to configs json ("meta data")

In [None]:
# Fall back to the Snakemake inputs unless both parameter paths exist on disk.
# An explicit check replaces the former `assert`-based control flow:
# assertions are stripped when Python runs with -O, in which case the
# fallback would silently never be taken.
if not (pathlib.Path(metrics_json).exists()
        and pathlib.Path(configs_json).exists()):
    # NOTE(review): `snakemake` is not defined anywhere in this notebook;
    # presumably it is injected when the notebook is run by Snakemake.
    # Outside of such a run this branch raises a NameError.
    metrics_json = snakemake.input.metrics
    configs_json = snakemake.input.config
    print(f"{metrics_json = }", f"{configs_json = }", sep="\n")

## Load metrics

In [None]:
# Turn both inputs into Path objects; outputs of this notebook are written
# next to the metrics file.
path_metrics_json, path_configs_json = (
    pathlib.Path(metrics_json),
    pathlib.Path(configs_json),
)
FOLDER = path_metrics_json.parent

# Load the raw JSON content of both files.
metrics_dict = vaep.io.load_json(path_metrics_json)
configs_dict = vaep.io.load_json(path_configs_json)

Random sample metric schema (all should be the same)

In [None]:
# Draw a single run id and show the key layout of its metrics entry.
sampled_keys = vaep.utils.sample_iterable(metrics_dict, 1)
key_sampled = sampled_keys[0]
key_map = vaep.pandas.key_map(metrics_dict[key_sampled])
key_map

Metrics as a `pandas.DataFrame`:

In [None]:
# Each run maps string keys to metric values. The strings are evaluated back
# into Python objects (presumably 4-tuples, as four column level names are
# assigned below) so that pandas builds MultiIndex columns from them.
# NOTE(review): eval() executes whatever expression the JSON file contains.
# If the keys are plain literal tuples, `ast.literal_eval` would be a safer
# drop-in - confirm the key format before switching.
# Alternative considered: vaep.pandas.flatten_dict_of_dicts(run_metrics)
metrics_dict_multikey = {
    run_id: {eval(key): value for key, value in run_metrics.items()}
    for run_id, run_metrics in metrics_dict.items()
}

# metrics_dicts = {
#     'AEs': {k:v for k,v in metrics_dict_multikey.items() if 'collab' not in k},
#     'collab': {k:v for k,v in metrics_dict_multikey.items() if 'collab' in k}
# }

# One row per run id, one column per (subset, data_split, model, metric_name).
metrics = pd.DataFrame.from_dict(metrics_dict_multikey, orient='index')
metrics.columns.names = ['subset', 'data_split', 'model', 'metric_name']
metrics.index.name = 'id'
# Remove all-NaN columns, move the model level into the row index and
# discard rows that are exact duplicates of an earlier row.
metrics = (
    metrics
    .dropna(axis=1, how='all')
    .stack('model')
    .drop_duplicates()
)
metrics

In [None]:
# Display only the columns below the 'NA interpolated' entry of the first
# column level ('subset').
metrics['NA interpolated']

In [None]:
sort_by = 'MSE'  # metric name intended for ranking runs
metric_columns = ['MSE', 'MAE']  # metrics kept for the long-format plots
# Use the first entry of the top column level ('subset') for the plots below.
subset_labels = metrics.columns.levels[0]
subset = subset_labels[0]
print(f"{subset = }")

## Metadata

Experiment metadata from configs

In [None]:
# Read the configs and transpose them, then derive layer information.
meta = (
    pd.read_json(path_configs_json).T
    # lists are not hashable -> store the layer sizes as tuples
    .assign(hidden_layers=lambda df: df['hidden_layers'].apply(tuple))
    .assign(n_hidden_layers=lambda df: df['hidden_layers'].apply(len))
)
meta

The batch size for collab models is scaled by a factor, as the data in long format has roughly N samples * M features entries.

In [None]:
# Boolean mask selecting the rows whose index label contains 'collab'.
mask_collab = meta.index.str.contains('collab')
# For these rows, copy the collab-specific batch size into the shared
# 'batch_size' column ...
meta.loc[mask_collab, 'batch_size'] = meta.loc[mask_collab, 'batch_size_collab']
# ... and clear their 'hidden_layers' entry.
meta.loc[mask_collab, 'hidden_layers'] = None
meta

### Plot Top 10 for fake na validation data
- options see [2Dline plot](https://matplotlib.org/stable/api/_as_gen/matplotlib.lines.Line2D.html#matplotlib.lines.Line2D)

In [None]:
# Ten best runs (lowest MSE) on the fake-NA validation split; the last
# column is left out of the plot.
top_10_valid = (
    metrics[subset]["valid_fake_na"]
    .sort_values('MSE')
    .iloc[:10, :-1]
)
ax = top_10_valid.plot(
    rot=70,
    x_compat=True,
    xticks=list(range(10)),
    marker='o',
    linestyle='',  # markers only, no connecting lines
    title='Top 10 results for hyperparameters',
    figsize=(16, 7),
)

In [None]:
# Tighten the layout of the figure behind `ax` and store it in FOLDER.
fig = ax.get_figure()
fig.tight_layout()
vaep.savefig(
    fig,
    name='top_10_models_validation_fake_na',
    folder=FOLDER,
)

## Colorcoded metrics

- the colormap can be any of the [matplotlib color maps](https://matplotlib.org/stable/tutorials/colors/colormaps.html); each also has a reversed version, indicated by the suffix `_r`

``` python
['Accent', 'Accent_r', 'Blues', 'Blues_r', 'BrBG', 'BrBG_r', 'BuGn', 'BuGn_r', 'BuPu', 'BuPu_r', 'CMRmap', 'CMRmap_r', 'Dark2', 'Dark2_r', 'GnBu', 'GnBu_r', 'Greens', 'Greens_r', 'Greys', 'Greys_r', 'OrRd', 'OrRd_r', 'Oranges', 'Oranges_r', 'PRGn', 'PRGn_r', 'Paired', 'Paired_r', 'Pastel1', 'Pastel1_r', 'Pastel2', 'Pastel2_r', 'PiYG', 'PiYG_r', 'PuBu', 'PuBuGn', 'PuBuGn_r', 'PuBu_r', 'PuOr', 'PuOr_r', 'PuRd', 'PuRd_r', 'Purples', 'Purples_r', 'RdBu', 'RdBu_r', 'RdGy', 'RdGy_r', 'RdPu', 'RdPu_r', 'RdYlBu', 'RdYlBu_r', 'RdYlGn', 'RdYlGn_r', 'Reds', 'Reds_r', 'Set1', 'Set1_r', 'Set2', 'Set2_r', 'Set3', 'Set3_r', 'Spectral', 'Spectral_r', 'Wistia', 'Wistia_r', 'YlGn', 'YlGnBu', 'YlGnBu_r', 'YlGn_r', 'YlOrBr', 'YlOrBr_r', 'YlOrRd', 'YlOrRd_r', 'afmhot', 'afmhot_r', 'autumn', 'autumn_r', 'binary', 'binary_r', 'bone', 'bone_r', 'brg', 'brg_r', 'bwr', 'bwr_r', 'cividis', 'cividis_r', 'cool', 'cool_r', 'coolwarm', 'coolwarm_r', 'copper', 'copper_r', 'cubehelix', 'cubehelix_r', 'flag', 'flag_r', 'gist_earth', 'gist_earth_r', 'gist_gray', 'gist_gray_r', 'gist_heat', 'gist_heat_r', 'gist_ncar', 'gist_ncar_r', 'gist_rainbow', 'gist_rainbow_r', 'gist_stern', 'gist_stern_r', 'gist_yarg', 'gist_yarg_r', 'gnuplot', 'gnuplot2', 'gnuplot2_r', 'gnuplot_r', 'gray', 'gray_r', 'hot', 'hot_r', 'hsv', 'hsv_r', 'inferno', 'inferno_r', 'jet', 'jet_r', 'magma', 'magma_r', 'nipy_spectral', 'nipy_spectral_r', 'ocean', 'ocean_r', 'pink', 'pink_r', 'plasma', 'plasma_r', 'prism', 'prism_r', 'rainbow', 'rainbow_r', 'seismic', 'seismic_r', 'spring', 'spring_r', 'summer', 'summer_r', 'tab10', 'tab10_r', 'tab20', 'tab20_r', 'tab20b', 'tab20b_r', 'tab20c', 'tab20c_r', 'terrain', 'terrain_r', 'turbo', 'turbo_r', 'twilight', 'twilight_r', 'twilight_shifted', 'twilight_shifted_r', 'viridis', 'viridis_r', 'winter', 'winter_r']
```

In [None]:
# Colormap name used for all background gradients below.
cmap = 'cividis_r'

In [None]:
# One row per run id again (model level moved back into the columns).
metrics_per_run = metrics.unstack('model')

# Replace the run id by its hyperparameters taken from `meta`.
hyperparameter_index = pd.MultiIndex.from_frame(
    meta.loc[metrics_per_run.index, ['latent_dim', 'hidden_layers', 'batch_size']]
)

metrics_styled = (
    metrics_per_run
    .set_index(hyperparameter_index)
    .sort_index()
    .stack('model')
    .style.background_gradient(cmap)
)
# Continue with the re-indexed data underlying the Styler.
metrics = metrics_styled.data
metrics_styled

In [None]:
# Write the styled table as an Excel sheet next to the metrics file.
excel_path = FOLDER / 'metrics_styled.xlsx'
metrics_styled.to_excel(excel_path)

In [None]:
# Show one colour-coded table per subset, iterating the first column level
# in reverse order.
subset_levels = metrics.columns.levels[0]
for k in subset_levels[::-1]:
    print("\n"+"*"*10, f"Subset: {k}\n")
    subset_view = metrics[k].style.background_gradient(cmap)
    display(subset_view)

### Plot Top 10 for fake na validation data

In [None]:
# Ten best runs (lowest MSE) on the fake-NA validation split, now labelled
# by hyperparameters; the last column is left out of the plot.
# The axes handle is kept in `ax` so that the figure saved below is the one
# created in this cell. Previously the plot result was discarded and `ax`
# still pointed at the earlier top-10 plot, so the wrong figure was saved.
ax = metrics[subset]["valid_fake_na"].sort_values(
    'MSE').iloc[:10, :-1].plot(rot=45,
                               x_compat=False,
                               xticks=list(range(10)),
                               marker='o',
                               linestyle='',  # markers only
                               figsize=(16, 7)
                               )

fig = ax.get_figure()
fig.tight_layout()
vaep.savefig(fig, name='top_10_models_validation_fake_na_v02', folder=FOLDER)

## Collection of Performance plots 

- similar to hyperparameter performance plots in Tensorboard

In [None]:
# Move the model level into the columns and turn the row index levels into
# ordinary columns.
metrics = (
    metrics
    .unstack('model')
    .reset_index()
)
# Preview: first three rows of the first seven columns.
metrics.head(3).iloc[:, :7]

### Parallel coordinates

- similar to Tensorboard visualization of a set of hyperparameters

In [None]:
def plot_parallel_categories(metrics=metrics, model_type='DAE', metric_type='MSE', subset='NA interpolated', split='valid_fake_na'):
    """Parallel-categories plot of hyperparameters, coloured by one metric.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Frame with four-level columns containing ('latent_dim', '', '', ''),
        ('hidden_layers', '', '', '') and the metric column
        (subset, split, metric_type, model_type). Defaults to the global
        `metrics` as bound when this function is defined.
    model_type : str
        Last level of the metric column to select.
    metric_type : str
        Third level of the metric column to select; also used for colouring.
    subset : str
        First level of the metric column to select.
    split : str
        Second level of the metric column to select.

    Returns
    -------
    The figure created by `px.parallel_categories`.
    """
    sel_metric = (subset, split, metric_type, model_type)
    hyperparameter_columns = [('latent_dim', '', '', ''),
                              ('hidden_layers', '', '', '')]
    # rows lacking the metric (or a hyperparameter) are dropped
    metric_sel = metrics.loc[:, hyperparameter_columns + [sel_metric]].dropna()
    title = ' '.join(sel_metric)
    # flatten the column labels: 'latent dim', 'hidden layers', <metric_type>
    metric_sel.columns = [' '.join(x[0].split('_'))
                          for x in metric_sel.columns[:-1]] + [metric_type]
    fig = px.parallel_categories(
        metric_sel,
        dimensions=metric_sel.columns[:-1],
        # Colour by the selected metric column. This was hard-coded to "MSE"
        # before, which is not a column of `metric_sel` for any other
        # `metric_type`.
        color=metric_type,
        color_continuous_scale=px.colors.sequential.Inferno,
        title=title,
    )

    return fig

# Parallel-categories plot for the 'DAE' model using the function's default
# metrics frame, metric, subset and split.
fig = plot_parallel_categories(model_type='DAE')
fig.show()

In [None]:
# Same plot for the 'VAE' model.
fig = plot_parallel_categories(metrics=metrics, model_type='VAE')
fig.show()

### Plotting without Multi-Index

In [None]:
# Nest the tuple-keyed metrics per run, then flatten them again into single
# string column names joined by a space.
nested_metrics = {
    run_id: vaep.pandas.create_dict_of_dicts(run_metrics)
    for run_id, run_metrics in metrics_dict_multikey.items()
}
records = [{'index': run_id, **nested} for run_id, nested in nested_metrics.items()]
flat_metrics = pd.json_normalize(records, meta='index', sep=' ').set_index('index')
# Put the configuration columns in front of the metrics.
metrics = meta.join(flat_metrics)
metrics

In [None]:
# Display labels handed to plotly express through its `labels=` argument
# (column name -> label shown in the figure).
# NOTE(review): the first key ("... valid_collab collab MSE") does not match
# the column plotted in the single-model scatter below
# ("NA interpolated valid_fake_na collab MSE") - confirm which name is intended.
labels_dict = {"NA not interpolated valid_collab collab MSE": 'MSE',
               'batch_size_collab': 'bs',
               'n_hidden_layers': "No. of hidden layers",
               'latent_dim': 'hidden layer dimension'}

#### Single model metric

In [None]:
col = "NA interpolated valid_fake_na collab MSE"
# col = ("NA interpolated","valid_fake_na","collab","MSE")
# Only runs that report this metric can be plotted.
runs_with_metric = metrics.dropna(subset=[col])
fig = px.scatter(
    runs_with_metric,
    x="latent_dim",
    y=col,
    # color="hidden layers", # needs data in long format
    facet_row="batch_size_collab",
    # facet_col="n_hidden_layers",
    title='Performance on validation data for collaborative filtering model',
    labels=labels_dict,
)
fig.show()

### Plotting from long format

To use colors meaningfully, the long format of the data is needed.

In [None]:
# Start again from the tuple-keyed metrics: one row per run id, four-level
# columns named in one go.
metrics_long = (
    pd.DataFrame.from_dict(metrics_dict_multikey, orient='index')
    .rename_axis(index='id',
                 columns=['subset', 'data_split', 'model', 'metric_name'])
)
metrics_long

In [None]:
# Select the 'N' entry of the last column level for every subset, split and model.
metrics_N = metrics_long.loc[:, pd.IndexSlice[:, :, :, 'N']]
# Drop the model level from the label -> "<subset> <data_split> N"
metrics_N.columns = [' '.join([*x[:2], x[3]]) for x in metrics_N.columns]  # 0, 1, 3 column index
# Aggregate the non-missing entries of the now identically labelled columns.
# Grouping happens on the transposed frame because
# `DataFrame.groupby(..., axis=1)` is deprecated since pandas 2.1.
metrics_N = metrics_N.T.groupby(level=0).mean().T.astype(int)
metrics_N

In [None]:
# Keep only the metrics of interest ...
metrics_long = metrics_long.loc[:, pd.IndexSlice[:, :, :, metric_columns]]
# ... and reshape to one row per (id, subset, data_split, model, metric_name),
# enriched with the N columns and the run configuration.
metrics_long = (
    metrics_long
    .stack(metrics_long.columns.names)
    .to_frame('metric_value')
    .reset_index()
    .set_index('id')
    .join(metrics_N)
    .join(meta)
)
metrics_long

In [None]:
# Index by data split so that a whole split can be selected with `.loc`.
metrics_long = (
    metrics_long
    .reset_index()
    .set_index('data_split')
)
metrics_long

In [None]:
# List the N columns ordered by their reversed name, i.e. compared from the
# end of the label.
sorted(metrics_N.columns, key=lambda name: name[::-1])

#### All model metrics for one subset of data

In [None]:
# Distinct index labels of `metrics_long` (the index was set to 'data_split').
metrics_long.index.unique()

In [None]:
# Data split label used to select rows of `metrics_long` for the first figure.
dataset = 'test_fake_na'

def get_plotly_figure(dataset):
    """Scatter plot of all model metrics for one data split.

    Reads the global `metrics_long` (indexed by data split) and `labels_dict`.

    Parameters
    ----------
    dataset : str
        Index label of `metrics_long` to select, e.g. 'valid_fake_na'.

    Returns
    -------
    The figure created by `px.scatter`, with one facet row per metric name
    and one facet column per subset, coloured by model.
    """
    split_metrics = metrics_long.loc[dataset]
    # extra columns shown in the hover box of each point
    hover_columns = [
        'n_hidden_layers', 'hidden_layers',
        'batch_size', 'batch_size_collab',
        'n_params_dae', 'n_params_collab', 'n_params_vae',
        'subset',
        f'NA not interpolated {dataset} N',
        f'NA interpolated {dataset} N',
    ]
    # shared labels, extended by the two long-format columns
    axis_labels = dict(labels_dict)
    axis_labels.update({"metric_value": '', 'metric_name': 'metric'})
    return px.scatter(
        split_metrics,
        x="latent_dim",
        y='metric_value',
        color="model",
        facet_row="metric_name",
        facet_col="subset",
        hover_data=hover_columns,
        title=f'Performance on {dataset.replace("_", " ")} data',
        labels=axis_labels,
    )


# Build the figure for the selected split, export it as PDF and display it.
fig = get_plotly_figure(dataset)
pdf_path = FOLDER / f"hyperpar_{dataset}_results.pdf"
fig.write_image(pdf_path)
fig.show()

In [None]:
# Same figure for the fake-NA validation split.
dataset = 'valid_fake_na'
fig = get_plotly_figure(dataset)
pdf_path = FOLDER / f"hyperpar_{dataset}_results.pdf"
fig.write_image(pdf_path)
fig.show()