# Transfer predictions from NAGuideR

In [None]:
import logging
from pathlib import Path

import pandas as pd
import seaborn as sns

import vaep
import vaep.models
import vaep.pandas
from vaep.io import datasplits
from vaep.pandas import calc_errors


# Fetch the 'vaep' logger from the standard logging registry and hand it to
# the package's setup helper; the returned logger is used by this notebook.
_vaep_logger = logging.getLogger('vaep')
logger = vaep.logging.setup_logger(_vaep_logger)

In [None]:
# catch passed parameters
# `args` is bound *before* the snapshot on purpose: this way the name 'args'
# itself is contained in the set of already-known global names taken below.
args = None
# keys view of a copy of the globals, i.e. the names defined up to this point
args = globals().copy().keys()

Papermill script parameters:

In [None]:
# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # file format of created splits, e.g. pickle or csv (csv is set here)
# identifier for prediction files to be filtered
# NOTE(review): the cells of this notebook filter on the literal '_all_'
# instead of reading this parameter -- confirm whether it should be used.
identifer_str: str = '_all_'
# comma-separated string of dumps to be used (it is split on ',' when read)
dumps: str = None

Some argument transformations

In [None]:
# Build the run configuration in one step: `get_params` receives the earlier
# snapshot of global names together with the current globals (presumably to
# pick out the parameters defined in between -- see vaep.nb), and
# `args_from_dict` turns its result into the `args` object used below.
args = vaep.nb.args_from_dict(
    vaep.nb.get_params(args, globals=globals()))
args

Load data splits

In [None]:
# Read the data splits from the folder referenced by `args.data`, using the
# configured file format. The resulting object exposes `val_y` and `test_y`,
# which are used in the following cells.
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)

Validation and test data splits of simulated missing values

In [None]:
# Validation split as a table with a single column 'observed'; one column per
# transferred prediction file is added to it further below.
val_pred_fake_na = data.val_y.to_frame(name='observed')
val_pred_fake_na

In [None]:
# Test split as a table with a single column 'observed'; one column per
# transferred prediction file is added to it further below.
test_pred_fake_na = data.test_y.to_frame(name='observed')
# summary statistics of the observed test values
test_pred_fake_na.describe()

In [None]:
# Find and load prediction files, filter for validation and test data

In [None]:
# Select the prediction dumps to transfer.
#
# - If the `dumps` parameter is set, it is a comma-separated string of file
#   paths; exactly these files are used. They are converted to Path objects
#   because the transfer loop derives the model name from `fpath.stem`.
# - Otherwise every file in the prediction folder whose path contains '_all_'
#   is used.
#
# Previously the explicit selection was overwritten unconditionally by the
# folder scan, and `entire_pred` was undefined (NameError) when no dumps were
# passed.
if args.dumps is not None:
    entire_pred = [Path(fname.strip())
                   for fname in args.dumps.split(',')
                   if fname.strip()]  # ignore empty entries, e.g. trailing ','
else:
    entire_pred = [file for file in args.out_preds.iterdir()
                   if '_all_' in str(file)]
entire_pred

In [None]:
# Add the predictions of every dump as a new column to the validation and the
# test table and write one csv per split and model next to the dumps.
for fpath in entire_pred:
    # the model name is whatever follows '_all_' in the file stem
    col_name = fpath.stem.split('_all_')[-1]
    # second and first csv column (in this order) form the index
    pred = pd.read_csv(fpath, index_col=[1, 0])

    targets = (
        (val_pred_fake_na, args.out_preds / f'pred_val_{col_name}.csv'),
        (test_pred_fake_na, args.out_preds / f'pred_test_{col_name}.csv'),
    )
    for split_pred, out_file in targets:
        # assignment aligns on the index, so each split only picks up the
        # predictions for its own entries
        split_pred[col_name] = pred
        split_pred[['observed', col_name]].to_csv(out_file)

# del pred

In [None]:
# display the validation table, now including one column per transferred dump
val_pred_fake_na

Metrics for simulated missing values (NA)

In [None]:
# papermill_description=metrics
# (the line above is a marker comment, presumably read by papermill -- keep it
# verbatim as the first line of the cell)
# container collecting the metrics of all data splits, see `d_metrics.metrics`
d_metrics = vaep.models.Metrics()

In [None]:
# Add the metrics for the validation table under the key 'valid_fake_na'
# (presumably one set of metrics per prediction column compared to 'observed'
# -- see vaep.models.Metrics.add_metrics).
added_metrics = d_metrics.add_metrics(val_pred_fake_na, 'valid_fake_na')
added_metrics

### Test Datasplit

In [None]:
# Add the metrics for the test table under the key 'test_fake_na'
# (presumably one set of metrics per prediction column compared to 'observed'
# -- see vaep.models.Metrics.add_metrics).
added_metrics = d_metrics.add_metrics(test_pred_fake_na, 'test_fake_na')
added_metrics

In [None]:
# Turn the nested metrics dictionary collected above into a table; the helper
# is asked for the column levels 'model' and 'metric_name' and its result is
# transposed.
nested_metrics = d_metrics.metrics
metrics_df = vaep.models.get_df_from_nested_dict(
    nested_metrics,
    column_levels=['model', 'metric_name'],
)
metrics_df = metrics_df.T
metrics_df

# Errors per bin on the validation table, measured against the 'observed'
# column. The result carries the columns 'bin' and 'n_obs' used further below.
errors = calc_errors.calc_errors_per_bin(
    val_pred_fake_na,
    target_col='observed',
)
errors

In [None]:
# Average every column except the two bookkeeping columns and keep the names
# of the five columns with the smallest mean.
mean_per_column = errors.drop(columns=['bin', 'n_obs']).mean()
top5 = mean_per_column.sort_values().iloc[:5].index.to_list()
errors[top5].describe()

In [None]:
meta_cols = ['bin', 'n_obs']


def _format_bin(row):
    """Return the bin of a row followed by its number of observations."""
    return f"{row.bin} (N={row.n_obs:,d})"


# one categorical label per row, named 'bin' so it can serve as the x-axis
n_obs = errors[meta_cols].apply(_format_bin, axis=1)
n_obs = n_obs.rename('bin').astype('category')

# long format: one row per (original row, selected column) with the value in
# 'intensity', plus the bin label joined in from above
errors_long = errors[top5].stack().to_frame('intensity')
errors_long = errors_long.join(n_obs).reset_index()
errors_long.sample(5)

In [None]:
# Bar plot of the long-format errors: one group of bars per bin label, one
# bar colour per entry of the 'model' column.
ax = sns.barplot(x='bin', y='intensity', hue='model', data=errors_long)
# rotate the bin labels so they stay readable
ax.xaxis.set_tick_params(rotation=-90)

# store the figure in the figures folder of the run
fig = ax.get_figure()
fname = args.out_figures / 'NAGuideR_errors_per_bin.png'
vaep.savefig(fig, fname)