In [None]:
def run_multiple_evaluations(
        path_to_gt: str,
        paths_to_predictions: list[str],
        path_to_seq_ids: str,
        labels_enum,
        classes_to_eval: list,
        metrics_to_eval: list
) -> dict:
    """
    Benchmark several prediction files against a single ground truth file.

    Each prediction file is wrapped in its own ``H5Reader`` together with the
    ground truth and passed to ``benchmark_all``. This function only collects
    the results; it does not plot anything itself.

    Args:
        path_to_gt: Path to the ground truth HDF5 file.
        paths_to_predictions: A list of paths to prediction HDF5 files.
        path_to_seq_ids: Path to a .npy file containing sequence IDs for benchmarking.
        labels_enum: Enum defining data labels (e.g., BendLabels).
        classes_to_eval: List of class enums to evaluate (e.g., [BendLabels.EXON]).
        metrics_to_eval: List of metric enums to evaluate (e.g., [EvalMetrics.INDEL]).

    Returns:
        A dict mapping a method name to the value returned by
        ``benchmark_all`` for that prediction file, in input order. The
        method name is the prediction file's base name up to its first ".".
        If several files share the same method name, the first keeps the
        plain name and later ones get a numeric suffix ("name_2", "name_3",
        ...) so that no result is silently overwritten.
    """
    # Function-scope import keeps this block self-contained.
    import os

    all_results = {}
    for pred_path in paths_to_predictions:
        reader = H5Reader(path_to_gt=path_to_gt, path_to_predictions=pred_path)
        benchmark_results = benchmark_all(
            reader=reader,
            path_to_ids=path_to_seq_ids,
            labels=labels_enum,
            classes=classes_to_eval,
            metrics=metrics_to_eval
        )

        # os.path.basename understands the platform's separators (and
        # os.PathLike inputs), unlike a hard-coded split on "/".
        base_name = os.path.basename(pred_path).split(".")[0]

        # Same-named files in different directories would otherwise replace
        # each other's results; disambiguate later duplicates instead.
        method_name = base_name
        duplicate_index = 2
        while method_name in all_results:
            method_name = f"{base_name}_{duplicate_index}"
            duplicate_index += 1

        all_results[method_name] = benchmark_results

    return all_results

In [None]:
def benchmark_all(reader: H5Reader, path_to_ids: str, labels, classes, metrics, collect_individual_results: bool = False):
    """
    Collect ground truth / prediction labels for a set of sequences and benchmark them.

    The sequence IDs are loaded from ``path_to_ids`` with ``np.load``. For
    every ID, ``reader.get_gt_pred_pair`` yields a forward and a reverse
    entry; element 0 of each entry is appended to the ground truth list and
    element 1 to the prediction list (forward first, then reverse).

    Args:
        reader: Reader providing ``get_gt_pred_pair(seq_id)``.
        path_to_ids: Path to the file with sequence IDs, loaded via ``np.load``.
        labels: Forwarded unchanged as ``labels``.
        classes: Forwarded unchanged as ``classes``.
        metrics: Forwarded as a deep copy, so the caller's object is not
            shared with the benchmark routine.
        collect_individual_results: Forwarded unchanged.

    Returns:
        Whatever ``benchmark_gt_vs_pred_multiple`` returns for the collected labels.
    """
    sequence_ids = np.load(path_to_ids)
    ground_truths, predictions = [], []

    for sequence_id in tqdm(sequence_ids, desc="Loading sequence labels"):
        forward_pair, reverse_pair = reader.get_gt_pred_pair(sequence_id)

        # Forward entry first, then reverse, so both lists stay aligned.
        for gt_pred_pair in (forward_pair, reverse_pair):
            ground_truths.append(gt_pred_pair[0])
            predictions.append(gt_pred_pair[1])

    # NOTE(review): presumably the callee mutates `metrics`, hence the copy — confirm.
    metrics_copy = deepcopy(metrics)

    return benchmark_gt_vs_pred_multiple(
        gt_labels=ground_truths,
        pred_labels=predictions,
        labels=labels,
        classes=classes,
        metrics=metrics_copy,
        collect_individual_results=collect_individual_results,
    )