In [None]:
"""
Compute bootstrap confidence intervals for the model's per-label precision, recall and
F1-score on the validation set.
"""

In [None]:
import sys
sys.path.append("..")  # Adds the project_root to the path

from utils.validate_model import TextSegmentationValidator

In [None]:
from pathlib import Path

# Label codes handed to the validator (plain form, without angle brackets).
labels = [
    "H&E",
    "IHCplus",
    "IHC",
    "MOL",
    "CON",
    "ADV",
    "BRS",
    "RAD",
    "CLN",
    "HIS",
    "SID",
    "UNR",
    "CAL",
]

model_name = "flan-t5-large-_context_7_headers_false_HIS_5_CAL_5"

# Parent of the current working directory.
BASE_DIR = Path().resolve().parent

print(BASE_DIR)

# NOTE(review): the three data/model directories start at BASE_DIR / "..", i.e. one
# level above BASE_DIR itself - confirm that this is the intended location.
evaluation = TextSegmentationValidator(
    labels=labels,
    model_name=model_name,
    base_dir=BASE_DIR,
    model_dir=BASE_DIR.joinpath("..", "models"),
    data_preprocessed_dir=BASE_DIR.joinpath("..", "data", "preprocessed_data"),
    data_predictions_dir=BASE_DIR.joinpath("..", "data", "predictions"),
)

In [None]:
# Run the validator on the 'validation' split. Presumably this also fills the
# per-report label attributes read further down - confirm in utils.validate_model.
evaluation.validate('validation')

In [None]:
# Ask the validator to explain its errors on the same split; what exactly is reported
# is defined in utils.validate_model (not visible from this notebook).
evaluation.explain_errors("validation")

In [None]:
# Actual and predicted labels, grouped per medical report, as exposed by the validator.
y_true = evaluation.actual_labels_per_medical_report
y_pred = evaluation.predicted_labels_per_medical_report

In [None]:
from typing import Dict, List
from sklearn.metrics import classification_report
import tqdm
import numpy as np
import pandas as pd

def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list."""
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def bootstrapper_p_medical_report(
        y_true: Dict[str, List[str]],
        y_pred: Dict[str, List[str]],
        n_bootstraps: int = 1000,
        seed: int = 42,
        confidence_level: float = 0.95,
        labels: List[str] = None,
        include_all_labels_per_iteration: bool = False
    ):
    """
    Bootstrap precision, recall and F1-score per label by resampling whole medical reports.

    Every iteration draws ``len(y_true)`` report keys with replacement, concatenates the
    label lists of the drawn reports and scores them with ``classification_report``.
    The per-iteration scores are then summarised as a mean and a percentile interval;
    NaN scores (undefined metrics, ``zero_division=np.nan``) are left out of the summary.

    Parameters
    ----------
    y_true, y_pred : dict
        Map a report key to the list of actual / predicted labels of that report.
        Both must have the same keys and, for every key, lists of equal length.
    n_bootstraps : int
        Number of bootstrap iterations (at least 1).
    seed : int
        Seed of the function-local random generator that draws the reports; the
        global NumPy random state is left untouched.
    confidence_level : float
        Coverage of the percentile interval, within [0, 1].
    labels : list of str, optional
        Rows to summarise; the same list is passed as ``labels`` to
        ``classification_report``. Defaults to the sorted labels that occur in
        ``y_true`` or ``y_pred``.
    include_all_labels_per_iteration : bool
        If True, a sample is redrawn (at most 100 times) until every entry of
        ``labels`` that is not an aggregate row name occurs in the sampled ``y_true``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The DataFrame is indexed by ``Label`` and holds mean, lower and upper bound for
        precision, recall and F1-score. The dict maps each label to the number of
        iterations in which its F1-score was not NaN.

    Raises
    ------
    ValueError
        If the inputs are inconsistent (different keys, different lengths per report,
        no reports), if ``n_bootstraps`` or ``confidence_level`` is out of range, or if
        no sample containing all labels is found within 100 redraws.
    """
    metric_names = ['precision', 'recall', 'f1-score']
    # Names classification_report uses for aggregate rows. They are not class labels,
    # so they are ignored when checking that a sample contains every label.
    average_rows = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
    max_retries = 100

    # --- input validation (everything below relies on these invariants) ---
    # Key views compare like sets: the insertion order of the two dicts is irrelevant
    # because reports are always looked up by key.
    if y_true.keys() != y_pred.keys():
        raise ValueError("Keys of y_true and y_pred do not match")
    if not y_true:
        raise ValueError("y_true and y_pred must contain at least one medical report")
    if n_bootstraps < 1:
        raise ValueError("n_bootstraps must be at least 1")
    if not 0 <= confidence_level <= 1:
        raise ValueError("confidence_level must be within [0, 1]")

    # The samples are scored on flattened lists; a length mismatch inside one report
    # would shift every following label and silently corrupt the metrics.
    misaligned = [key for key in y_true if len(y_true[key]) != len(y_pred[key])]
    if misaligned:
        raise ValueError(
            "y_true and y_pred differ in length for report(s): "
            + ", ".join(repr(key) for key in misaligned[:5])
        )

    if labels is None:
        labels = sorted(set(flatten(y_true.values())) | set(flatten(y_pred.values())))
    else:
        labels = list(labels)

    required_labels = set(labels) - average_rows

    # --- resampling ---
    rng = np.random.RandomState(seed)
    report_keys = list(y_true)
    n_reports = len(report_keys)

    def draw_report_keys():
        # Sample positions instead of the keys themselves, so the keys are never
        # converted into a NumPy array and keep their original type.
        positions = rng.choice(n_reports, size=n_reports, replace=True)
        return [report_keys[position] for position in positions]

    reports_per_iteration = []

    for _ in tqdm.tqdm(range(n_bootstraps)):
        sampled_keys = draw_report_keys()

        # Only use bootstrapped samples that contain all labels
        if include_all_labels_per_iteration:
            retries = 0
            while not required_labels <= set(flatten([y_true[key] for key in sampled_keys])):
                # Give up only when the last allowed redraw still misses a label.
                if retries == max_retries:
                    raise ValueError("Unable to sample all labels in 100 tries")
                sampled_keys = draw_report_keys()
                retries += 1

        y_true_sample = flatten([y_true[key] for key in sampled_keys])
        y_pred_sample = flatten([y_pred[key] for key in sampled_keys])

        # Calculate precision, recall and f1-score
        reports_per_iteration.append(
            classification_report(
                y_true_sample,
                y_pred_sample,
                labels=labels,
                output_dict=True,
                zero_division=np.nan,
            )
        )

    # --- mean and percentile interval per label and metric ---
    final_results_bootstrap = {}
    appearances = {}

    for label in labels:
        final_results_bootstrap[label] = {}
        scores = []
        for metric in metric_names:
            scores = [report[label][metric] for report in reports_per_iteration]
            scores = [score for score in scores if not np.isnan(score)]

            if scores:
                mean = np.mean(scores)
                lower = np.quantile(scores, (1-confidence_level)/2)
                upper = np.quantile(scores, 1-((1-confidence_level)/2))
            else:
                # The metric was undefined in every iteration: report NaN explicitly
                # instead of feeding an empty list to NumPy.
                mean = lower = upper = np.nan

            final_results_bootstrap[label][metric] = {
                'mean': mean,
                'lower': lower,
                'upper': upper,
            }

        # `scores` now holds the non-NaN values of the last metric (f1-score).
        appearances[label] = len(scores)

    columns = [
        'Label', 'Precision Mean', 'Precision Lower CI', 'Precision Upper CI',
        'Recall Mean', 'Recall Lower CI', 'Recall Upper CI',
        'F1-Score Mean', 'F1-Score Lower CI', 'F1-Score Upper CI'
    ]

    # One row per label: the label followed by mean/lower/upper of each metric.
    rows = []
    for label, metrics in final_results_bootstrap.items():
        row = [label]
        for metric in metric_names:
            row.extend([metrics[metric]['mean'], metrics[metric]['lower'], metrics[metric]['upper']])
        rows.append(row)

    df = pd.DataFrame(rows, columns=columns).set_index('Label')

    return df, appearances

# Rows to summarise: the class labels in their "<...>" form, followed by the names of
# the aggregate rows of the classification report.
labels = [
    "<H&E>",
    "<IHCplus>",
    "<IHC>",
    "<MOL>",
    "<CON>",
    "<ADV>",
    "<BRS>",
    "<RAD>",
    "<CLN>",
    "<HIS>",
    "<SID>",
    "<UNR>",
    "<CAL>",
    'micro avg',
    'macro avg',
    'weighted avg',
]

# NOTE(review): only 10 bootstrap iterations are drawn here, while the function's own
# default is 1000 - confirm this is not a leftover from a quick test run.
bootstrap_result, appearances = bootstrapper_p_medical_report(
    y_true,
    y_pred,
    n_bootstraps=10,
    labels=labels,
    include_all_labels_per_iteration=False,
)

In [None]:
# Render every float in displayed DataFrames with three decimals.
pd.options.display.float_format = "{:,.3f}".format

# Rows to show, in presentation order ('micro avg' is intentionally left out).
labels = [
    "<H&E>",
    "<IHC>",
    "<IHCplus>",
    "<MOL>",
    "<CLN>",
    "<HIS>",
    "<CON>",
    "<RAD>",
    "<BRS>",
    "<CAL>",
    "<ADV>",
    "<SID>",
    "<UNR>",
    "macro avg",
    "weighted avg"
]

# Select and order the rows in a single step, then remove all "<" and ">" from the
# index. This is the cell's final expression, so it is what gets rendered.
bootstrap_result.loc[labels].rename(index=lambda name: name.replace("<", "").replace(">", ""))

In [None]:
# Per label: the number of bootstrap iterations in which its F1-score was not NaN.
appearances