In [None]:
"""
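Notebook to evaluate multiple models: every model folder found in ../models is validated on the
training and validation data, and accuracy, weighted F1-score and macro F1-score are compared.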
"""

In [None]:
# If you have added new models, delete all files in the ../data/predictions folder before running this notebook.

In [None]:
from pathlib import Path
from utils import TextSegmentationValidator
import pandas as pd

In [None]:
# Working directory at start-up; used as the anchor for the ../models lookup below
# and handed to every TextSegmentationValidator.
BASE_DIR = Path().resolve()

# Label set passed to every TextSegmentationValidator.
labels = [
    "H&E", "IHCplus", "IHC", "MOL", "CON", "ADV", "BRS", "RAD", "CLN",
    "HIS", "SID", "UNR", "CAL"
]

# Find all models that are in the model folder (one sub-directory per model).
# iterdir() yields entries in arbitrary, filesystem-dependent order, so the names
# are sorted to keep the evaluation order and the resulting tables reproducible.
models = sorted(
    model.name
    for model in BASE_DIR.joinpath("../models").iterdir()
    if model.is_dir()
)

print(models)

In [None]:
# One validator per model folder, kept in the same order as `models`.
evaluator_list = []
# Filled by the validation cell: results[model][dataset][metric] -> score.
results = {}

for model in models:
    # Everything from the first underscore of the folder name onwards.
    PREPROCESS_CONFIGURATION = model[model.find("_"):]
    validator_key = f"evaluator{PREPROCESS_CONFIGURATION}"

    validator = TextSegmentationValidator(
        labels,
        model,
        BASE_DIR,
        "../models",
        "../data/preprocessed_data",
        "../data/predictions",
    )

    # Expose the validator as a notebook-level variable named after its
    # configuration and also collect it for the validation loop.
    # NOTE(review): a folder suffix such as "_list" would overwrite
    # `evaluator_list` itself through this dynamic assignment.
    globals()[validator_key] = validator
    evaluator_list.append(validator)

In [None]:
# Validate every model on both datasets and keep the three headline metrics
# per (model, dataset) pair in `results`.
for evaluator, model in zip(evaluator_list, models):
    for dataset in ('training', 'validation'):
        evaluator.validate(dataset)

    model_key = str(model)
    results[model_key] = {}

    for dataset in ('training', 'validation'):
        dataset_results = evaluator.evaluation_results[dataset]
        report = dataset_results['classification_report']
        results[model_key][dataset] = {
            'weighted f1-score': report['weighted avg']['f1-score'],
            'macro f1-score': report['macro avg']['f1-score'],
            'accuracy': dataset_results['accuracy'],
        }

In [None]:
# Flatten results[model][dataset] -> metrics into one record per (model, dataset),
# the list-of-dicts layout that pd.DataFrame accepts directly.
df_data = [
    {'Model': model_name, 'Dataset': dataset_name, **dataset_metrics}
    for model_name, per_dataset in results.items()
    for dataset_name, dataset_metrics in per_dataset.items()
]

In [None]:
df_results = pd.DataFrame(df_data)

# The model folder name encodes the configuration, separated by "_":
# field 1 -> Context (integer), field 3 -> Header flag ("true" means True),
# fields 4 and up -> Oversample tag.
name_fields = df_results['Model'].map(lambda name: name.split("_"))

df_results = (
    df_results
    .assign(
        Context=name_fields.map(lambda fields: fields[1]).astype(int),
        Header=name_fields.map(lambda fields: fields[3] == "true"),
        Oversample=name_fields.map(lambda fields: "_".join(fields[4:])),
    )
    .drop(columns=['Model'])
    .sort_values(by=['Context', 'Header', 'Dataset'])
    .set_index(['Context', 'Header', 'Oversample', 'Dataset'])
)
df_results

In [None]:
# Wide comparison table: one row per (Context, Oversample), metric columns spread
# over Header and Dataset. Oversample is part of the index so that the string tag
# is not pivoted as if it were a metric, and so that models differing only in
# their oversampling setting do not produce duplicate index entries (on which
# unstack raises ValueError).
df_results.reset_index().set_index(['Context', 'Oversample', 'Dataset', 'Header']).unstack().unstack()

In [None]:
# Cross-sections of the results for each dataset, with the remaining index
# levels turned back into ordinary columns.
df_train, df_val = (
    df_results.xs(dataset_name, level="Dataset").reset_index()
    for dataset_name in ("training", "validation")
)

In [None]:
# Split the results by the Header flag: models with headers vs. models without.
df_results_header, df_results_no_header = (
    df_results.xs(header_flag, level='Header')
    for header_flag in (True, False)
)

In [None]:
# One single-metric frame per score for the models with headers; the index
# levels become plain columns so they can be used as plot variables.
df_results_header_acc = df_results_header.loc[:, ['accuracy']].reset_index()
df_results_header_f1 = df_results_header.loc[:, ['weighted f1-score']].reset_index()
df_results_header_macro_f1 = df_results_header.loc[:, ['macro f1-score']].reset_index()

In [None]:
# One single-metric frame per score for the models without headers; the index
# levels become plain columns so they can be used as plot variables.
df_results_no_header_acc = df_results_no_header.loc[:, ['accuracy']].reset_index()
df_results_no_header_f1 = df_results_no_header.loc[:, ['weighted f1-score']].reset_index()
df_results_no_header_macro_f1 = df_results_no_header.loc[:, ['macro f1-score']].reset_index()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Weighted F1-score: left panel for models including headers, right panel for
# models excluding headers; bars are grouped by Context and coloured by Dataset.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Best score per dataset, drawn below as dashed reference lines.
max_train_header_f1 = df_results_header_f1.loc[df_results_header_f1['Dataset'] == 'training', 'weighted f1-score'].max()
max_valid_header_f1 = df_results_header_f1.loc[df_results_header_f1['Dataset'] == 'validation', 'weighted f1-score'].max()

max_train_no_header_f1 = df_results_no_header_f1.loc[df_results_no_header_f1['Dataset'] == 'training', 'weighted f1-score'].max()
max_valid_no_header_f1 = df_results_no_header_f1.loc[df_results_no_header_f1['Dataset'] == 'validation', 'weighted f1-score'].max()

# (axes, data, title, best training score, best validation score) per panel.
panels = [
    (ax1, df_results_header_f1, 'Weighted F1-score for model including headers',
     max_train_header_f1, max_valid_header_f1),
    (ax2, df_results_no_header_f1, 'Weighted F1-score for model excluding headers',
     max_train_no_header_f1, max_valid_no_header_f1),
]

for ax, panel_data, panel_title, max_train, max_valid in panels:
    sns.barplot(data=panel_data, x='Context', y='weighted f1-score', hue='Dataset', palette='coolwarm', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Context')
    ax.set_ylabel('Weighted F1-score')
    # Horizontal lines for the highest training and validation scores.
    ax.axhline(max_train, color='blue', linestyle='--', label='Max Training Weighted F1-score')
    ax.axhline(max_valid, color='orange', linestyle='--', label='Max Validation Weighted F1-score')
    # The legend is built from the labelled artists present when legend() is
    # called, so it must come after the reference lines for their labels to show.
    ax.legend(title='Dataset', loc='lower right')
    ax.set_ylim(0.75, 1)

plt.show()

In [None]:
# Macro F1-score: left panel for models including headers, right panel for
# models excluding headers; bars are grouped by Context and coloured by Dataset.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Best score per dataset, drawn below as dashed reference lines.
max_train_header_macro_f1 = df_results_header_macro_f1.loc[df_results_header_macro_f1['Dataset'] == 'training', 'macro f1-score'].max()
max_valid_header_macro_f1 = df_results_header_macro_f1.loc[df_results_header_macro_f1['Dataset'] == 'validation', 'macro f1-score'].max()

max_train_no_header_macro_f1 = df_results_no_header_macro_f1.loc[df_results_no_header_macro_f1['Dataset'] == 'training', 'macro f1-score'].max()
max_valid_no_header_macro_f1 = df_results_no_header_macro_f1.loc[df_results_no_header_macro_f1['Dataset'] == 'validation', 'macro f1-score'].max()

# (axes, data, title, best training score, best validation score) per panel.
panels = [
    (ax1, df_results_header_macro_f1, 'Macro F1-score for model including headers',
     max_train_header_macro_f1, max_valid_header_macro_f1),
    (ax2, df_results_no_header_macro_f1, 'Macro F1-score for model excluding headers',
     max_train_no_header_macro_f1, max_valid_no_header_macro_f1),
]

for ax, panel_data, panel_title, max_train, max_valid in panels:
    sns.barplot(data=panel_data, x='Context', y='macro f1-score', hue='Dataset', palette='coolwarm', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Context')
    ax.set_ylabel('Macro F1-score')
    # Horizontal lines for the highest training and validation scores.
    ax.axhline(max_train, color='blue', linestyle='--', label='Max Training Macro F1-score')
    ax.axhline(max_valid, color='orange', linestyle='--', label='Max Validation Macro F1-score')
    # The legend is built from the labelled artists present when legend() is
    # called, so it must come after the reference lines for their labels to show.
    ax.legend(title='Dataset', loc='lower right')
    ax.set_ylim(0.75, 1)

plt.show()

In [None]:
# Accuracy: left panel for models including headers, right panel for models
# excluding headers; bars are grouped by Context and coloured by Dataset.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Best score per dataset, drawn below as dashed reference lines.
max_train_header_acc = df_results_header_acc.loc[df_results_header_acc['Dataset'] == 'training', 'accuracy'].max()
max_valid_header_acc = df_results_header_acc.loc[df_results_header_acc['Dataset'] == 'validation', 'accuracy'].max()

max_train_no_header_acc = df_results_no_header_acc.loc[df_results_no_header_acc['Dataset'] == 'training', 'accuracy'].max()
max_valid_no_header_acc = df_results_no_header_acc.loc[df_results_no_header_acc['Dataset'] == 'validation', 'accuracy'].max()

# (axes, data, title, best training score, best validation score) per panel.
panels = [
    (ax1, df_results_header_acc, 'Accuracy for model including headers',
     max_train_header_acc, max_valid_header_acc),
    (ax2, df_results_no_header_acc, 'Accuracy for model excluding headers',
     max_train_no_header_acc, max_valid_no_header_acc),
]

for ax, panel_data, panel_title, max_train, max_valid in panels:
    sns.barplot(data=panel_data, x='Context', y='accuracy', hue='Dataset', palette='coolwarm', ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Context')
    ax.set_ylabel('Accuracy')
    # Horizontal lines for the highest training and validation scores.
    ax.axhline(max_train, color='blue', linestyle='--', label='Max Training Accuracy')
    ax.axhline(max_valid, color='orange', linestyle='--', label='Max Validation Accuracy')
    # The legend is built from the labelled artists present when legend() is
    # called, so it must come after the reference lines for their labels to show.
    ax.legend(title='Dataset', loc='lower right')
    ax.set_ylim(0.75, 1)

plt.show()