<a href="https://colab.research.google.com/github/RicardoPoleo/DeepLearning_FactChecker/blob/main/notebooks/Agents/Agent_1_Model_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading, infering and evaluating the models for benchmarking


In [None]:
from transformers import pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
from google.colab import data_table

# Define the models and prompts
model_names = [
    "dmis-lab/biobert-base-cased-v1.2",
    "dmis-lab/biobert-v1.1",
    "dmis-lab/biobert-base-cased-v1.1",
    "pritamdeka/BioBert-PubMed200kRCT",
    "microsoft/deberta-v3-base",
    "Clinical-AI-Apollo/Medical-NER"
]

prompts = [
  'taking vitamin c prevents the common cold.',
  'ibuprofen is effective for reducing fever.'
]

expected_entities = {
    'taking vitamin c prevents the common cold.': ["vitamin c", "common cold"],
    'ibuprofen is effective for reducing fever.': ['ibuprofen', 'fever'],
}

def load_model_for_token_classification(model_name, strategy='simple'):
    return pipeline("token-classification", model=model_name, aggregation_strategy=strategy)

def load_models(model_names, strategy='simple'):
    models_dict = {}
    for model_name in model_names:
        print(f"Loading model {model_name} with strategy {strategy}")
        model = load_model_for_token_classification(model_name, strategy=strategy)
        models_dict[model_name] = model
    print(f"Loaded {len(models_dict)} models with strategy {strategy}")
    return models_dict

def evaluate_models(models_dict, prompts):
    results = {}
    for model_name, model in models_dict.items():
        print(f"Evaluating model {model_name}")
        results[model_name] = {}
        for prompt in prompts:
            result = model(prompt)
            results[model_name][prompt] = result
    return results

def extract_entities(results):
    extracted_entities = {}
    for model_name, model_results in results.items():
        extracted_entities[model_name] = {}
        for prompt, entities in model_results.items():
            extracted = [(entity['entity_group'].lower(), entity['word'].lower()) for entity in entities]
            extracted_entities[model_name][prompt] = extracted
            # Debugging: Print extracted entities
            print(f"Model: {model_name}, Prompt: {prompt}, Extracted: {extracted}")
    return extracted_entities

def calculate_metrics(extracted_entities, expected_entities, strategy):
    metrics = {}
    for model_name, model_entities in extracted_entities.items():
        y_true = []
        y_pred = []
        for prompt, expected in expected_entities.items():
            pred_entities = model_entities.get(prompt, [])
            true_labels = expected  # Use expected directly as list of entities
            pred_labels = [entity for _, entity in pred_entities]

            # Debugging: Print true and predicted labels
            print(f"Model: {model_name}, Strategy: {strategy}, Prompt: {prompt}, True Labels: {true_labels}, Pred Labels: {pred_labels}")

            # Ensure lengths match
            min_length = min(len(true_labels), len(pred_labels))
            true_labels = true_labels[:min_length]
            pred_labels = pred_labels[:min_length]

            y_true.extend(true_labels)
            y_pred.extend(pred_labels)

        precision = precision_score(y_true, y_pred, average='macro', zero_division=1)
        recall = recall_score(y_true, y_pred, average='macro', zero_division=1)
        f1 = f1_score(y_true, y_pred, average='macro', zero_division=1)

        metrics[model_name] = {'precision': precision, 'recall': recall, 'f1': f1}

    return metrics

def metrics_to_dataframe(metrics_by_strategy):
    rows = []
    for strategy, metrics in metrics_by_strategy.items():
        for model_name, metric_values in metrics.items():
            rows.append({
                "Strategy": strategy,
                "Model": model_name,
                "Precision": metric_values['precision'],
                "Recall": metric_values['recall'],
                "F1 Score": metric_values['f1']
            })
    df = pd.DataFrame(rows)
    return df

# Define aggregation strategies
SIMPLE = "simple"
FIRST = "first"
AVERAGE = "average"
MAX = "max"

# Store models and results for each strategy
models_dict_by_strategy = {
    SIMPLE: {},
    FIRST: {},
    AVERAGE: {},
    MAX: {}
}

# Load models for each strategy
for strategy in [SIMPLE, FIRST, AVERAGE, MAX]:
    models_dict_by_strategy[strategy] = load_models(model_names, strategy=strategy)

# Evaluate models and calculate metrics for each strategy
results_by_strategy = {}
metrics_by_strategy = {}

for strategy, models_dict in models_dict_by_strategy.items():
    print(f"Evaluating models with strategy {strategy}")
    results = evaluate_models(models_dict, prompts)
    extracted_entities = extract_entities(results)
    metrics = calculate_metrics(extracted_entities, expected_entities, strategy)
    results_by_strategy[strategy] = results
    metrics_by_strategy[strategy] = metrics

# Convert metrics to a dataframe and display
df_metrics = metrics_to_dataframe(metrics_by_strategy)

# Display the dataframe as an interactive table
data_table.DataTable(df_metrics, include_index=False, num_rows_per_page=10)


Loading model dmis-lab/biobert-base-cased-v1.2 with strategy simple


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-v1.1 with strategy simple


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-base-cased-v1.1 with strategy simple


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model pritamdeka/BioBert-PubMed200kRCT with strategy simple
Loading model microsoft/deberta-v3-base with strategy simple


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model Clinical-AI-Apollo/Medical-NER with strategy simple
Loaded 6 models with strategy simple
Loading model dmis-lab/biobert-base-cased-v1.2 with strategy first


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-v1.1 with strategy first


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-base-cased-v1.1 with strategy first


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model pritamdeka/BioBert-PubMed200kRCT with strategy first
Loading model microsoft/deberta-v3-base with strategy first


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model Clinical-AI-Apollo/Medical-NER with strategy first
Loaded 6 models with strategy first
Loading model dmis-lab/biobert-base-cased-v1.2 with strategy average


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-v1.1 with strategy average


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-base-cased-v1.1 with strategy average


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model pritamdeka/BioBert-PubMed200kRCT with strategy average
Loading model microsoft/deberta-v3-base with strategy average


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model Clinical-AI-Apollo/Medical-NER with strategy average
Loaded 6 models with strategy average
Loading model dmis-lab/biobert-base-cased-v1.2 with strategy max


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-v1.1 with strategy max


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model dmis-lab/biobert-base-cased-v1.1 with strategy max


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model pritamdeka/BioBert-PubMed200kRCT with strategy max
Loading model microsoft/deberta-v3-base with strategy max


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model Clinical-AI-Apollo/Medical-NER with strategy max


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Loaded 6 models with strategy max
Evaluating models with strategy simple
Evaluating model dmis-lab/biobert-base-cased-v1.2


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model dmis-lab/biobert-v1.1
Evaluating model dmis-lab/biobert-base-cased-v1.1
Evaluating model pritamdeka/BioBert-PubMed200kRCT
Evaluating model microsoft/deberta-v3-base


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model Clinical-AI-Apollo/Medical-NER


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model: dmis-lab/biobert-base-cased-v1.2, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-base-cased-v1.2, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_0', 'ibup'), ('label_1', '##ro'), ('label_0', '##fen is effective for reducing fever.')]
Model: dmis-lab/biobert-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin'), ('label_1', 'c prevents the'), ('label_0', 'common'), ('label_1', 'cold.')]
Model: dmis-lab/biobert-v1.1, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_1', 'ibu'), ('label_0', '##pro'), ('label_1', '##fen is effective'), ('label_0', 'for'), ('label_1', 'reducing fever.')]
Model: dmis-lab/biobert-base-cased-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-base-cased-v1.1, Promp

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model pritamdeka/BioBert-PubMed200kRCT
Evaluating model microsoft/deberta-v3-base


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model Clinical-AI-Apollo/Medical-NER


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model: dmis-lab/biobert-base-cased-v1.2, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_1', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-base-cased-v1.2, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_1', 'ibuprofen is effective for reducing fever.')]
Model: dmis-lab/biobert-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_1', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-v1.1, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_1', 'ibuprofen is effective for reducing fever.')]
Model: dmis-lab/biobert-base-cased-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-base-cased-v1.1, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_1', 'ibuprofen'), ('label_0', 'is effective for reducing fever.')]
Model: pritamdeka/BioB

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model pritamdeka/BioBert-PubMed200kRCT
Evaluating model microsoft/deberta-v3-base


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model Clinical-AI-Apollo/Medical-NER


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Model: dmis-lab/biobert-base-cased-v1.2, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin c prevents the common cold.')]
Model: dmis-lab/biobert-base-cased-v1.2, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_0', 'ibuprofen is'), ('label_1', 'effective'), ('label_0', 'for reducing fever.')]
Model: dmis-lab/biobert-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_1', 'taking'), ('label_0', 'vitamin c'), ('label_1', 'prevents the common cold'), ('label_0', '.')]
Model: dmis-lab/biobert-v1.1, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_0', 'ibuprofen is effective for'), ('label_1', 'reducing'), ('label_0', 'fever.')]
Model: dmis-lab/biobert-base-cased-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_1', 'taking'), ('label_0', 'vitamin'), ('label_1', 'c prevents the common'), ('label_0', 'cold'), ('label_1', '.')]
Model: dmis-lab/biobert-

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model pritamdeka/BioBert-PubMed200kRCT
Evaluating model microsoft/deberta-v3-base


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Evaluating model Clinical-AI-Apollo/Medical-NER
Model: dmis-lab/biobert-base-cased-v1.2, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_1', 'taking vitamin'), ('label_0', 'c'), ('label_1', 'prevents'), ('label_0', 'the common'), ('label_1', 'cold'), ('label_0', '.')]
Model: dmis-lab/biobert-base-cased-v1.2, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_1', 'ibuprofen'), ('label_0', 'is'), ('label_1', 'effective for reducing'), ('label_0', 'fever.')]
Model: dmis-lab/biobert-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking vitamin'), ('label_1', 'c'), ('label_0', 'prevents the common cold.')]
Model: dmis-lab/biobert-v1.1, Prompt: ibuprofen is effective for reducing fever., Extracted: [('label_0', 'ibuprofen is effective'), ('label_1', 'for'), ('label_0', 'reducing fever.')]
Model: dmis-lab/biobert-base-cased-v1.1, Prompt: taking vitamin c prevents the common cold., Extracted: [('label_0', 'taking 

Unnamed: 0,Strategy,Model,Precision,Recall,F1 Score
0,simple,dmis-lab/biobert-base-cased-v1.2,0.5,0.5,0.0
1,simple,dmis-lab/biobert-v1.1,0.5,0.5,0.0
2,simple,dmis-lab/biobert-base-cased-v1.1,0.5,0.5,0.0
3,simple,pritamdeka/BioBert-PubMed200kRCT,0.5,0.5,0.0
4,simple,microsoft/deberta-v3-base,0.5,0.5,0.0
5,simple,Clinical-AI-Apollo/Medical-NER,0.666667,0.666667,0.333333
6,first,dmis-lab/biobert-base-cased-v1.2,0.5,0.5,0.0
7,first,dmis-lab/biobert-v1.1,0.5,0.5,0.0
8,first,dmis-lab/biobert-base-cased-v1.1,0.6,0.6,0.2
9,first,pritamdeka/BioBert-PubMed200kRCT,0.5,0.5,0.0


# Loading and calculating basic metrics

In [None]:
import pandas as pd
from google.colab import data_table
df_metrics = metrics_to_dataframe(metrics_by_strategy)
data_table.DataTable(df_metrics, include_index=False, num_rows_per_page=10)

Unnamed: 0,Strategy,Model,Precision,Recall,F1 Score
0,simple,dmis-lab/biobert-base-cased-v1.2,0.6,0.4,0.0
1,simple,dmis-lab/biobert-v1.1,0.6,0.4,0.0
2,simple,dmis-lab/biobert-base-cased-v1.1,0.666667,0.333333,0.0
3,simple,pritamdeka/BioBert-PubMed200kRCT,0.666667,0.333333,0.0
4,simple,microsoft/deberta-v3-base,0.666667,0.333333,0.0
5,simple,Clinical-AI-Apollo/Medical-NER,0.5,0.5,0.0
6,first,dmis-lab/biobert-base-cased-v1.2,0.6,0.4,0.0
7,first,dmis-lab/biobert-v1.1,0.666667,0.333333,0.0
8,first,dmis-lab/biobert-base-cased-v1.1,0.666667,0.333333,0.0
9,first,pritamdeka/BioBert-PubMed200kRCT,0.666667,0.333333,0.0



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

