In [15]:
import re
import random
import os
import json
import evaluate

from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler, RobertaTokenizerFast
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

## Combine all .txt and .ann files and combine them per medicine

In [16]:
# Base folders containing annotation and text files
annotations_folder = 'annotations/'
original_texts_folder = 'originaltexts/'
output_folder = 'output_datasets/'
os.makedirs(output_folder, exist_ok=True)

# Group files by medicine
file_groups = {}
for file_name in os.listdir(annotations_folder):
    if file_name.endswith('.ann'):
        base_name = '.'.join(file_name.split('.')[:-1])
        medicine = base_name.rsplit('.', 1)[0]
        file_groups.setdefault(medicine, []).append(file_name)

# Process each group
for medicine, ann_files in file_groups.items():
    combined_output = []

    for ann_file in ann_files:
        txt_file = ann_file.replace('.ann', '.txt')
        txt_path = os.path.join(original_texts_folder, txt_file)
        ann_path = os.path.join(annotations_folder, ann_file)

        # Ensure the corresponding .txt file exists
        if not os.path.exists(txt_path):
            raise FileNotFoundError(f"Text file not found for annotation file {ann_file}")

        # Read the content of the .ann and .txt files
        with open(ann_path, 'r') as ann_f:
            ann_lines = ann_f.readlines()

        with open(txt_path, 'r') as txt_f:
            txt_content = txt_f.read()

        # Parse annotations and filter out AnnotatorNotes
        annotations = []
        for line in ann_lines:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                if len(parts) == 3:
                    tag_info, word = parts[1], parts[2]
                    tag_parts = tag_info.split()
                    if len(tag_parts) >= 3:
                        tag = tag_parts[0]
                        try:
                            start_idx = int(tag_parts[1])
                            end_idx = int(tag_parts[2])
                        except ValueError:
                            if ';' in tag_parts[2]:  # Handle ranges like '742;763'
                                start_idx = int(tag_parts[1])
                                end_idx = int(tag_parts[2].split(';')[-1])
                            else:
                                raise ValueError(f"Unexpected annotation format: {tag_parts}")
                        annotations.append((start_idx, end_idx, tag, word))

        # Sort annotations by start index
        annotations.sort(key=lambda x: x[0])

        # Generate output format
        output = []
        current_idx = 0
        for start_idx, end_idx, tag, word in annotations:
            # Add text between the last annotation and the current annotation as "O"
            if current_idx < start_idx:
                intervening_text = txt_content[current_idx:start_idx]
                for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", intervening_text):
                    output.append(f"{token} O")

            # Add the annotated word with its tag
            for i, token in enumerate(word.split()):
                tag_prefix = 'B-' if i == 0 else 'I-'
                output.append(f"{token} {tag_prefix}{tag}")

            current_idx = end_idx

        # Add remaining text as "O"
        if current_idx < len(txt_content):
            remaining_text = txt_content[current_idx:]
            for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", remaining_text):
                output.append(f"{token} O")

        # Add to combined output with a newline separator
        combined_output.extend(output)
        combined_output.append('')  # Empty line between posts

    # Write combined output to file
    combined_output_text = '\n'.join(combined_output).strip()
    output_file = os.path.join(output_folder, f"{medicine}_combined_output.txt")
    with open(output_file, 'w') as out_f:
        out_f.write(combined_output_text)

    print(f"Combined output saved for {medicine} in {output_file}")

Combined output saved for ARTHROTEC in output_datasets/ARTHROTEC_combined_output.txt
Combined output saved for CAMBIA in output_datasets/CAMBIA_combined_output.txt
Combined output saved for CATAFLAM in output_datasets/CATAFLAM_combined_output.txt
Combined output saved for DICLOFENAC-POTASSIUM in output_datasets/DICLOFENAC-POTASSIUM_combined_output.txt
Combined output saved for DICLOFENAC-SODIUM in output_datasets/DICLOFENAC-SODIUM_combined_output.txt
Combined output saved for FLECTOR in output_datasets/FLECTOR_combined_output.txt
Combined output saved for LIPITOR in output_datasets/LIPITOR_combined_output.txt
Combined output saved for PENNSAID in output_datasets/PENNSAID_combined_output.txt
Combined output saved for SOLARAZE in output_datasets/SOLARAZE_combined_output.txt
Combined output saved for VOLTAREN-XR in output_datasets/VOLTAREN-XR_combined_output.txt
Combined output saved for VOLTAREN in output_datasets/VOLTAREN_combined_output.txt
Combined output saved for ZIPSOR in output_da

## Combine all the medicine files into one dataset

In [17]:
# Folder containing all combined output files
output_datasets_folder = 'output_datasets/'
final_output_file = 'final_dataset.txt'

# Ensure the folder exists
if not os.path.exists(output_datasets_folder):
    raise FileNotFoundError(f"The folder {output_datasets_folder} does not exist.")

# List all files in the folder
output_files = [f for f in os.listdir(output_datasets_folder) if f.endswith('_combined_output.txt')]

# Combine all files into a single final dataset
final_dataset = []
for file_name in output_files:
    file_path = os.path.join(output_datasets_folder, file_name)
    with open(file_path, 'r') as f:
        content = f.read().strip()  # Read and strip any trailing spaces or newlines
        final_dataset.append(content)

    # Add an empty line to separate posts from different files
    final_dataset.append('')

# Write the combined dataset to the final output file
with open(final_output_file, 'w') as f:
    f.write('\n'.join(final_dataset).strip())  # Ensure no extra trailing newline

print(f"Final dataset saved to {final_output_file}")

Final dataset saved to final_dataset.txt


## Read the final dataset into the Iob dataset format

In [18]:
def read_iob_file(file_path):
    """Reads an IOB file from filepath and returns sentences with tokens and tags."""
    sentences = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:  # If line is not empty
                token, tag = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(tag)

            else:
                # End of a sentence
                if sentence_tokens:
                    sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
                    sentence_tokens = []
                    sentence_labels = []
        # Add the last sentence if file doesn't end with a newline
        if sentence_tokens:
            sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
    return sentences

def create_dataset_from_final_file(final_file_path):
    """Create a dataset from a single IOB file and return it as a DatasetDict."""

    if not os.path.exists(final_file_path):
        raise FileNotFoundError(f"The file {final_file_path} does not exist.")

    # Parse the file
    data = read_iob_file(final_file_path)

    # Define the label names and ClassLabel feature
    unique_labels = sorted(set(tag for d in data for tag in d["ner_tags"]))
    label_feature = ClassLabel(names=unique_labels)

    # Define the Features schema for Hugging Face datasets
    features = Features({
        'tokens': Sequence(Value("string")),
        'ner_tags': Sequence(label_feature)
    })

    # Convert data into a Dataset
    dataset = Dataset.from_list(data).cast(features)

    # Create a DatasetDict
    dataset_dict = DatasetDict({"full_data": dataset})

    return dataset_dict


In [19]:
final_dataset_path = "final_dataset.txt"
dataset_dict = create_dataset_from_final_file(final_dataset_path)
dataset = dataset_dict['full_data']

Casting the dataset: 100%|██████████| 1248/1248 [00:00<00:00, 6189.84 examples/s]


## Tokenize and align labels, also add datacollator

In [20]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            # Special token
            new_labels.append(-100)

        else:
            # Same word as previous token
            label = labels[word_id]

            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True,
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

## Dataset generators

In [21]:
def generate_train_datasets(dataset, number_of_samples, number_of_splits):
    """
    Generates train datasets by sampling from the given dataset based on the number of samples and splits.

    Args:
        dataset (Dataset): The base dataset to sample from.
        number_of_samples (int): Number of samples per dataset.
        number_of_splits (int): Number of datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated datasets with their names and indices.
    """
    datasets = []

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the dataset
        indices = list(range(len(dataset)))
        random.shuffle(indices)
        sampled_indices = indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"train_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [22]:
def generate_validation_datasets(dataset, train_indices, number_of_samples, number_of_splits):
    """
    Generates validation datasets by sampling from the given dataset, ensuring no overlap with training data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        number_of_samples (int): Number of samples per validation dataset.
        number_of_splits (int): Number of validation datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated validation datasets with names and indices.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices))  # Exclude training indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"val_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [23]:
def generate_test_datasets(dataset, train_indices, val_indices, number_of_samples, number_of_splits):
    """
    Generates test datasets by sampling from the given dataset, ensuring no overlap with training or validation data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        val_indices (List[int]): Indices of the validation dataset to exclude from sampling.
        number_of_samples (int): Number of samples per test dataset.
        number_of_splits (int): Number of test datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset]]: List of generated test datasets with names.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices) - set(val_indices))  # Exclude train and val indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name
        datasets.append((f"test_dataset_{number_of_samples}_{seed}", sampled_dataset))

    return datasets

### Example usage

In [24]:
# Step 1: Generate Train Dataset
# train_datasets = generate_train_datasets(dataset, number_of_samples=30, number_of_splits=1)
# train_name, train_dataset, train_indices = train_datasets[0]
# print(f"{train_name}: {len(train_dataset)} samples")
#
# # Step 2: Generate Validation Dataset
# val_datasets = generate_validation_datasets(dataset, train_indices, number_of_samples=30, number_of_splits=1)
# val_name, val_dataset, val_indices = val_datasets[0]
# print(f"{val_name}: {len(val_dataset)} samples")
#
# # Step 3: Generate Test Dataset
# test_datasets = generate_test_datasets(dataset, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
# test_name, test_dataset = test_datasets[0]
# print(f"{test_name}: {len(test_dataset)} samples")

# Prepping for training

In [25]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

label_names = dataset.features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [26]:
def create_dataset_given_model(tokenizer_):
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator_ = DataCollatorForTokenClassification(tokenizer=tokenizer_)

    # Tokenize and align labels
    tokenized_dataset_ = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

    # Step 1: Generate Train Dataset
    train_datasets = generate_train_datasets(tokenized_dataset_, number_of_samples=30, number_of_splits=1)
    train_name, train_dataset, train_indices = train_datasets[0]
    print(f"{train_name}: {len(train_dataset)} samples")

    # Step 2: Generate Validation Dataset
    val_datasets = generate_validation_datasets(tokenized_dataset_, train_indices, number_of_samples=30, number_of_splits=1)
    val_name, val_dataset, val_indices = val_datasets[0]
    print(f"{val_name}: {len(val_dataset)} samples")

    # Step 3: Generate Test Dataset
    test_datasets = generate_test_datasets(tokenized_dataset_, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
    test_name, test_dataset = test_datasets[0]
    print(f"{test_name}: {len(test_dataset)} samples")

    return data_collator_, train_dataset, val_dataset, test_dataset


In [27]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Flatten predictions and labels, removing ignored indices
    true_labels = [label for label_seq in labels for label in label_seq if label != -100]
    true_predictions = [pred for pred_seq, label_seq in zip(predictions, labels)
                        for pred, label in zip(pred_seq, label_seq) if label != -100]
    return true_labels, true_predictions


# Training and evaluation

In [28]:
# Define the models and their corresponding sizes
models = {
    "small": "bert-base-cased",
    "medium": "bert-large-cased",
    "large": "roberta-large"
}

# Define hyperparameter grid
param_grid = {
    "learning_rate": [5e-6, 2e-5, 5e-5],
    "batch_size": [8, 16],
    "weight_decay": [0.0, 0.01]
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over models
for size, model_name in models.items():
    print(f"\nTuning and evaluating {model_name} ({size})...")
    best_f1 = 0.0
    best_hyperparameters = None
    model_save_path = f"saved_models/{model_name}"
    os.makedirs(model_save_path, exist_ok=True)
    if size == "large":
        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large", add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Hyperparameter tuning loop
    for params in ParameterGrid(param_grid):
        print(f"\nHyperparameters: {params}")

        # Create datasets and data collator for the current model
        data_collator, tokenized_train, tokenized_val, tokenized_test = create_dataset_given_model(tokenizer)

        # Create DataLoaders
        train_dataloader = DataLoader(tokenized_train, batch_size=params["batch_size"], shuffle=True, collate_fn=data_collator)
        val_dataloader = DataLoader(tokenized_val, batch_size=params["batch_size"], collate_fn=data_collator)
        test_dataloader = DataLoader(tokenized_test, batch_size=params["batch_size"], collate_fn=data_collator)

        # Initialize the model for token classification
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, id2label=id2label, label2id=label2id
        )

        # Set up optimizer and learning rate scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]
        )
        num_train_epochs = 3
        num_update_steps_per_epoch = len(train_dataloader)
        num_training_steps = num_train_epochs * num_update_steps_per_epoch
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        # Use the accelerator for distributed training
        accelerator = Accelerator()
        model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader
        )

        # Training loop
        for epoch in range(num_train_epochs):
            print(f"Epoch {epoch + 1}/{num_train_epochs}")
            model.train()
            total_loss = 0
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
            for batch in progress_bar:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.item()

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                progress_bar.set_postfix(loss=loss.item())

            print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

            # Validation loop
            model.eval()
            val_predictions, val_labels = [], []
            with torch.no_grad():
                for batch in val_dataloader:
                    # Move batch data to the same device as the model
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)
                    logits = outputs.logits
                    predictions = logits.argmax(dim=-1)
                    labels = batch["labels"]

                    # Handle padding across processes for multi-GPU
                    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                    predictions_gathered = accelerator.gather(predictions)
                    labels_gathered = accelerator.gather(labels)

                    # Postprocess to get flattened labels and predictions
                    flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                    val_labels.extend(flat_labels)
                    val_predictions.extend(flat_predictions)

            # Calculate validation metrics
            precision, recall, f1, _ = precision_recall_fscore_support(
                val_labels, val_predictions, average="micro"  # 'micro' aggregates across all classes
            )
            print(f"Validation Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Test loop for evaluation after training
        model.eval()
        test_predictions, test_labels = [], []
        with torch.no_grad():
            for batch in test_dataloader:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = logits.argmax(dim=-1)
                labels = batch["labels"]

                # Handle padding across processes for multi-GPU
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                predictions_gathered = accelerator.gather(predictions)
                labels_gathered = accelerator.gather(labels)

                # Postprocess to get flattened labels and predictions
                flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                test_labels.extend(flat_labels)
                test_predictions.extend(flat_predictions)

        # Calculate test metrics
        precision, recall, f1, _ = precision_recall_fscore_support(
            test_labels, test_predictions, average="micro"  # 'micro' aggregates across all classes
        )
        print(f"Test Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Save the best model based on test F1-score
        if f1 > best_f1:
            print(f"New best model found for {model_name} with F1: {f1:.4f}")
            best_f1 = f1
            best_hyperparameters = params

            # Overwrite the saved model
            accelerator.unwrap_model(model).save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)

            # Save the best hyperparameters to a JSON file in the model's folder
            best_config_path = os.path.join(model_save_path, "best_config.json")
            with open(best_config_path, "w") as f:
                json.dump({
                    "best_hyperparameters": best_hyperparameters,
                    "best_f1": best_f1
                }, f, indent=4)

    print(f"\nBest Model for {model_name} ({size}): {model_save_path}")
    print(f"Best Hyperparameters: {best_hyperparameters}")
    print(f"Best F1-Score: {best_f1:.4f}")



Tuning and evaluating bert-base-cased (small)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4320.48 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:05<00:00,  1.37s/it, loss=2.11]


Epoch 1 Loss: 8.8974
Validation Metrics (Precision: 0.4260, Recall: 0.4260, F1: 0.4260)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:05<00:00,  1.45s/it, loss=2.02]


Epoch 2 Loss: 8.1820
Validation Metrics (Precision: 0.6050, Recall: 0.6050, F1: 0.6050)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:06<00:00,  1.58s/it, loss=1.93]


Epoch 3 Loss: 7.7886
Validation Metrics (Precision: 0.6470, Recall: 0.6470, F1: 0.6470)
Test Metrics (Precision: 0.6664, Recall: 0.6664, F1: 0.6664)
New best model found for bert-base-cased with F1: 0.6664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5177.55 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:09<00:00,  2.36s/it, loss=2.17]


Epoch 1 Loss: 9.0584
Validation Metrics (Precision: 0.3496, Recall: 0.3496, F1: 0.3496)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:09<00:00,  2.48s/it, loss=2.06]


Epoch 2 Loss: 8.3081
Validation Metrics (Precision: 0.5609, Recall: 0.5609, F1: 0.5609)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it, loss=1.98]


Epoch 3 Loss: 7.8983
Validation Metrics (Precision: 0.6168, Recall: 0.6168, F1: 0.6168)
Test Metrics (Precision: 0.6146, Recall: 0.6146, F1: 0.6146)

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4900.46 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:05<00:00,  1.41s/it, loss=1.71]


Epoch 1 Loss: 8.1648
Validation Metrics (Precision: 0.7446, Recall: 0.7446, F1: 0.7446)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:05<00:00,  1.39s/it, loss=1.26]


Epoch 2 Loss: 5.7123
Validation Metrics (Precision: 0.7473, Recall: 0.7473, F1: 0.7473)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it, loss=1.21] 


Epoch 3 Loss: 4.6537
Validation Metrics (Precision: 0.7473, Recall: 0.7473, F1: 0.7473)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)
New best model found for bert-base-cased with F1: 0.7660

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4203.71 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:07<00:00,  1.90s/it, loss=1.96]


Epoch 1 Loss: 9.1255
Validation Metrics (Precision: 0.6978, Recall: 0.6978, F1: 0.6978)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it, loss=1.16]


Epoch 2 Loss: 6.0687
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:08<00:00,  2.25s/it, loss=1.25]


Epoch 3 Loss: 4.7102
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for bert-base-cased with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3198.68 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s, loss=0.938]


Epoch 1 Loss: 6.9722
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:07<00:00,  1.83s/it, loss=1.33] 


Epoch 2 Loss: 3.9562
Validation Metrics (Precision: 0.7490, Recall: 0.7490, F1: 0.7490)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:05<00:00,  1.31s/it, loss=0.564]


Epoch 3 Loss: 3.2297
Validation Metrics (Precision: 0.7547, Recall: 0.7547, F1: 0.7547)
Test Metrics (Precision: 0.7704, Recall: 0.7704, F1: 0.7704)
New best model found for bert-base-cased with F1: 0.7704

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4509.08 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:08<00:00,  2.16s/it, loss=1.17]


Epoch 1 Loss: 6.8171
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:09<00:00,  2.27s/it, loss=0.999]


Epoch 2 Loss: 3.7042
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:08<00:00,  2.19s/it, loss=0.773]


Epoch 3 Loss: 3.1680
Validation Metrics (Precision: 0.7497, Recall: 0.7497, F1: 0.7497)
Test Metrics (Precision: 0.7674, Recall: 0.7674, F1: 0.7674)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4494.75 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:07<00:00,  3.54s/it, loss=2.47]


Epoch 1 Loss: 5.0147
Validation Metrics (Precision: 0.0727, Recall: 0.0727, F1: 0.0727)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:06<00:00,  3.39s/it, loss=2.36]


Epoch 2 Loss: 4.7590
Validation Metrics (Precision: 0.1423, Recall: 0.1423, F1: 0.1423)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:07<00:00,  3.79s/it, loss=2.31]


Epoch 3 Loss: 4.6152
Validation Metrics (Precision: 0.1800, Recall: 0.1800, F1: 0.1800)
Test Metrics (Precision: 0.1801, Recall: 0.1801, F1: 0.1801)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4603.60 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:08<00:00,  4.48s/it, loss=2.38]


Epoch 1 Loss: 4.8156
Validation Metrics (Precision: 0.1376, Recall: 0.1376, F1: 0.1376)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:10<00:00,  5.10s/it, loss=2.28]


Epoch 2 Loss: 4.5996
Validation Metrics (Precision: 0.2079, Recall: 0.2079, F1: 0.2079)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:09<00:00,  4.82s/it, loss=2.22]


Epoch 3 Loss: 4.4660
Validation Metrics (Precision: 0.2379, Recall: 0.2379, F1: 0.2379)
Test Metrics (Precision: 0.2323, Recall: 0.2323, F1: 0.2323)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4749.61 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:04<00:00,  2.25s/it, loss=2.18]


Epoch 1 Loss: 4.5874
Validation Metrics (Precision: 0.5945, Recall: 0.5945, F1: 0.5945)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:06<00:00,  3.12s/it, loss=1.84]


Epoch 2 Loss: 3.7794
Validation Metrics (Precision: 0.7312, Recall: 0.7312, F1: 0.7312)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:07<00:00,  3.67s/it, loss=1.57]


Epoch 3 Loss: 3.3278
Validation Metrics (Precision: 0.7439, Recall: 0.7439, F1: 0.7439)
Test Metrics (Precision: 0.7614, Recall: 0.7614, F1: 0.7614)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4751.77 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:09<00:00,  4.98s/it, loss=2.41]


Epoch 1 Loss: 5.0485
Validation Metrics (Precision: 0.2826, Recall: 0.2826, F1: 0.2826)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:09<00:00,  4.53s/it, loss=2.02]


Epoch 2 Loss: 4.1747
Validation Metrics (Precision: 0.6709, Recall: 0.6709, F1: 0.6709)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:09<00:00,  4.76s/it, loss=1.76]


Epoch 3 Loss: 3.6401
Validation Metrics (Precision: 0.7325, Recall: 0.7325, F1: 0.7325)
Test Metrics (Precision: 0.7396, Recall: 0.7396, F1: 0.7396)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4800.37 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:07<00:00,  3.64s/it, loss=1.67]


Epoch 1 Loss: 4.0045
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it, loss=1.01]


Epoch 2 Loss: 2.2566
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:04<00:00,  2.29s/it, loss=0.891]


Epoch 3 Loss: 1.9069
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4695.84 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:09<00:00,  4.75s/it, loss=1.77]


Epoch 1 Loss: 4.0283
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:08<00:00,  4.34s/it, loss=1.06]


Epoch 2 Loss: 2.4433
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:09<00:00,  4.91s/it, loss=0.901]


Epoch 3 Loss: 1.9605
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Best Model for bert-base-cased (small): saved_models/bert-base-cased
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}
Best F1-Score: 0.7704

Tuning and evaluating bert-large-cased (medium)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4992.03 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:17<00:00,  4.48s/it, loss=2.19]


Epoch 1 Loss: 9.3426
Validation Metrics (Precision: 0.3758, Recall: 0.3758, F1: 0.3758)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:28<00:00,  7.01s/it, loss=1.78]


Epoch 2 Loss: 7.6675
Validation Metrics (Precision: 0.6171, Recall: 0.6171, F1: 0.6171)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:19<00:00,  4.75s/it, loss=1.7] 


Epoch 3 Loss: 6.8593
Validation Metrics (Precision: 0.6692, Recall: 0.6692, F1: 0.6692)
Test Metrics (Precision: 0.7032, Recall: 0.7032, F1: 0.7032)
New best model found for bert-large-cased with F1: 0.7032

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3417.65 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:26<00:00,  6.69s/it, loss=1.98]


Epoch 1 Loss: 8.7572
Validation Metrics (Precision: 0.5394, Recall: 0.5394, F1: 0.5394)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:22<00:00,  5.60s/it, loss=1.65]


Epoch 2 Loss: 7.1626
Validation Metrics (Precision: 0.7046, Recall: 0.7046, F1: 0.7046)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:19<00:00,  4.89s/it, loss=1.55]


Epoch 3 Loss: 6.4346
Validation Metrics (Precision: 0.7291, Recall: 0.7291, F1: 0.7291)
Test Metrics (Precision: 0.7380, Recall: 0.7380, F1: 0.7380)
New best model found for bert-large-cased with F1: 0.7380

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3739.46 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:25<00:00,  6.44s/it, loss=1.5] 


Epoch 1 Loss: 7.6200
Validation Metrics (Precision: 0.7470, Recall: 0.7470, F1: 0.7470)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:20<00:00,  5.08s/it, loss=0.746]


Epoch 2 Loss: 3.8697
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:18<00:00,  4.61s/it, loss=0.973]


Epoch 3 Loss: 3.5487
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)
New best model found for bert-large-cased with F1: 0.7660

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4727.24 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:30<00:00,  7.71s/it, loss=1.25]


Epoch 1 Loss: 7.1659
Validation Metrics (Precision: 0.7497, Recall: 0.7497, F1: 0.7497)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:25<00:00,  6.45s/it, loss=0.936]


Epoch 2 Loss: 3.9183
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:24<00:00,  6.24s/it, loss=0.836]


Epoch 3 Loss: 3.3870
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for bert-large-cased with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4802.81 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:31<00:00,  7.83s/it, loss=0.728]


Epoch 1 Loss: 5.3958
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:23<00:00,  5.78s/it, loss=0.764]


Epoch 2 Loss: 3.4807
Validation Metrics (Precision: 0.7571, Recall: 0.7571, F1: 0.7571)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:20<00:00,  5.03s/it, loss=0.58] 


Epoch 3 Loss: 2.8058
Validation Metrics (Precision: 0.7699, Recall: 0.7699, F1: 0.7699)
Test Metrics (Precision: 0.7851, Recall: 0.7851, F1: 0.7851)
New best model found for bert-large-cased with F1: 0.7851

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3662.99 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:30<00:00,  7.56s/it, loss=0.894]


Epoch 1 Loss: 6.0389
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:26<00:00,  6.60s/it, loss=0.666]


Epoch 2 Loss: 3.3377
Validation Metrics (Precision: 0.7537, Recall: 0.7537, F1: 0.7537)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:24<00:00,  6.15s/it, loss=0.512]


Epoch 3 Loss: 2.7575
Validation Metrics (Precision: 0.7678, Recall: 0.7678, F1: 0.7678)
Test Metrics (Precision: 0.7844, Recall: 0.7844, F1: 0.7844)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4876.95 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:43<00:00, 21.95s/it, loss=2.27]


Epoch 1 Loss: 4.7386
Validation Metrics (Precision: 0.3052, Recall: 0.3052, F1: 0.3052)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:48<00:00, 24.48s/it, loss=2.04]


Epoch 2 Loss: 4.2061
Validation Metrics (Precision: 0.5114, Recall: 0.5114, F1: 0.5114)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:56<00:00, 28.26s/it, loss=1.94]


Epoch 3 Loss: 3.9177
Validation Metrics (Precision: 0.5777, Recall: 0.5777, F1: 0.5777)
Test Metrics (Precision: 0.6073, Recall: 0.6073, F1: 0.6073)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4659.25 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:48<00:00, 24.06s/it, loss=2.38]


Epoch 1 Loss: 4.8628
Validation Metrics (Precision: 0.2544, Recall: 0.2544, F1: 0.2544)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:53<00:00, 26.80s/it, loss=2.13]


Epoch 2 Loss: 4.3700
Validation Metrics (Precision: 0.4226, Recall: 0.4226, F1: 0.4226)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:44<00:00, 22.21s/it, loss=2.03]


Epoch 3 Loss: 4.0933
Validation Metrics (Precision: 0.4879, Recall: 0.4879, F1: 0.4879)
Test Metrics (Precision: 0.4890, Recall: 0.4890, F1: 0.4890)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4556.96 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:46<00:00, 23.00s/it, loss=1.86]


Epoch 1 Loss: 4.3150
Validation Metrics (Precision: 0.7436, Recall: 0.7436, F1: 0.7436)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:56<00:00, 28.14s/it, loss=1.23]


Epoch 2 Loss: 2.6382
Validation Metrics (Precision: 0.7493, Recall: 0.7493, F1: 0.7493)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:45<00:00, 22.88s/it, loss=0.896]


Epoch 3 Loss: 2.0315
Validation Metrics (Precision: 0.7493, Recall: 0.7493, F1: 0.7493)
Test Metrics (Precision: 0.7637, Recall: 0.7637, F1: 0.7637)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4604.43 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [01:01<00:00, 30.81s/it, loss=1.79]


Epoch 1 Loss: 4.1148
Validation Metrics (Precision: 0.7446, Recall: 0.7446, F1: 0.7446)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [01:04<00:00, 32.40s/it, loss=1.12]


Epoch 2 Loss: 2.5590
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:58<00:00, 29.06s/it, loss=0.961]


Epoch 3 Loss: 2.0663
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4197.70 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:37<00:00, 18.69s/it, loss=1.27]


Epoch 1 Loss: 3.5963
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:42<00:00, 21.02s/it, loss=0.813]


Epoch 2 Loss: 1.8349
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:41<00:00, 20.99s/it, loss=0.795]


Epoch 3 Loss: 1.6436
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4568.70 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:47<00:00, 23.89s/it, loss=1.28]


Epoch 1 Loss: 3.7228
Validation Metrics (Precision: 0.7463, Recall: 0.7463, F1: 0.7463)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:56<00:00, 28.46s/it, loss=0.74]


Epoch 2 Loss: 1.8478
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:49<00:00, 24.81s/it, loss=0.874]


Epoch 3 Loss: 1.6665
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Best Model for bert-large-cased (medium): saved_models/bert-large-cased
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}
Best F1-Score: 0.7851

Tuning and evaluating roberta-large (large)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4947.05 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:25<00:00,  6.37s/it, loss=1.95]


Epoch 1 Loss: 8.5436
Validation Metrics (Precision: 0.5488, Recall: 0.5488, F1: 0.5488)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:24<00:00,  6.11s/it, loss=1.77]


Epoch 2 Loss: 7.1918
Validation Metrics (Precision: 0.7501, Recall: 0.7501, F1: 0.7501)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:30<00:00,  7.69s/it, loss=1.64]


Epoch 3 Loss: 6.4660
Validation Metrics (Precision: 0.7617, Recall: 0.7617, F1: 0.7617)
Test Metrics (Precision: 0.7723, Recall: 0.7723, F1: 0.7723)
New best model found for roberta-large with F1: 0.7723

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4364.43 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:31<00:00,  7.77s/it, loss=1.84]


Epoch 1 Loss: 8.1622
Validation Metrics (Precision: 0.5027, Recall: 0.5027, F1: 0.5027)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:24<00:00,  6.20s/it, loss=1.34]


Epoch 2 Loss: 6.4150
Validation Metrics (Precision: 0.7229, Recall: 0.7229, F1: 0.7229)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:30<00:00,  7.60s/it, loss=1.43]


Epoch 3 Loss: 5.4933
Validation Metrics (Precision: 0.7359, Recall: 0.7359, F1: 0.7359)
Test Metrics (Precision: 0.7595, Recall: 0.7595, F1: 0.7595)

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4891.92 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:26<00:00,  6.50s/it, loss=1.55]


Epoch 1 Loss: 7.5179
Validation Metrics (Precision: 0.7682, Recall: 0.7682, F1: 0.7682)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:31<00:00,  7.83s/it, loss=1.17] 


Epoch 2 Loss: 4.0478
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:27<00:00,  6.85s/it, loss=0.705]


Epoch 3 Loss: 3.1623
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Test Metrics (Precision: 0.7773, Recall: 0.7773, F1: 0.7773)
New best model found for roberta-large with F1: 0.7773

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5032.03 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:21<00:00,  5.31s/it, loss=1.4] 


Epoch 1 Loss: 9.3678
Validation Metrics (Precision: 0.7468, Recall: 0.7468, F1: 0.7468)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:26<00:00,  6.64s/it, loss=0.817]


Epoch 2 Loss: 3.8476
Validation Metrics (Precision: 0.7639, Recall: 0.7639, F1: 0.7639)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:29<00:00,  7.47s/it, loss=0.613]


Epoch 3 Loss: 3.2995
Validation Metrics (Precision: 0.7653, Recall: 0.7653, F1: 0.7653)
Test Metrics (Precision: 0.7759, Recall: 0.7759, F1: 0.7759)

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5041.03 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:25<00:00,  6.42s/it, loss=0.655]


Epoch 1 Loss: 5.8993
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:26<00:00,  6.50s/it, loss=0.606]


Epoch 2 Loss: 2.7021
Validation Metrics (Precision: 0.7798, Recall: 0.7798, F1: 0.7798)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:27<00:00,  6.89s/it, loss=0.549]


Epoch 3 Loss: 2.1131
Validation Metrics (Precision: 0.8114, Recall: 0.8114, F1: 0.8114)
Test Metrics (Precision: 0.8278, Recall: 0.8278, F1: 0.8278)
New best model found for roberta-large with F1: 0.8278

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5012.25 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:22<00:00,  5.58s/it, loss=1.13]


Epoch 1 Loss: 6.5140
Validation Metrics (Precision: 0.7639, Recall: 0.7639, F1: 0.7639)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:27<00:00,  6.79s/it, loss=0.675]


Epoch 2 Loss: 3.1117
Validation Metrics (Precision: 0.7690, Recall: 0.7690, F1: 0.7690)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:25<00:00,  6.35s/it, loss=0.656]


Epoch 3 Loss: 2.5068
Validation Metrics (Precision: 0.7773, Recall: 0.7773, F1: 0.7773)
Test Metrics (Precision: 0.7862, Recall: 0.7862, F1: 0.7862)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4843.97 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:43<00:00, 21.90s/it, loss=2.04]


Epoch 1 Loss: 4.1655
Validation Metrics (Precision: 0.6866, Recall: 0.6866, F1: 0.6866)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:39<00:00, 19.56s/it, loss=1.83]


Epoch 2 Loss: 3.7353
Validation Metrics (Precision: 0.7468, Recall: 0.7468, F1: 0.7468)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:41<00:00, 20.82s/it, loss=1.75]


Epoch 3 Loss: 3.5305
Validation Metrics (Precision: 0.7548, Recall: 0.7548, F1: 0.7548)
Test Metrics (Precision: 0.7684, Recall: 0.7684, F1: 0.7684)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4922.36 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:47<00:00, 23.56s/it, loss=2.41]


Epoch 1 Loss: 4.9834
Validation Metrics (Precision: 0.1110, Recall: 0.1110, F1: 0.1110)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:44<00:00, 22.38s/it, loss=2.14]


Epoch 2 Loss: 4.4172
Validation Metrics (Precision: 0.2731, Recall: 0.2731, F1: 0.2731)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:44<00:00, 22.38s/it, loss=2.05]


Epoch 3 Loss: 4.0810
Validation Metrics (Precision: 0.3758, Recall: 0.3758, F1: 0.3758)
Test Metrics (Precision: 0.3849, Recall: 0.3849, F1: 0.3849)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4918.54 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:43<00:00, 21.83s/it, loss=2.8] 


Epoch 1 Loss: 6.3803
Validation Metrics (Precision: 0.4204, Recall: 0.4204, F1: 0.4204)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:41<00:00, 20.98s/it, loss=1.4] 


Epoch 2 Loss: 3.3268
Validation Metrics (Precision: 0.7156, Recall: 0.7156, F1: 0.7156)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:46<00:00, 23.06s/it, loss=0.891]


Epoch 3 Loss: 2.1164
Validation Metrics (Precision: 0.7403, Recall: 0.7403, F1: 0.7403)
Test Metrics (Precision: 0.7531, Recall: 0.7531, F1: 0.7531)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4846.95 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:42<00:00, 21.48s/it, loss=1.75]


Epoch 1 Loss: 4.0581
Validation Metrics (Precision: 0.7570, Recall: 0.7570, F1: 0.7570)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:42<00:00, 21.36s/it, loss=1.01]


Epoch 2 Loss: 2.2375
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:38<00:00, 19.11s/it, loss=0.851]


Epoch 3 Loss: 1.7999
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Test Metrics (Precision: 0.7769, Recall: 0.7769, F1: 0.7769)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4928.14 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:30<00:00, 15.42s/it, loss=1.53]


Epoch 1 Loss: 4.7361
Validation Metrics (Precision: 0.7552, Recall: 0.7552, F1: 0.7552)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:35<00:00, 17.55s/it, loss=0.912]


Epoch 2 Loss: 2.0044
Validation Metrics (Precision: 0.7657, Recall: 0.7657, F1: 0.7657)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:35<00:00, 17.54s/it, loss=0.648]


Epoch 3 Loss: 1.4886
Validation Metrics (Precision: 0.7686, Recall: 0.7686, F1: 0.7686)
Test Metrics (Precision: 0.7784, Recall: 0.7784, F1: 0.7784)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4817.64 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:41<00:00, 20.91s/it, loss=1.48]


Epoch 1 Loss: 3.8801
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:40<00:00, 20.31s/it, loss=0.945]


Epoch 2 Loss: 1.8420
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:39<00:00, 19.85s/it, loss=0.68] 


Epoch 3 Loss: 1.4932
Validation Metrics (Precision: 0.7671, Recall: 0.7671, F1: 0.7671)
Test Metrics (Precision: 0.7777, Recall: 0.7777, F1: 0.7777)

Best Model for roberta-large (large): saved_models/roberta-large
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}
Best F1-Score: 0.8278
