In [26]:
import os
import re
import random
import os
import json
import evaluate

from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

## Combine all .txt and .ann files and combine them per medicine

In [52]:
# Base folders containing annotation and text files
annotations_folder = 'annotations/'
original_texts_folder = 'originaltexts/'
output_folder = 'output_datasets/'
os.makedirs(output_folder, exist_ok=True)

# Group files by medicine
file_groups = {}
for file_name in os.listdir(annotations_folder):
    if file_name.endswith('.ann'):
        base_name = '.'.join(file_name.split('.')[:-1])
        medicine = base_name.rsplit('.', 1)[0]
        file_groups.setdefault(medicine, []).append(file_name)

# Process each group
for medicine, ann_files in file_groups.items():
    combined_output = []

    for ann_file in ann_files:
        txt_file = ann_file.replace('.ann', '.txt')
        txt_path = os.path.join(original_texts_folder, txt_file)
        ann_path = os.path.join(annotations_folder, ann_file)

        # Ensure the corresponding .txt file exists
        if not os.path.exists(txt_path):
            raise FileNotFoundError(f"Text file not found for annotation file {ann_file}")

        # Read the content of the .ann and .txt files
        with open(ann_path, 'r') as ann_f:
            ann_lines = ann_f.readlines()

        with open(txt_path, 'r') as txt_f:
            txt_content = txt_f.read()

        # Parse annotations and filter out AnnotatorNotes
        annotations = []
        for line in ann_lines:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                if len(parts) == 3:
                    tag_info, word = parts[1], parts[2]
                    tag_parts = tag_info.split()
                    if len(tag_parts) >= 3:
                        tag = tag_parts[0]
                        try:
                            start_idx = int(tag_parts[1])
                            end_idx = int(tag_parts[2])
                        except ValueError:
                            if ';' in tag_parts[2]:  # Handle ranges like '742;763'
                                start_idx = int(tag_parts[1])
                                end_idx = int(tag_parts[2].split(';')[-1])
                            else:
                                raise ValueError(f"Unexpected annotation format: {tag_parts}")
                        annotations.append((start_idx, end_idx, tag, word))

        # Sort annotations by start index
        annotations.sort(key=lambda x: x[0])

        # Generate output format
        output = []
        current_idx = 0
        for start_idx, end_idx, tag, word in annotations:
            # Add text between the last annotation and the current annotation as "O"
            if current_idx < start_idx:
                intervening_text = txt_content[current_idx:start_idx]
                for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", intervening_text):
                    output.append(f"{token} O")

            # Add the annotated word with its tag
            for i, token in enumerate(word.split()):
                tag_prefix = 'B-' if i == 0 else 'I-'
                output.append(f"{token} {tag_prefix}{tag}")

            current_idx = end_idx

        # Add remaining text as "O"
        if current_idx < len(txt_content):
            remaining_text = txt_content[current_idx:]
            for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", remaining_text):
                output.append(f"{token} O")

        # Add to combined output with a newline separator
        combined_output.extend(output)
        combined_output.append('')  # Empty line between posts

    # Write combined output to file
    combined_output_text = '\n'.join(combined_output).strip()
    output_file = os.path.join(output_folder, f"{medicine}_combined_output.txt")
    with open(output_file, 'w') as out_f:
        out_f.write(combined_output_text)

    print(f"Combined output saved for {medicine} in {output_file}")

Combined output saved for ARTHROTEC in output_datasets/ARTHROTEC_combined_output.txt
Combined output saved for CAMBIA in output_datasets/CAMBIA_combined_output.txt
Combined output saved for CATAFLAM in output_datasets/CATAFLAM_combined_output.txt
Combined output saved for DICLOFENAC-POTASSIUM in output_datasets/DICLOFENAC-POTASSIUM_combined_output.txt
Combined output saved for DICLOFENAC-SODIUM in output_datasets/DICLOFENAC-SODIUM_combined_output.txt
Combined output saved for FLECTOR in output_datasets/FLECTOR_combined_output.txt
Combined output saved for LIPITOR in output_datasets/LIPITOR_combined_output.txt
Combined output saved for PENNSAID in output_datasets/PENNSAID_combined_output.txt
Combined output saved for SOLARAZE in output_datasets/SOLARAZE_combined_output.txt
Combined output saved for VOLTAREN-XR in output_datasets/VOLTAREN-XR_combined_output.txt
Combined output saved for VOLTAREN in output_datasets/VOLTAREN_combined_output.txt
Combined output saved for ZIPSOR in output_da

## Combine all the medicine files into one dataset

In [53]:
# Folder containing all combined output files
output_datasets_folder = 'output_datasets/'
final_output_file = 'final_dataset.txt'

# Ensure the folder exists
if not os.path.exists(output_datasets_folder):
    raise FileNotFoundError(f"The folder {output_datasets_folder} does not exist.")

# List all files in the folder
output_files = [f for f in os.listdir(output_datasets_folder) if f.endswith('_combined_output.txt')]

# Combine all files into a single final dataset
final_dataset = []
for file_name in output_files:
    file_path = os.path.join(output_datasets_folder, file_name)
    with open(file_path, 'r') as f:
        content = f.read().strip()  # Read and strip any trailing spaces or newlines
        final_dataset.append(content)

    # Add an empty line to separate posts from different files
    final_dataset.append('')

# Write the combined dataset to the final output file
with open(final_output_file, 'w') as f:
    f.write('\n'.join(final_dataset).strip())  # Ensure no extra trailing newline

print(f"Final dataset saved to {final_output_file}")

Final dataset saved to final_dataset.txt


## Read the final dataset into the Iob dataset format

In [2]:
def read_iob_file(file_path):
    """Reads an IOB file from filepath and returns sentences with tokens and tags."""
    sentences = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:  # If line is not empty
                token, tag = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(tag)

            else:
                # End of a sentence
                if sentence_tokens:
                    sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
                    sentence_tokens = []
                    sentence_labels = []
        # Add the last sentence if file doesn't end with a newline
        if sentence_tokens:
            sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
    return sentences

def create_dataset_from_final_file(final_file_path):
    """Create a dataset from a single IOB file and return it as a DatasetDict."""

    if not os.path.exists(final_file_path):
        raise FileNotFoundError(f"The file {final_file_path} does not exist.")

    # Parse the file
    data = read_iob_file(final_file_path)

    # Define the label names and ClassLabel feature
    unique_labels = sorted(set(tag for d in data for tag in d["ner_tags"]))
    label_feature = ClassLabel(names=unique_labels)

    # Define the Features schema for Hugging Face datasets
    features = Features({
        'tokens': Sequence(Value("string")),
        'ner_tags': Sequence(label_feature)
    })

    # Convert data into a Dataset
    dataset = Dataset.from_list(data).cast(features)

    # Create a DatasetDict
    dataset_dict = DatasetDict({"full_data": dataset})

    return dataset_dict


In [3]:
final_dataset_path = "final_dataset.txt"
dataset_dict = create_dataset_from_final_file(final_dataset_path)
dataset = dataset_dict['full_data']

Casting the dataset: 100%|██████████| 1248/1248 [00:00<00:00, 6546.79 examples/s]


## Tokenize and align labels, also add datacollator

In [21]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            # Special token
            new_labels.append(-100)

        else:
            # Same word as previous token
            label = labels[word_id]

            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True,
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

### Example usage

In [9]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

TypeError: string indices must be integers, not 'str'

## Dataset generators

In [10]:
def generate_train_datasets(dataset, number_of_samples, number_of_splits):
    """
    Generates train datasets by sampling from the given dataset based on the number of samples and splits.

    Args:
        dataset (Dataset): The base dataset to sample from.
        number_of_samples (int): Number of samples per dataset.
        number_of_splits (int): Number of datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated datasets with their names and indices.
    """
    datasets = []

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the dataset
        indices = list(range(len(dataset)))
        random.shuffle(indices)
        sampled_indices = indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"train_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [11]:
def generate_validation_datasets(dataset, train_indices, number_of_samples, number_of_splits):
    """
    Generates validation datasets by sampling from the given dataset, ensuring no overlap with training data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        number_of_samples (int): Number of samples per validation dataset.
        number_of_splits (int): Number of validation datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated validation datasets with names and indices.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices))  # Exclude training indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"val_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [12]:
def generate_test_datasets(dataset, train_indices, val_indices, number_of_samples, number_of_splits):
    """
    Generates test datasets by sampling from the given dataset, ensuring no overlap with training or validation data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        val_indices (List[int]): Indices of the validation dataset to exclude from sampling.
        number_of_samples (int): Number of samples per test dataset.
        number_of_splits (int): Number of test datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset]]: List of generated test datasets with names.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices) - set(val_indices))  # Exclude train and val indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name
        datasets.append((f"test_dataset_{number_of_samples}_{seed}", sampled_dataset))

    return datasets

### Example usage

In [71]:
# Step 1: Generate Train Dataset
# train_datasets = generate_train_datasets(dataset, number_of_samples=30, number_of_splits=1)
# train_name, train_dataset, train_indices = train_datasets[0]
# print(f"{train_name}: {len(train_dataset)} samples")
#
# # Step 2: Generate Validation Dataset
# val_datasets = generate_validation_datasets(dataset, train_indices, number_of_samples=30, number_of_splits=1)
# val_name, val_dataset, val_indices = val_datasets[0]
# print(f"{val_name}: {len(val_dataset)} samples")
#
# # Step 3: Generate Test Dataset
# test_datasets = generate_test_datasets(dataset, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
# test_name, test_dataset = test_datasets[0]
# print(f"{test_name}: {len(test_dataset)} samples")

train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


# Prepping for training

In [23]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

label_names = dataset.features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [16]:
def create_dataset_given_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator_ = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Tokenize and align labels
    tokenized_dataset_ = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

    # Step 1: Generate Train Dataset
    train_datasets = generate_train_datasets(tokenized_dataset_, number_of_samples=30, number_of_splits=1)
    train_name, train_dataset, train_indices = train_datasets[0]
    print(f"{train_name}: {len(train_dataset)} samples")

    # Step 2: Generate Validation Dataset
    val_datasets = generate_validation_datasets(tokenized_dataset_, train_indices, number_of_samples=30, number_of_splits=1)
    val_name, val_dataset, val_indices = val_datasets[0]
    print(f"{val_name}: {len(val_dataset)} samples")

    # Step 3: Generate Test Dataset
    test_datasets = generate_test_datasets(tokenized_dataset_, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
    test_name, test_dataset = test_datasets[0]
    print(f"{test_name}: {len(test_dataset)} samples")

    return data_collator_, train_dataset, val_dataset, test_dataset


In [17]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Flatten predictions and labels, removing ignored indices
    true_labels = [label for label_seq in labels for label in label_seq if label != -100]
    true_predictions = [pred for pred_seq, label_seq in zip(predictions, labels)
                        for pred, label in zip(pred_seq, label_seq) if label != -100]
    return true_labels, true_predictions


# Training and evaluation

In [30]:
from sklearn.model_selection import ParameterGrid
import os
import json

# Define the models and their corresponding sizes
models = {
    "small": "bert-base-cased",
    "medium": "bert-large-cased",
    "large": "roberta-large"
}

# Define hyperparameter grid
param_grid = {
    "learning_rate": [5e-6, 2e-5, 5e-5],
    "batch_size": [8, 16],
    "weight_decay": [0.0, 0.01]
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over models
for size, model_name in models.items():
    print(f"\nTuning and evaluating {model_name} ({size})...")
    best_f1 = 0.0
    best_hyperparameters = None
    model_save_path = f"saved_models/{model_name}"
    os.makedirs(model_save_path, exist_ok=True)

    # Hyperparameter tuning loop
    for params in ParameterGrid(param_grid):
        print(f"\nHyperparameters: {params}")

        # Create datasets and data collator for the current model
        data_collator, tokenized_train, tokenized_val, tokenized_test = create_dataset_given_model(model_name)

        # Create DataLoaders
        train_dataloader = DataLoader(tokenized_train, batch_size=params["batch_size"], shuffle=True, collate_fn=data_collator)
        val_dataloader = DataLoader(tokenized_val, batch_size=params["batch_size"], collate_fn=data_collator)
        test_dataloader = DataLoader(tokenized_test, batch_size=params["batch_size"], collate_fn=data_collator)

        # Initialize the model for token classification
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, id2label=id2label, label2id=label2id
        )

        # Set up optimizer and learning rate scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]
        )
        num_train_epochs = 3
        num_update_steps_per_epoch = len(train_dataloader)
        num_training_steps = num_train_epochs * num_update_steps_per_epoch
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        # Use the accelerator for distributed training
        accelerator = Accelerator()
        model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader
        )

        # Training loop
        for epoch in range(num_train_epochs):
            print(f"Epoch {epoch + 1}/{num_train_epochs}")
            model.train()
            total_loss = 0
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
            for batch in progress_bar:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.item()

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                progress_bar.set_postfix(loss=loss.item())

            print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

            # Validation loop
            model.eval()
            val_predictions, val_labels = [], []
            with torch.no_grad():
                for batch in val_dataloader:
                    # Move batch data to the same device as the model
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)
                    logits = outputs.logits
                    predictions = logits.argmax(dim=-1)
                    labels = batch["labels"]

                    # Handle padding across processes for multi-GPU
                    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                    predictions_gathered = accelerator.gather(predictions)
                    labels_gathered = accelerator.gather(labels)

                    # Postprocess to get flattened labels and predictions
                    flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                    val_labels.extend(flat_labels)
                    val_predictions.extend(flat_predictions)

            # Calculate validation metrics
            precision, recall, f1, _ = precision_recall_fscore_support(
                val_labels, val_predictions, average="micro"  # 'micro' aggregates across all classes
            )
            print(f"Validation Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Test loop for evaluation after training
        model.eval()
        test_predictions, test_labels = [], []
        with torch.no_grad():
            for batch in test_dataloader:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = logits.argmax(dim=-1)
                labels = batch["labels"]

                # Handle padding across processes for multi-GPU
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                predictions_gathered = accelerator.gather(predictions)
                labels_gathered = accelerator.gather(labels)

                # Postprocess to get flattened labels and predictions
                flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                test_labels.extend(flat_labels)
                test_predictions.extend(flat_predictions)

        # Calculate test metrics
        precision, recall, f1, _ = precision_recall_fscore_support(
            test_labels, test_predictions, average="micro"  # 'micro' aggregates across all classes
        )
        print(f"Test Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Save the best model based on test F1-score
        if f1 > best_f1:
            print(f"New best model found for {model_name} with F1: {f1:.4f}")
            best_f1 = f1
            best_hyperparameters = params

            # Overwrite the saved model
            accelerator.unwrap_model(model).save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)

            # Save the best hyperparameters to a JSON file in the model's folder
            best_config_path = os.path.join(model_save_path, "best_config.json")
            with open(best_config_path, "w") as f:
                json.dump({
                    "best_hyperparameters": best_hyperparameters,
                    "best_f1": best_f1
                }, f, indent=4)

    print(f"\nBest Model for {model_name} ({size}): {model_save_path}")
    print(f"Best Hyperparameters: {best_hyperparameters}")
    print(f"Best F1-Score: {best_f1:.4f}") 



Tuning and evaluating bert-base-cased (small)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5360.95 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:11<00:00,  2.78s/it, loss=2.47]


Epoch 1 Loss: 10.3012
Validation Metrics (Precision: 0.0757, Recall: 0.0757, F1: 0.0757)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:10<00:00,  2.69s/it, loss=2.3] 


Epoch 2 Loss: 9.4678
Validation Metrics (Precision: 0.1252, Recall: 0.1252, F1: 0.1252)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:10<00:00,  2.73s/it, loss=2.27]


Epoch 3 Loss: 9.0833
Validation Metrics (Precision: 0.1642, Recall: 0.1642, F1: 0.1642)
Test Metrics (Precision: 0.1584, Recall: 0.1584, F1: 0.1584)
New best model found for bert-base-cased with F1: 0.1584

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4521.32 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:07<00:00,  1.95s/it, loss=2.12]


Epoch 1 Loss: 8.8535
Validation Metrics (Precision: 0.4916, Recall: 0.4916, F1: 0.4916)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it, loss=2.01]


Epoch 2 Loss: 8.1376
Validation Metrics (Precision: 0.6289, Recall: 0.6289, F1: 0.6289)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:07<00:00,  1.90s/it, loss=1.95]


Epoch 3 Loss: 7.7468
Validation Metrics (Precision: 0.6726, Recall: 0.6726, F1: 0.6726)
Test Metrics (Precision: 0.6875, Recall: 0.6875, F1: 0.6875)
New best model found for bert-base-cased with F1: 0.6875

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4956.15 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:10<00:00,  2.58s/it, loss=1.68]


Epoch 1 Loss: 8.1044
Validation Metrics (Precision: 0.7419, Recall: 0.7419, F1: 0.7419)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:10<00:00,  2.58s/it, loss=1.14]


Epoch 2 Loss: 5.4879
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:10<00:00,  2.64s/it, loss=1.23]


Epoch 3 Loss: 4.3846
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for bert-base-cased with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3776.43 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it, loss=1.89]


Epoch 1 Loss: 8.9551
Validation Metrics (Precision: 0.7254, Recall: 0.7254, F1: 0.7254)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it, loss=1.15]


Epoch 2 Loss: 5.7971
Validation Metrics (Precision: 0.7456, Recall: 0.7456, F1: 0.7456)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it, loss=1.08] 


Epoch 3 Loss: 4.4185
Validation Metrics (Precision: 0.7476, Recall: 0.7476, F1: 0.7476)
Test Metrics (Precision: 0.7644, Recall: 0.7644, F1: 0.7644)

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3831.43 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:09<00:00,  2.44s/it, loss=1.36]


Epoch 1 Loss: 8.0176
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:09<00:00,  2.38s/it, loss=1.09] 


Epoch 2 Loss: 3.9114
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:09<00:00,  2.39s/it, loss=0.883]


Epoch 3 Loss: 3.2800
Validation Metrics (Precision: 0.7534, Recall: 0.7534, F1: 0.7534)
Test Metrics (Precision: 0.7704, Recall: 0.7704, F1: 0.7704)
New best model found for bert-base-cased with F1: 0.7704

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4916.34 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it, loss=1.22]


Epoch 1 Loss: 6.2509
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it, loss=0.612]


Epoch 2 Loss: 3.3492
Validation Metrics (Precision: 0.7537, Recall: 0.7537, F1: 0.7537)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:07<00:00,  1.87s/it, loss=0.643]


Epoch 3 Loss: 2.8711
Validation Metrics (Precision: 0.7537, Recall: 0.7537, F1: 0.7537)
Test Metrics (Precision: 0.7724, Recall: 0.7724, F1: 0.7724)
New best model found for bert-base-cased with F1: 0.7724

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5111.38 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:13<00:00,  6.67s/it, loss=2.35]


Epoch 1 Loss: 4.8278
Validation Metrics (Precision: 0.1339, Recall: 0.1339, F1: 0.1339)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:13<00:00,  6.74s/it, loss=2.27]


Epoch 2 Loss: 4.6048
Validation Metrics (Precision: 0.1985, Recall: 0.1985, F1: 0.1985)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:12<00:00,  6.07s/it, loss=2.23]


Epoch 3 Loss: 4.4536
Validation Metrics (Precision: 0.2281, Recall: 0.2281, F1: 0.2281)
Test Metrics (Precision: 0.2346, Recall: 0.2346, F1: 0.2346)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4924.15 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:10<00:00,  5.01s/it, loss=2.28]


Epoch 1 Loss: 4.6608
Validation Metrics (Precision: 0.1659, Recall: 0.1659, F1: 0.1659)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:10<00:00,  5.24s/it, loss=2.19]


Epoch 2 Loss: 4.4301
Validation Metrics (Precision: 0.2692, Recall: 0.2692, F1: 0.2692)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:10<00:00,  5.25s/it, loss=2.13]


Epoch 3 Loss: 4.3035
Validation Metrics (Precision: 0.3250, Recall: 0.3250, F1: 0.3250)
Test Metrics (Precision: 0.3352, Recall: 0.3352, F1: 0.3352)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5204.58 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:13<00:00,  6.82s/it, loss=2.18]


Epoch 1 Loss: 4.6484
Validation Metrics (Precision: 0.5848, Recall: 0.5848, F1: 0.5848)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:13<00:00,  6.73s/it, loss=1.68]


Epoch 2 Loss: 3.5673
Validation Metrics (Precision: 0.7476, Recall: 0.7476, F1: 0.7476)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:14<00:00,  7.07s/it, loss=1.5] 


Epoch 3 Loss: 2.9820
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Test Metrics (Precision: 0.7654, Recall: 0.7654, F1: 0.7654)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4815.68 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:10<00:00,  5.23s/it, loss=2.21]


Epoch 1 Loss: 4.6797
Validation Metrics (Precision: 0.5744, Recall: 0.5744, F1: 0.5744)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:10<00:00,  5.27s/it, loss=1.86]


Epoch 2 Loss: 3.8137
Validation Metrics (Precision: 0.7342, Recall: 0.7342, F1: 0.7342)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:10<00:00,  5.03s/it, loss=1.6] 


Epoch 3 Loss: 3.2827
Validation Metrics (Precision: 0.7413, Recall: 0.7413, F1: 0.7413)
Test Metrics (Precision: 0.7594, Recall: 0.7594, F1: 0.7594)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4969.43 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:13<00:00,  6.65s/it, loss=1.76]


Epoch 1 Loss: 4.1419
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:12<00:00,  6.39s/it, loss=0.982]


Epoch 2 Loss: 2.2646
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:14<00:00,  7.07s/it, loss=1.04] 


Epoch 3 Loss: 1.8821
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4872.38 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:10<00:00,  5.31s/it, loss=1.7] 


Epoch 1 Loss: 4.1059
Validation Metrics (Precision: 0.7480, Recall: 0.7480, F1: 0.7480)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:09<00:00,  4.97s/it, loss=0.996]


Epoch 2 Loss: 2.2363
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:10<00:00,  5.03s/it, loss=0.858]


Epoch 3 Loss: 1.9306
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Best Model for bert-base-cased (small): saved_models/bert-base-cased
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}
Best F1-Score: 0.7724

Tuning and evaluating bert-large-cased (medium)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4682.45 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:42<00:00, 10.52s/it, loss=1.93]


Epoch 1 Loss: 8.1968
Validation Metrics (Precision: 0.6746, Recall: 0.6746, F1: 0.6746)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:48<00:00, 12.09s/it, loss=1.51]


Epoch 2 Loss: 6.5508
Validation Metrics (Precision: 0.7379, Recall: 0.7379, F1: 0.7379)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:46<00:00, 11.56s/it, loss=1.43]


Epoch 3 Loss: 5.8392
Validation Metrics (Precision: 0.7466, Recall: 0.7466, F1: 0.7466)
Test Metrics (Precision: 0.7617, Recall: 0.7617, F1: 0.7617)
New best model found for bert-large-cased with F1: 0.7617

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4803.88 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:35<00:00,  8.95s/it, loss=2.11]


Epoch 1 Loss: 9.2280
Validation Metrics (Precision: 0.3745, Recall: 0.3745, F1: 0.3745)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:37<00:00,  9.42s/it, loss=1.79]


Epoch 2 Loss: 7.5880
Validation Metrics (Precision: 0.6171, Recall: 0.6171, F1: 0.6171)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:36<00:00,  9.16s/it, loss=1.69]


Epoch 3 Loss: 6.7410
Validation Metrics (Precision: 0.6585, Recall: 0.6585, F1: 0.6585)
Test Metrics (Precision: 0.6798, Recall: 0.6798, F1: 0.6798)

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4803.36 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:35<00:00,  8.97s/it, loss=1.23]


Epoch 1 Loss: 7.2831
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:46<00:00, 11.67s/it, loss=0.88]


Epoch 2 Loss: 4.1554
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:44<00:00, 11.13s/it, loss=0.721]


Epoch 3 Loss: 3.7644
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Test Metrics (Precision: 0.7637, Recall: 0.7637, F1: 0.7637)
New best model found for bert-large-cased with F1: 0.7637

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4700.79 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:33<00:00,  8.37s/it, loss=1.2] 


Epoch 1 Loss: 7.0064
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:37<00:00,  9.40s/it, loss=0.748]


Epoch 2 Loss: 3.8730
Validation Metrics (Precision: 0.7493, Recall: 0.7493, F1: 0.7493)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:38<00:00,  9.73s/it, loss=0.816]


Epoch 3 Loss: 3.4634
Validation Metrics (Precision: 0.7497, Recall: 0.7497, F1: 0.7497)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for bert-large-cased with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4739.79 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:37<00:00,  9.26s/it, loss=0.983]


Epoch 1 Loss: 5.7285
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:46<00:00, 11.59s/it, loss=0.806]


Epoch 2 Loss: 3.1179
Validation Metrics (Precision: 0.7513, Recall: 0.7513, F1: 0.7513)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:49<00:00, 12.31s/it, loss=0.708]


Epoch 3 Loss: 2.6751
Validation Metrics (Precision: 0.7584, Recall: 0.7584, F1: 0.7584)
Test Metrics (Precision: 0.7754, Recall: 0.7754, F1: 0.7754)
New best model found for bert-large-cased with F1: 0.7754

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4675.47 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:37<00:00,  9.28s/it, loss=0.837]


Epoch 1 Loss: 5.8950
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:35<00:00,  8.77s/it, loss=0.807]


Epoch 2 Loss: 3.1555
Validation Metrics (Precision: 0.7534, Recall: 0.7534, F1: 0.7534)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:38<00:00,  9.73s/it, loss=0.602]


Epoch 3 Loss: 2.5244
Validation Metrics (Precision: 0.7732, Recall: 0.7732, F1: 0.7732)
Test Metrics (Precision: 0.7951, Recall: 0.7951, F1: 0.7951)
New best model found for bert-large-cased with F1: 0.7951

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4887.98 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [01:01<00:00, 30.98s/it, loss=2.31]


Epoch 1 Loss: 4.7983
Validation Metrics (Precision: 0.2338, Recall: 0.2338, F1: 0.2338)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [01:03<00:00, 31.70s/it, loss=2.09]


Epoch 2 Loss: 4.3216
Validation Metrics (Precision: 0.3631, Recall: 0.3631, F1: 0.3631)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [01:07<00:00, 33.72s/it, loss=2]   


Epoch 3 Loss: 4.0289
Validation Metrics (Precision: 0.4337, Recall: 0.4337, F1: 0.4337)
Test Metrics (Precision: 0.4378, Recall: 0.4378, F1: 0.4378)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4863.79 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:42<00:00, 21.43s/it, loss=2.29]


Epoch 1 Loss: 4.7058
Validation Metrics (Precision: 0.2638, Recall: 0.2638, F1: 0.2638)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:46<00:00, 23.41s/it, loss=2.06]


Epoch 2 Loss: 4.2283
Validation Metrics (Precision: 0.3991, Recall: 0.3991, F1: 0.3991)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:48<00:00, 24.40s/it, loss=1.95]


Epoch 3 Loss: 3.9542
Validation Metrics (Precision: 0.4505, Recall: 0.4505, F1: 0.4505)
Test Metrics (Precision: 0.4726, Recall: 0.4726, F1: 0.4726)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5186.31 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:59<00:00, 29.90s/it, loss=1.9] 


Epoch 1 Loss: 4.4273
Validation Metrics (Precision: 0.7234, Recall: 0.7234, F1: 0.7234)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [01:02<00:00, 31.32s/it, loss=1.31]


Epoch 2 Loss: 2.7470
Validation Metrics (Precision: 0.7470, Recall: 0.7470, F1: 0.7470)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [01:00<00:00, 30.39s/it, loss=1.08]


Epoch 3 Loss: 2.1331
Validation Metrics (Precision: 0.7476, Recall: 0.7476, F1: 0.7476)
Test Metrics (Precision: 0.7647, Recall: 0.7647, F1: 0.7647)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3213.85 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:39<00:00, 19.96s/it, loss=2.03]


Epoch 1 Loss: 4.6286
Validation Metrics (Precision: 0.7342, Recall: 0.7342, F1: 0.7342)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:48<00:00, 24.08s/it, loss=1.49]


Epoch 2 Loss: 2.9810
Validation Metrics (Precision: 0.7453, Recall: 0.7453, F1: 0.7453)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:44<00:00, 22.26s/it, loss=1.03]


Epoch 3 Loss: 2.2452
Validation Metrics (Precision: 0.7460, Recall: 0.7460, F1: 0.7460)
Test Metrics (Precision: 0.7640, Recall: 0.7640, F1: 0.7640)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4850.36 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:59<00:00, 29.61s/it, loss=1.22]


Epoch 1 Loss: 3.5859
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:58<00:00, 29.06s/it, loss=0.833]


Epoch 2 Loss: 1.8553
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [01:02<00:00, 31.21s/it, loss=0.97] 


Epoch 3 Loss: 1.7428
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4776.92 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:41<00:00, 20.63s/it, loss=1.28]


Epoch 1 Loss: 3.5614
Validation Metrics (Precision: 0.7460, Recall: 0.7460, F1: 0.7460)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:37<00:00, 18.62s/it, loss=0.761]


Epoch 2 Loss: 1.7911
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:48<00:00, 24.21s/it, loss=0.76] 


Epoch 3 Loss: 1.5784
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Best Model for bert-large-cased (medium): saved_models/bert-large-cased
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}
Best F1-Score: 0.7951

Tuning and evaluating roberta-large (large)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4852.51 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:43<00:00, 10.81s/it, loss=1.63]


Epoch 1 Loss: 7.7519
Validation Metrics (Precision: 0.7413, Recall: 0.7413, F1: 0.7413)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:40<00:00, 10.13s/it, loss=0.997]


Epoch 2 Loss: 5.0145
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:35<00:00,  8.82s/it, loss=0.899]


Epoch 3 Loss: 4.4752
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for roberta-large with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4863.14 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:34<00:00,  8.65s/it, loss=1.76]


Epoch 1 Loss: 8.1065
Validation Metrics (Precision: 0.7137, Recall: 0.7137, F1: 0.7137)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:31<00:00,  7.88s/it, loss=1.33]


Epoch 2 Loss: 5.3463
Validation Metrics (Precision: 0.7473, Recall: 0.7473, F1: 0.7473)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:30<00:00,  7.60s/it, loss=1.23] 


Epoch 3 Loss: 4.4653
Validation Metrics (Precision: 0.7476, Recall: 0.7476, F1: 0.7476)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4790.23 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:40<00:00, 10.02s/it, loss=1.56]


Epoch 1 Loss: 8.0436
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:41<00:00, 10.32s/it, loss=0.955]


Epoch 2 Loss: 4.2734
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:37<00:00,  9.31s/it, loss=0.995]


Epoch 3 Loss: 4.0045
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5000.32 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:32<00:00,  8.15s/it, loss=1.23]


Epoch 1 Loss: 5.9525
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:30<00:00,  7.71s/it, loss=1.26] 


Epoch 2 Loss: 4.3242
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:30<00:00,  7.65s/it, loss=1.08] 


Epoch 3 Loss: 3.9590
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4905.73 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:39<00:00,  9.93s/it, loss=1.21]


Epoch 1 Loss: 5.5838
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:39<00:00,  9.97s/it, loss=0.911]


Epoch 2 Loss: 3.9640
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:36<00:00,  9.10s/it, loss=0.999]


Epoch 3 Loss: 3.7695
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4746.59 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:33<00:00,  8.38s/it, loss=1.21]


Epoch 1 Loss: 6.8411
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:33<00:00,  8.47s/it, loss=1]    


Epoch 2 Loss: 3.9956
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:34<00:00,  8.51s/it, loss=0.879]


Epoch 3 Loss: 3.7734
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4730.51 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:47<00:00, 23.87s/it, loss=1.53]


Epoch 1 Loss: 3.2364
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:54<00:00, 27.35s/it, loss=1.29]


Epoch 2 Loss: 2.7224
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [01:04<00:00, 32.32s/it, loss=1.14]


Epoch 3 Loss: 2.4379
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3123.63 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:39<00:00, 19.97s/it, loss=1.52]


Epoch 1 Loss: 3.2153
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:48<00:00, 24.35s/it, loss=1.42]


Epoch 2 Loss: 2.7249
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:45<00:00, 22.63s/it, loss=1.15]


Epoch 3 Loss: 2.4922
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5259.90 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:55<00:00, 27.58s/it, loss=2.11]


Epoch 1 Loss: 5.0290
Validation Metrics (Precision: 0.7288, Recall: 0.7288, F1: 0.7288)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [01:02<00:00, 31.43s/it, loss=1.1] 


Epoch 2 Loss: 2.5600
Validation Metrics (Precision: 0.7473, Recall: 0.7473, F1: 0.7473)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:50<00:00, 25.35s/it, loss=1.24] 


Epoch 3 Loss: 2.2364
Validation Metrics (Precision: 0.7480, Recall: 0.7480, F1: 0.7480)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)

Hyperparameters: {'batch_size': 16, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4685.89 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:43<00:00, 21.90s/it, loss=1.46]


Epoch 1 Loss: 3.8894
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:48<00:00, 24.38s/it, loss=1.24]


Epoch 2 Loss: 2.3433
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:38<00:00, 19.29s/it, loss=0.987]


Epoch 3 Loss: 2.1074
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4478.71 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:54<00:00, 27.27s/it, loss=1.77]


Epoch 1 Loss: 4.8790
Validation Metrics (Precision: 0.7480, Recall: 0.7480, F1: 0.7480)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [01:02<00:00, 31.10s/it, loss=1.09]


Epoch 2 Loss: 2.3610
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [01:09<00:00, 35.00s/it, loss=0.94]


Epoch 3 Loss: 1.9819
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4827.13 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [01:05<00:00, 32.92s/it, loss=1.28]


Epoch 1 Loss: 3.7813
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:51<00:00, 25.84s/it, loss=0.995]


Epoch 2 Loss: 2.7757
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:50<00:00, 25.32s/it, loss=0.989]


Epoch 3 Loss: 2.1307
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Best Model for roberta-large (large): saved_models/roberta-large
Best Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}
Best F1-Score: 0.7664
