In [1]:
import re
import random
import os
import json
import evaluate

from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel

from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Combine all .txt and .ann files and combine them per medicine

In [52]:
# Base folders containing annotation and text files
annotations_folder = 'annotations/'
original_texts_folder = 'originaltexts/'
output_folder = 'output_datasets/'
os.makedirs(output_folder, exist_ok=True)

# Group files by medicine
file_groups = {}
for file_name in os.listdir(annotations_folder):
    if file_name.endswith('.ann'):
        base_name = '.'.join(file_name.split('.')[:-1])
        medicine = base_name.rsplit('.', 1)[0]
        file_groups.setdefault(medicine, []).append(file_name)

# Process each group
for medicine, ann_files in file_groups.items():
    combined_output = []

    for ann_file in ann_files:
        txt_file = ann_file.replace('.ann', '.txt')
        txt_path = os.path.join(original_texts_folder, txt_file)
        ann_path = os.path.join(annotations_folder, ann_file)

        # Ensure the corresponding .txt file exists
        if not os.path.exists(txt_path):
            raise FileNotFoundError(f"Text file not found for annotation file {ann_file}")

        # Read the content of the .ann and .txt files
        with open(ann_path, 'r') as ann_f:
            ann_lines = ann_f.readlines()

        with open(txt_path, 'r') as txt_f:
            txt_content = txt_f.read()

        # Parse annotations and filter out AnnotatorNotes
        annotations = []
        for line in ann_lines:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                if len(parts) == 3:
                    tag_info, word = parts[1], parts[2]
                    tag_parts = tag_info.split()
                    if len(tag_parts) >= 3:
                        tag = tag_parts[0]
                        try:
                            start_idx = int(tag_parts[1])
                            end_idx = int(tag_parts[2])
                        except ValueError:
                            if ';' in tag_parts[2]:  # Handle ranges like '742;763'
                                start_idx = int(tag_parts[1])
                                end_idx = int(tag_parts[2].split(';')[-1])
                            else:
                                raise ValueError(f"Unexpected annotation format: {tag_parts}")
                        annotations.append((start_idx, end_idx, tag, word))

        # Sort annotations by start index
        annotations.sort(key=lambda x: x[0])

        # Generate output format
        output = []
        current_idx = 0
        for start_idx, end_idx, tag, word in annotations:
            # Add text between the last annotation and the current annotation as "O"
            if current_idx < start_idx:
                intervening_text = txt_content[current_idx:start_idx]
                for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", intervening_text):
                    output.append(f"{token} O")

            # Add the annotated word with its tag
            for i, token in enumerate(word.split()):
                tag_prefix = 'B-' if i == 0 else 'I-'
                output.append(f"{token} {tag_prefix}{tag}")

            current_idx = end_idx

        # Add remaining text as "O"
        if current_idx < len(txt_content):
            remaining_text = txt_content[current_idx:]
            for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", remaining_text):
                output.append(f"{token} O")

        # Add to combined output with a newline separator
        combined_output.extend(output)
        combined_output.append('')  # Empty line between posts

    # Write combined output to file
    combined_output_text = '\n'.join(combined_output).strip()
    output_file = os.path.join(output_folder, f"{medicine}_combined_output.txt")
    with open(output_file, 'w') as out_f:
        out_f.write(combined_output_text)

    print(f"Combined output saved for {medicine} in {output_file}")

Combined output saved for ARTHROTEC in output_datasets/ARTHROTEC_combined_output.txt
Combined output saved for CAMBIA in output_datasets/CAMBIA_combined_output.txt
Combined output saved for CATAFLAM in output_datasets/CATAFLAM_combined_output.txt
Combined output saved for DICLOFENAC-POTASSIUM in output_datasets/DICLOFENAC-POTASSIUM_combined_output.txt
Combined output saved for DICLOFENAC-SODIUM in output_datasets/DICLOFENAC-SODIUM_combined_output.txt
Combined output saved for FLECTOR in output_datasets/FLECTOR_combined_output.txt
Combined output saved for LIPITOR in output_datasets/LIPITOR_combined_output.txt
Combined output saved for PENNSAID in output_datasets/PENNSAID_combined_output.txt
Combined output saved for SOLARAZE in output_datasets/SOLARAZE_combined_output.txt
Combined output saved for VOLTAREN-XR in output_datasets/VOLTAREN-XR_combined_output.txt
Combined output saved for VOLTAREN in output_datasets/VOLTAREN_combined_output.txt
Combined output saved for ZIPSOR in output_da

## Combine all the medicine files into one dataset

In [53]:
# Folder containing all combined output files
output_datasets_folder = 'output_datasets/'
final_output_file = 'final_dataset.txt'

# Ensure the folder exists
if not os.path.exists(output_datasets_folder):
    raise FileNotFoundError(f"The folder {output_datasets_folder} does not exist.")

# List all files in the folder
output_files = [f for f in os.listdir(output_datasets_folder) if f.endswith('_combined_output.txt')]

# Combine all files into a single final dataset
final_dataset = []
for file_name in output_files:
    file_path = os.path.join(output_datasets_folder, file_name)
    with open(file_path, 'r') as f:
        content = f.read().strip()  # Read and strip any trailing spaces or newlines
        final_dataset.append(content)

    # Add an empty line to separate posts from different files
    final_dataset.append('')

# Write the combined dataset to the final output file
with open(final_output_file, 'w') as f:
    f.write('\n'.join(final_dataset).strip())  # Ensure no extra trailing newline

print(f"Final dataset saved to {final_output_file}")

Final dataset saved to final_dataset.txt


## Read the final dataset into the Iob dataset format

In [2]:
def read_iob_file(file_path):
    """Reads an IOB file from filepath and returns sentences with tokens and tags."""
    sentences = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:  # If line is not empty
                token, tag = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(tag)

            else:
                # End of a sentence
                if sentence_tokens:
                    sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
                    sentence_tokens = []
                    sentence_labels = []
        # Add the last sentence if file doesn't end with a newline
        if sentence_tokens:
            sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
    return sentences

def create_dataset_from_final_file(final_file_path):
    """Create a dataset from a single IOB file and return it as a DatasetDict."""

    if not os.path.exists(final_file_path):
        raise FileNotFoundError(f"The file {final_file_path} does not exist.")

    # Parse the file
    data = read_iob_file(final_file_path)

    # Define the label names and ClassLabel feature
    unique_labels = sorted(set(tag for d in data for tag in d["ner_tags"]))
    label_feature = ClassLabel(names=unique_labels)

    # Define the Features schema for Hugging Face datasets
    features = Features({
        'tokens': Sequence(Value("string")),
        'ner_tags': Sequence(label_feature)
    })

    # Convert data into a Dataset
    dataset = Dataset.from_list(data).cast(features)

    # Create a DatasetDict
    dataset_dict = DatasetDict({"full_data": dataset})

    return dataset_dict


In [3]:
final_dataset_path = "final_dataset.txt"
dataset_dict = create_dataset_from_final_file(final_dataset_path)
dataset = dataset_dict['full_data']

Casting the dataset: 100%|██████████| 1248/1248 [00:00<00:00, 6363.59 examples/s]


## Tokenize and align labels, also add datacollator

In [4]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            # Special token
            new_labels.append(-100)

        else:
            # Same word as previous token
            label = labels[word_id]

            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True,
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

## Dataset generators

In [5]:
def generate_train_datasets(dataset, number_of_samples, number_of_splits):
    """
    Generates train datasets by sampling from the given dataset based on the number of samples and splits.

    Args:
        dataset (Dataset): The base dataset to sample from.
        number_of_samples (int): Number of samples per dataset.
        number_of_splits (int): Number of datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated datasets with their names and indices.
    """
    datasets = []

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the dataset
        indices = list(range(len(dataset)))
        random.shuffle(indices)
        sampled_indices = indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"train_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [6]:
def generate_validation_datasets(dataset, train_indices, number_of_samples, number_of_splits):
    """
    Generates validation datasets by sampling from the given dataset, ensuring no overlap with training data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        number_of_samples (int): Number of samples per validation dataset.
        number_of_splits (int): Number of validation datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated validation datasets with names and indices.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices))  # Exclude training indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"val_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [7]:
def generate_test_datasets(dataset, train_indices, val_indices, number_of_samples, number_of_splits):
    """
    Generates test datasets by sampling from the given dataset, ensuring no overlap with training or validation data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        val_indices (List[int]): Indices of the validation dataset to exclude from sampling.
        number_of_samples (int): Number of samples per test dataset.
        number_of_splits (int): Number of test datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset]]: List of generated test datasets with names.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices) - set(val_indices))  # Exclude train and val indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name
        datasets.append((f"test_dataset_{number_of_samples}_{seed}", sampled_dataset))

    return datasets

### Example usage

In [71]:
# Step 1: Generate Train Dataset
# train_datasets = generate_train_datasets(dataset, number_of_samples=30, number_of_splits=1)
# train_name, train_dataset, train_indices = train_datasets[0]
# print(f"{train_name}: {len(train_dataset)} samples")
#
# # Step 2: Generate Validation Dataset
# val_datasets = generate_validation_datasets(dataset, train_indices, number_of_samples=30, number_of_splits=1)
# val_name, val_dataset, val_indices = val_datasets[0]
# print(f"{val_name}: {len(val_dataset)} samples")
#
# # Step 3: Generate Test Dataset
# test_datasets = generate_test_datasets(dataset, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
# test_name, test_dataset = test_datasets[0]
# print(f"{test_name}: {len(test_dataset)} samples")

train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


# Prepping for training

In [8]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

label_names = dataset.features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [12]:
def create_dataset_given_model(tokenizer_):
    # tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator_ = DataCollatorForTokenClassification(tokenizer=tokenizer_)

    # Tokenize and align labels
    tokenized_dataset_ = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

    # Step 1: Generate Train Dataset
    train_datasets = generate_train_datasets(tokenized_dataset_, number_of_samples=30, number_of_splits=1)
    train_name, train_dataset, train_indices = train_datasets[0]
    print(f"{train_name}: {len(train_dataset)} samples")

    # Step 2: Generate Validation Dataset
    val_datasets = generate_validation_datasets(tokenized_dataset_, train_indices, number_of_samples=30, number_of_splits=1)
    val_name, val_dataset, val_indices = val_datasets[0]
    print(f"{val_name}: {len(val_dataset)} samples")

    # Step 3: Generate Test Dataset
    test_datasets = generate_test_datasets(tokenized_dataset_, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
    test_name, test_dataset = test_datasets[0]
    print(f"{test_name}: {len(test_dataset)} samples")

    return data_collator_, train_dataset, val_dataset, test_dataset


In [10]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Flatten predictions and labels, removing ignored indices
    true_labels = [label for label_seq in labels for label in label_seq if label != -100]
    true_predictions = [pred for pred_seq, label_seq in zip(predictions, labels)
                        for pred, label in zip(pred_seq, label_seq) if label != -100]
    return true_labels, true_predictions


# Training and evaluation

In [13]:
# Define the models and their corresponding sizes
models = {
    "small": "bert-base-cased",
    "medium": "bert-large-cased",
    "large": "roberta-large"
}

# Define hyperparameter grid
param_grid = {
    "learning_rate": [5e-6, 2e-5, 5e-5],
    "batch_size": [8, 16],
    "weight_decay": [0.0, 0.01]
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over models
for size, model_name in models.items():
    print(f"\nTuning and evaluating {model_name} ({size})...")
    best_f1 = 0.0
    best_hyperparameters = None
    model_save_path = f"saved_models/{model_name}"
    os.makedirs(model_save_path, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Hyperparameter tuning loop
    for params in ParameterGrid(param_grid):
        print(f"\nHyperparameters: {params}")

        # Create datasets and data collator for the current model
        data_collator, tokenized_train, tokenized_val, tokenized_test = create_dataset_given_model(tokenizer)

        # Create DataLoaders
        train_dataloader = DataLoader(tokenized_train, batch_size=params["batch_size"], shuffle=True, collate_fn=data_collator)
        val_dataloader = DataLoader(tokenized_val, batch_size=params["batch_size"], collate_fn=data_collator)
        test_dataloader = DataLoader(tokenized_test, batch_size=params["batch_size"], collate_fn=data_collator)

        # Initialize the model for token classification
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, id2label=id2label, label2id=label2id
        )

        # Set up optimizer and learning rate scheduler
        optimizer = torch.optim.AdamW(
            model.parameters(), lr=params["learning_rate"], weight_decay=params["weight_decay"]
        )
        num_train_epochs = 3
        num_update_steps_per_epoch = len(train_dataloader)
        num_training_steps = num_train_epochs * num_update_steps_per_epoch
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )

        # Use the accelerator for distributed training
        accelerator = Accelerator()
        model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader
        )

        # Training loop
        for epoch in range(num_train_epochs):
            print(f"Epoch {epoch + 1}/{num_train_epochs}")
            model.train()
            total_loss = 0
            progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
            for batch in progress_bar:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                total_loss += loss.item()

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                progress_bar.set_postfix(loss=loss.item())

            print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

            # Validation loop
            model.eval()
            val_predictions, val_labels = [], []
            with torch.no_grad():
                for batch in val_dataloader:
                    # Move batch data to the same device as the model
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)
                    logits = outputs.logits
                    predictions = logits.argmax(dim=-1)
                    labels = batch["labels"]

                    # Handle padding across processes for multi-GPU
                    predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                    labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                    predictions_gathered = accelerator.gather(predictions)
                    labels_gathered = accelerator.gather(labels)

                    # Postprocess to get flattened labels and predictions
                    flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                    val_labels.extend(flat_labels)
                    val_predictions.extend(flat_predictions)

            # Calculate validation metrics
            precision, recall, f1, _ = precision_recall_fscore_support(
                val_labels, val_predictions, average="micro"  # 'micro' aggregates across all classes
            )
            print(f"Validation Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Test loop for evaluation after training
        model.eval()
        test_predictions, test_labels = [], []
        with torch.no_grad():
            for batch in test_dataloader:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = logits.argmax(dim=-1)
                labels = batch["labels"]

                # Handle padding across processes for multi-GPU
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

                predictions_gathered = accelerator.gather(predictions)
                labels_gathered = accelerator.gather(labels)

                # Postprocess to get flattened labels and predictions
                flat_labels, flat_predictions = postprocess(predictions_gathered, labels_gathered)
                test_labels.extend(flat_labels)
                test_predictions.extend(flat_predictions)

        # Calculate test metrics
        precision, recall, f1, _ = precision_recall_fscore_support(
            test_labels, test_predictions, average="micro"  # 'micro' aggregates across all classes
        )
        print(f"Test Metrics (Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f})")

        # Save the best model based on test F1-score
        if f1 > best_f1:
            print(f"New best model found for {model_name} with F1: {f1:.4f}")
            best_f1 = f1
            best_hyperparameters = params

            # Overwrite the saved model
            accelerator.unwrap_model(model).save_pretrained(model_save_path)
            tokenizer.save_pretrained(model_save_path)

            # Save the best hyperparameters to a JSON file in the model's folder
            best_config_path = os.path.join(model_save_path, "best_config.json")
            with open(best_config_path, "w") as f:
                json.dump({
                    "best_hyperparameters": best_hyperparameters,
                    "best_f1": best_f1
                }, f, indent=4)

    print(f"\nBest Model for {model_name} ({size}): {model_save_path}")
    print(f"Best Hyperparameters: {best_hyperparameters}")
    print(f"Best F1-Score: {best_f1:.4f}")



Tuning and evaluating bert-base-cased (small)...

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4967.80 examples/s]


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.31it/s, loss=2.4] 


Epoch 1 Loss: 9.9849
Validation Metrics (Precision: 0.0986, Recall: 0.0986, F1: 0.0986)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:01<00:00,  3.78it/s, loss=2.24]


Epoch 2 Loss: 9.2286
Validation Metrics (Precision: 0.1719, Recall: 0.1719, F1: 0.1719)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:01<00:00,  3.48it/s, loss=2.14]


Epoch 3 Loss: 8.7905
Validation Metrics (Precision: 0.2184, Recall: 0.2184, F1: 0.2184)
Test Metrics (Precision: 0.2066, Recall: 0.2066, F1: 0.2066)
New best model found for bert-base-cased with F1: 0.2066

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4997.42 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.71it/s, loss=2]   


Epoch 1 Loss: 8.4399
Validation Metrics (Precision: 0.5787, Recall: 0.5787, F1: 0.5787)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:01<00:00,  3.61it/s, loss=1.93]


Epoch 2 Loss: 7.7773
Validation Metrics (Precision: 0.6851, Recall: 0.6851, F1: 0.6851)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:01<00:00,  3.76it/s, loss=1.79]


Epoch 3 Loss: 7.3840
Validation Metrics (Precision: 0.7093, Recall: 0.7093, F1: 0.7093)
Test Metrics (Precision: 0.7159, Recall: 0.7159, F1: 0.7159)
New best model found for bert-base-cased with F1: 0.7159

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5106.20 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.51it/s, loss=1.57]


Epoch 1 Loss: 7.5377
Validation Metrics (Precision: 0.7453, Recall: 0.7453, F1: 0.7453)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:00<00:00,  4.50it/s, loss=1.21]


Epoch 2 Loss: 5.0832
Validation Metrics (Precision: 0.7480, Recall: 0.7480, F1: 0.7480)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:01<00:00,  3.75it/s, loss=0.847]


Epoch 3 Loss: 4.1153
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Test Metrics (Precision: 0.7660, Recall: 0.7660, F1: 0.7660)
New best model found for bert-base-cased with F1: 0.7660

Hyperparameters: {'batch_size': 8, 'learning_rate': 2e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 5053.34 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.49it/s, loss=1.94]


Epoch 1 Loss: 8.7389
Validation Metrics (Precision: 0.7318, Recall: 0.7318, F1: 0.7318)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:01<00:00,  3.57it/s, loss=1.33]


Epoch 2 Loss: 6.2776
Validation Metrics (Precision: 0.7460, Recall: 0.7460, F1: 0.7460)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:01<00:00,  3.70it/s, loss=1.41]


Epoch 3 Loss: 5.3110
Validation Metrics (Precision: 0.7480, Recall: 0.7480, F1: 0.7480)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)
New best model found for bert-base-cased with F1: 0.7664

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4933.87 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.96it/s, loss=1.05]


Epoch 1 Loss: 7.0843
Validation Metrics (Precision: 0.7490, Recall: 0.7490, F1: 0.7490)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:01<00:00,  3.72it/s, loss=0.875]


Epoch 2 Loss: 3.7857
Validation Metrics (Precision: 0.7500, Recall: 0.7500, F1: 0.7500)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:00<00:00,  4.03it/s, loss=0.645]


Epoch 3 Loss: 3.2225
Validation Metrics (Precision: 0.7534, Recall: 0.7534, F1: 0.7534)
Test Metrics (Precision: 0.7711, Recall: 0.7711, F1: 0.7711)
New best model found for bert-base-cased with F1: 0.7711

Hyperparameters: {'batch_size': 8, 'learning_rate': 5e-05, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4955.98 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 4/4 [00:01<00:00,  3.83it/s, loss=1.02]


Epoch 1 Loss: 6.5991
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 2/3


Training Epoch 2: 100%|██████████| 4/4 [00:01<00:00,  3.85it/s, loss=1.04] 


Epoch 2 Loss: 3.7364
Validation Metrics (Precision: 0.7483, Recall: 0.7483, F1: 0.7483)
Epoch 3/3


Training Epoch 3: 100%|██████████| 4/4 [00:01<00:00,  3.65it/s, loss=1.07] 


Epoch 3 Loss: 3.3168
Validation Metrics (Precision: 0.7487, Recall: 0.7487, F1: 0.7487)
Test Metrics (Precision: 0.7664, Recall: 0.7664, F1: 0.7664)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.0}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 3427.04 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:01<00:00,  1.74it/s, loss=2.04]


Epoch 1 Loss: 4.2314
Validation Metrics (Precision: 0.5959, Recall: 0.5959, F1: 0.5959)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s, loss=1.92]


Epoch 2 Loss: 3.9341
Validation Metrics (Precision: 0.6652, Recall: 0.6652, F1: 0.6652)
Epoch 3/3


Training Epoch 3: 100%|██████████| 2/2 [00:01<00:00,  1.69it/s, loss=1.86]


Epoch 3 Loss: 3.7875
Validation Metrics (Precision: 0.6915, Recall: 0.6915, F1: 0.6915)
Test Metrics (Precision: 0.6965, Recall: 0.6965, F1: 0.6965)

Hyperparameters: {'batch_size': 16, 'learning_rate': 5e-06, 'weight_decay': 0.01}


Map: 100%|██████████| 1248/1248 [00:00<00:00, 4882.95 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train_dataset_30_0: 30 samples
val_dataset_30_0: 30 samples
test_dataset_30_0: 30 samples
Epoch 1/3


Training Epoch 1: 100%|██████████| 2/2 [00:01<00:00,  1.85it/s, loss=2.55]


Epoch 1 Loss: 5.1669
Validation Metrics (Precision: 0.0373, Recall: 0.0373, F1: 0.0373)
Epoch 2/3


Training Epoch 2: 100%|██████████| 2/2 [00:01<00:00,  1.77it/s, loss=2.45]


Epoch 2 Loss: 4.9363


KeyboardInterrupt: 