In [1]:
import re
import random
import os
import pandas as pd
import evaluate
import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification, RobertaTokenizerFast
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel

from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Reduce VRAM usage by reducing fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


## Combine all .txt and .ann files and combine them per medicine

In [3]:
# Base folders containing annotation and text files
annotations_folder = 'annotations/'
original_texts_folder = 'originaltexts/'
output_folder = 'output_datasets/'
os.makedirs(output_folder, exist_ok=True)

# Group files by medicine
file_groups = {}
for file_name in os.listdir(annotations_folder):
    if file_name.endswith('.ann'):
        base_name = '.'.join(file_name.split('.')[:-1])
        medicine = base_name.rsplit('.', 1)[0]
        file_groups.setdefault(medicine, []).append(file_name)

# Process each group
for medicine, ann_files in file_groups.items():
    combined_output = []

    for ann_file in ann_files:
        txt_file = ann_file.replace('.ann', '.txt')
        txt_path = os.path.join(original_texts_folder, txt_file)
        ann_path = os.path.join(annotations_folder, ann_file)

        # Ensure the corresponding .txt file exists
        if not os.path.exists(txt_path):
            raise FileNotFoundError(f"Text file not found for annotation file {ann_file}")

        # Read the content of the .ann and .txt files
        with open(ann_path, 'r') as ann_f:
            ann_lines = ann_f.readlines()

        with open(txt_path, 'r') as txt_f:
            txt_content = txt_f.read()

        # Parse annotations and filter out AnnotatorNotes
        annotations = []
        for line in ann_lines:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                if len(parts) == 3:
                    tag_info, word = parts[1], parts[2]
                    tag_parts = tag_info.split()
                    if len(tag_parts) >= 3:
                        tag = tag_parts[0]
                        try:
                            start_idx = int(tag_parts[1])
                            end_idx = int(tag_parts[2])
                        except ValueError:
                            if ';' in tag_parts[2]:  # Handle ranges like '742;763'
                                start_idx = int(tag_parts[1])
                                end_idx = int(tag_parts[2].split(';')[-1])
                            else:
                                raise ValueError(f"Unexpected annotation format: {tag_parts}")
                        annotations.append((start_idx, end_idx, tag, word))

        # Sort annotations by start index
        annotations.sort(key=lambda x: x[0])

        # Generate output format
        output = []
        current_idx = 0
        for start_idx, end_idx, tag, word in annotations:
            # Add text between the last annotation and the current annotation as "O"
            if current_idx < start_idx:
                intervening_text = txt_content[current_idx:start_idx]
                for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", intervening_text):
                    output.append(f"{token} O")

            # Add the annotated word with its tag
            for i, token in enumerate(word.split()):
                tag_prefix = 'B-' if i == 0 else 'I-'
                output.append(f"{token} {tag_prefix}{tag}")

            current_idx = end_idx

        # Add remaining text as "O"
        if current_idx < len(txt_content):
            remaining_text = txt_content[current_idx:]
            for token in re.findall(r"\w+(?:'\w+)?|[.,!?]", remaining_text):
                output.append(f"{token} O")

        # Add to combined output with a newline separator
        combined_output.extend(output)
        combined_output.append('')  # Empty line between posts

    # Write combined output to file
    combined_output_text = '\n'.join(combined_output).strip()
    output_file = os.path.join(output_folder, f"{medicine}_combined_output.txt")
    with open(output_file, 'w') as out_f:
        out_f.write(combined_output_text)

    print(f"Combined output saved for {medicine} in {output_file}")

Combined output saved for LIPITOR in output_datasets/LIPITOR_combined_output.txt
Combined output saved for VOLTAREN-XR in output_datasets/VOLTAREN-XR_combined_output.txt
Combined output saved for ARTHROTEC in output_datasets/ARTHROTEC_combined_output.txt
Combined output saved for VOLTAREN in output_datasets/VOLTAREN_combined_output.txt
Combined output saved for DICLOFENAC-SODIUM in output_datasets/DICLOFENAC-SODIUM_combined_output.txt
Combined output saved for ZIPSOR in output_datasets/ZIPSOR_combined_output.txt
Combined output saved for FLECTOR in output_datasets/FLECTOR_combined_output.txt
Combined output saved for PENNSAID in output_datasets/PENNSAID_combined_output.txt
Combined output saved for CATAFLAM in output_datasets/CATAFLAM_combined_output.txt
Combined output saved for SOLARAZE in output_datasets/SOLARAZE_combined_output.txt
Combined output saved for DICLOFENAC-POTASSIUM in output_datasets/DICLOFENAC-POTASSIUM_combined_output.txt
Combined output saved for CAMBIA in output_da

## Combine all the medicine files into one dataset

In [4]:
# Folder containing all combined output files
output_datasets_folder = 'output_datasets/'
final_output_file = 'final_dataset.txt'

# Ensure the folder exists
if not os.path.exists(output_datasets_folder):
    raise FileNotFoundError(f"The folder {output_datasets_folder} does not exist.")

# List all files in the folder
output_files = [f for f in os.listdir(output_datasets_folder) if f.endswith('_combined_output.txt')]

# Combine all files into a single final dataset
final_dataset = []
for file_name in output_files:
    file_path = os.path.join(output_datasets_folder, file_name)
    with open(file_path, 'r') as f:
        content = f.read().strip()  # Read and strip any trailing spaces or newlines
        final_dataset.append(content)

    # Add an empty line to separate posts from different files
    final_dataset.append('')

# Write the combined dataset to the final output file
with open(final_output_file, 'w') as f:
    f.write('\n'.join(final_dataset).strip())  # Ensure no extra trailing newline

print(f"Final dataset saved to {final_output_file}")

Final dataset saved to final_dataset.txt


## Read the final dataset into the Iob dataset format

In [5]:
def read_iob_file(file_path):
    """Reads an IOB file from filepath and returns sentences with tokens and tags."""
    sentences = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:  # If line is not empty
                token, tag = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(tag)

            else:
                # End of a sentence
                if sentence_tokens:
                    sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
                    sentence_tokens = []
                    sentence_labels = []
        # Add the last sentence if file doesn't end with a newline
        if sentence_tokens:
            sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
    return sentences

def create_dataset_from_final_file(final_file_path):
    """Create a dataset from a single IOB file and return it as a DatasetDict."""

    if not os.path.exists(final_file_path):
        raise FileNotFoundError(f"The file {final_file_path} does not exist.")

    # Parse the file
    data = read_iob_file(final_file_path)

    # Define the label names and ClassLabel feature
    unique_labels = sorted(set(tag for d in data for tag in d["ner_tags"]))
    label_feature = ClassLabel(names=unique_labels)

    # Define the Features schema for Hugging Face datasets
    features = Features({
        'tokens': Sequence(Value("string")),
        'ner_tags': Sequence(label_feature)
    })

    # Convert data into a Dataset
    dataset = Dataset.from_list(data).cast(features)

    # Create a DatasetDict
    dataset_dict = DatasetDict({"full_data": dataset})

    return dataset_dict


In [6]:
final_dataset_path = "final_dataset.txt"
dataset_dict = create_dataset_from_final_file(final_dataset_path)
dataset = dataset_dict['full_data']

Casting the dataset:   0%|          | 0/1248 [00:00<?, ? examples/s]

## Tokenize and align labels, also add datacollator

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            # Special token
            new_labels.append(-100)

        else:
            # Same word as previous token
            label = labels[word_id]

            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True,
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

## Dataset generators

In [8]:
def generate_train_datasets(dataset, number_of_samples, number_of_splits):
    """
    Generates train datasets by sampling from the given dataset based on the number of samples and splits.

    Args:
        dataset (Dataset): The base dataset to sample from.
        number_of_samples (int): Number of samples per dataset.
        number_of_splits (int): Number of datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated datasets with their names and indices.
    """
    datasets = []

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the dataset
        indices = list(range(len(dataset)))
        random.shuffle(indices)
        sampled_indices = indices[:number_of_samples]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"train_dataset_{number_of_samples}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [9]:
def generate_validation_datasets(dataset, train_indices, number_of_samples, number_of_splits):
    """
    Generates validation datasets by sampling from the given dataset, ensuring no overlap with training data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        number_of_samples (int): Number of samples per validation dataset.
        number_of_splits (int): Number of validation datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset, List[int]]]: List of generated validation datasets with names and indices.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices))  # Exclude training indices

    for seed in range(number_of_splits):
        # Set the random seed for reproducibility
        random.seed(seed)

        # Shuffle and sample from the remaining indices
        random.shuffle(available_indices)
        sampled_indices = available_indices[:int(number_of_samples / 5)]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name and indices
        datasets.append((f"val_dataset_{number_of_samples/5}_{seed}", sampled_dataset, sampled_indices))

    return datasets

In [10]:
def generate_test_datasets(dataset, train_indices, val_indices, number_of_samples, number_of_splits):
    """
    Generates test datasets by sampling from the given dataset, ensuring no overlap with training or validation data.

    Args:
        dataset (Dataset): The base dataset to sample from.
        train_indices (List[int]): Indices of the training dataset to exclude from sampling.
        val_indices (List[int]): Indices of the validation dataset to exclude from sampling.
        number_of_samples (int): Number of samples per test dataset.
        number_of_splits (int): Number of test datasets to generate (different seeds).

    Returns:
        List[Tuple[str, Dataset]]: List of generated test datasets with names.
    """
    datasets = []
    all_indices = set(range(len(dataset)))
    available_indices = list(all_indices - set(train_indices) - set(val_indices))  # Exclude train and val indices

    for seed in range(number_of_splits):
        sampled_indices = available_indices[:]

        sampled_dataset = dataset.select(sampled_indices)

        # Add the dataset with its name
        datasets.append((f"test_dataset_{number_of_samples}_{seed}", sampled_dataset))

    return datasets

### Example usage

In [11]:
# Step 1: Generate Train Dataset
# train_datasets = generate_train_datasets(dataset, number_of_samples=30, number_of_splits=1)
# train_name, train_dataset, train_indices = train_datasets[0]
# print(f"{train_name}: {len(train_dataset)} samples")
#
# # Step 2: Generate Validation Dataset
# val_datasets = generate_validation_datasets(dataset, train_indices, number_of_samples=30, number_of_splits=1)
# val_name, val_dataset, val_indices = val_datasets[0]
# print(f"{val_name}: {len(val_dataset)} samples")
#
# # Step 3: Generate Test Dataset
# test_datasets = generate_test_datasets(dataset, train_indices, val_indices, number_of_samples=30, number_of_splits=1)
# test_name, test_dataset = test_datasets[0]
# print(f"{test_name}: {len(test_dataset)} samples")

# Prepping for training

In [12]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

label_names = dataset.features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

Using the latest cached version of the module from /home/res/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Tue Nov  5 13:12:22 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [13]:
def create_dataset_given_model(tokenizer_):
    data_collator_ = DataCollatorForTokenClassification(tokenizer=tokenizer_)

    # Tokenize and align labels
    tokenized_dataset_ = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset.column_names)

    # Step 1: Generate Train Dataset
    train_datasets = generate_train_datasets(tokenized_dataset_, number_of_samples=100, number_of_splits=1)
    train_name, train_dataset, train_indices = train_datasets[0]
    print(f"{train_name}: {len(train_dataset)} samples")

    # Step 2: Generate Validation Dataset
    val_datasets = generate_validation_datasets(tokenized_dataset_, train_indices, number_of_samples=100, number_of_splits=1)
    val_name, val_dataset, val_indices = val_datasets[0]
    print(f"{val_name}: {len(val_dataset)} samples")

    # Step 3: Generate Test Dataset
    test_datasets = generate_test_datasets(tokenized_dataset_, train_indices, val_indices, number_of_samples=100, number_of_splits=1)
    test_name, test_dataset = test_datasets[0]
    print(f"{test_name}: {len(test_dataset)} samples")

    return data_collator_, train_dataset, val_dataset, test_dataset


In [14]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Flatten predictions and labels, removing ignored indices
    true_labels = [label for label_seq in labels for label in label_seq if label != -100]
    true_predictions = [pred for pred_seq, label_seq in zip(predictions, labels)
                        for pred, label in zip(pred_seq, label_seq) if label != -100]
    return true_labels, true_predictions


# Training and evaluation

In [15]:
def iterate_and_finetune(
    dataset,
    file_name,
    models,
    start_size=5,
    end_size=500,
    step_size=5,
    k_splits=5,
    batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.0,
    num_epochs=3,
):
    """
    Fine-tune models with varying dataset sizes and k-fold splits, saving results to Excel.

    Parameters:
    - dataset (DatasetDict): Dataset for training, validation, and testing.
    - file_name (str): Excel file to save results.
    - models (dict): Dictionary of model names and their sizes.
    - start_size (int): Starting size for training datasets.
    - end_size (int): Maximum size for training datasets.
    - step_size (int): Step size for increasing dataset sizes.
    - k_splits (int): Number of k-fold splits.
    - batch_size (int): Training batch size.
    - learning_rate (float): Learning rate for fine-tuning.
    - weight_decay (float): Weight decay for optimizer.
    - num_epochs (int): Number of training epochs.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Check or create the results file
    if os.path.exists(file_name):
        results_df = pd.read_excel(file_name)
    else:
        results_df = pd.DataFrame(columns=["Train Size", "K-Fold", "Test F1", "Model"])

    for train_size in range(start_size, end_size + 1, step_size):
        for split in range(k_splits):
            for size, model_name in models.items():
                print(f"\nFine-tuning {model_name} ({size}) with Train Size {train_size}, Split {split + 1}...")

                # Initialize tokenizer
                if size == "large":
                    tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large", add_prefix_space=True)
                else:
                    tokenizer = AutoTokenizer.from_pretrained(model_name)

                # Generate datasets
                train_datasets = generate_train_datasets(
                    dataset, number_of_samples=train_size, number_of_splits=k_splits
                )
                train_name, train_dataset, train_indices = train_datasets[split]

                val_datasets = generate_validation_datasets(
                    dataset, train_indices=train_indices, number_of_samples=train_size, number_of_splits=k_splits
                )
                val_name, val_dataset, val_indices = val_datasets[split]

                test_datasets = generate_test_datasets(
                    dataset, train_indices=train_indices, val_indices=val_indices,
                    number_of_samples=train_size, number_of_splits=k_splits
                )
                test_name, test_dataset = test_datasets[split]

                def align_labels_with_tokens(labels, word_ids):
                    new_labels = []
                    current_word = None

                    for word_id in word_ids:
                        if word_id != current_word:
                            current_word = word_id
                            label = -100 if word_id is None else labels[word_id]
                            new_labels.append(label)

                        elif word_id is None:
                            # Special token
                            new_labels.append(-100)

                        else:
                            # Same word as previous token
                            label = labels[word_id]

                            # If the label is B-XXX we change it to I-XXX
                            if label % 2 == 1:
                                label += 1
                            new_labels.append(label)

                    return new_labels

                def tokenize_and_align_labels(examples):
                    tokenized_inputs = tokenizer(
                        examples["tokens"], truncation=True,
                        is_split_into_words=True, padding=True
                    )
                    all_labels = examples["ner_tags"]
                    new_labels = []
                    for i, labels in enumerate(all_labels):
                        word_ids = tokenized_inputs.word_ids(i)
                        new_labels.append(align_labels_with_tokens(labels, word_ids))

                    tokenized_inputs["labels"] = new_labels
                    return tokenized_inputs

                tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
                tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)
                tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)


                # Initialize model
                model = AutoModelForTokenClassification.from_pretrained(
                    model_name, num_labels=len(dataset.features["ner_tags"].feature.names)
                ).to(device)
                model.gradient_checkpointing_enable()

                # Set up training arguments
                training_args = TrainingArguments(
                    output_dir="./results",
                    evaluation_strategy="epoch",
                    learning_rate=learning_rate,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size,
                    weight_decay=weight_decay,
                    logging_dir="./logs",
                    logging_steps=10,
                    save_strategy="no",
                    num_train_epochs=num_epochs,
                )

                # Define metrics
                def compute_metrics(pred):
                    labels = pred.label_ids
                    preds = pred.predictions.argmax(-1)
                    precision, recall, f1, _ = precision_recall_fscore_support(
                        labels.flatten(), preds.flatten(), average="micro"
                    )
                    return {"f1": f1}

                # Initialize Trainer
                trainer = Trainer(
                    model=model,
                    args=training_args,
                    train_dataset=tokenized_train,
                    eval_dataset=tokenized_val,
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
                )

                # Train the model
                trainer.train()

                # Evaluate on the test set
                test_results = trainer.evaluate(tokenized_test)

                # Append results to DataFrame
                new_row = pd.DataFrame(
                    [{
                        "Train Size": train_size,
                        "K-Fold": split + 1,
                        "Test F1": test_results["eval_f1"],
                        "Model": model_name,
                    }]
                )

                # Concatenate the new row to the results DataFrame
                results_df = pd.concat([results_df, new_row], ignore_index=True)

                # Save results to Excel
                results_df.to_excel(file_name, index=False)
                
                del model, trainer, test_results, train_dataset, train_datasets, train_indices, 
                del val_dataset, val_datasets, val_indices, test_dataset, test_datasets
                del tokenized_train, tokenized_test, tokenized_val
                torch.cuda.empty_cache()

    print(f"Results saved to {file_name}")


In [17]:
# Define the models and their corresponding sizes
models = {
    "small": "bert-base-cased",
    "medium": "bert-large-cased",
    "large": "roberta-large"
}

iterate_and_finetune(dataset=dataset, file_name='Experiments_full_labeled.xlsx', models=models, start_size=15, end_size=40, step_size=5)


Fine-tuning bert-base-cased (small) with Train Size 15, Split 1...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3883501291275024, 'eval_f1': 0.4603174603174603, 'eval_runtime': 0.0242, 'eval_samples_per_second': 123.973, 'eval_steps_per_second': 41.324, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.121384620666504, 'eval_f1': 0.46296296296296297, 'eval_runtime': 0.024, 'eval_samples_per_second': 124.885, 'eval_steps_per_second': 41.628, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0778985023498535, 'eval_f1': 0.46296296296296297, 'eval_runtime': 0.0244, 'eval_samples_per_second': 122.824, 'eval_steps_per_second': 40.941, 'epoch': 3.0}
{'train_runtime': 0.8171, 'train_samples_per_second': 55.07, 'train_steps_per_second': 7.343, 'train_loss': 1.3763583501180012, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-large-cased (medium) with Train Size 15, Split 1...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1271417140960693, 'eval_f1': 0.4603174603174603, 'eval_runtime': 0.0703, 'eval_samples_per_second': 42.694, 'eval_steps_per_second': 14.231, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0537893772125244, 'eval_f1': 0.46296296296296297, 'eval_runtime': 0.071, 'eval_samples_per_second': 42.282, 'eval_steps_per_second': 14.094, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.960249125957489, 'eval_f1': 0.46296296296296297, 'eval_runtime': 0.0724, 'eval_samples_per_second': 41.457, 'eval_steps_per_second': 13.819, 'epoch': 3.0}
{'train_runtime': 2.5597, 'train_samples_per_second': 17.58, 'train_steps_per_second': 2.344, 'train_loss': 1.26207701365153, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning roberta-large (large) with Train Size 15, Split 1...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0635716915130615, 'eval_f1': 0.46070460704607047, 'eval_runtime': 0.0735, 'eval_samples_per_second': 40.815, 'eval_steps_per_second': 13.605, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.9735262393951416, 'eval_f1': 0.46070460704607047, 'eval_runtime': 0.072, 'eval_samples_per_second': 41.668, 'eval_steps_per_second': 13.889, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.8487274050712585, 'eval_f1': 0.46070460704607047, 'eval_runtime': 0.0735, 'eval_samples_per_second': 40.794, 'eval_steps_per_second': 13.598, 'epoch': 3.0}
{'train_runtime': 2.0956, 'train_samples_per_second': 21.474, 'train_steps_per_second': 2.863, 'train_loss': 1.1605053742726643, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-base-cased (small) with Train Size 15, Split 2...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5788029432296753, 'eval_f1': 0.3924731182795699, 'eval_runtime': 0.0211, 'eval_samples_per_second': 142.01, 'eval_steps_per_second': 47.337, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2323918342590332, 'eval_f1': 0.4032258064516129, 'eval_runtime': 0.0204, 'eval_samples_per_second': 146.755, 'eval_steps_per_second': 48.918, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1635944843292236, 'eval_f1': 0.4032258064516129, 'eval_runtime': 0.0222, 'eval_samples_per_second': 134.927, 'eval_steps_per_second': 44.976, 'epoch': 3.0}
{'train_runtime': 0.7301, 'train_samples_per_second': 61.634, 'train_steps_per_second': 8.218, 'train_loss': 1.410048484802246, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-large-cased (medium) with Train Size 15, Split 2...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2969995737075806, 'eval_f1': 0.3978494623655914, 'eval_runtime': 0.0637, 'eval_samples_per_second': 47.076, 'eval_steps_per_second': 15.692, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.118672490119934, 'eval_f1': 0.4032258064516129, 'eval_runtime': 0.0631, 'eval_samples_per_second': 47.551, 'eval_steps_per_second': 15.85, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.9989364147186279, 'eval_f1': 0.4032258064516129, 'eval_runtime': 0.0639, 'eval_samples_per_second': 46.966, 'eval_steps_per_second': 15.655, 'epoch': 3.0}
{'train_runtime': 2.2335, 'train_samples_per_second': 20.147, 'train_steps_per_second': 2.686, 'train_loss': 1.3120729128519695, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning roberta-large (large) with Train Size 15, Split 2...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.3600267171859741, 'eval_f1': 0.40804597701149425, 'eval_runtime': 0.0679, 'eval_samples_per_second': 44.181, 'eval_steps_per_second': 14.727, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.9482375979423523, 'eval_f1': 0.40804597701149425, 'eval_runtime': 0.0672, 'eval_samples_per_second': 44.634, 'eval_steps_per_second': 14.878, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.8574625849723816, 'eval_f1': 0.40804597701149425, 'eval_runtime': 0.0665, 'eval_samples_per_second': 45.137, 'eval_steps_per_second': 15.046, 'epoch': 3.0}
{'train_runtime': 2.1845, 'train_samples_per_second': 20.6, 'train_steps_per_second': 2.747, 'train_loss': 1.099441925684611, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-base-cased (small) with Train Size 15, Split 3...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4951311349868774, 'eval_f1': 0.41496598639455784, 'eval_runtime': 0.0251, 'eval_samples_per_second': 119.727, 'eval_steps_per_second': 39.909, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2125626802444458, 'eval_f1': 0.4217687074829932, 'eval_runtime': 0.0231, 'eval_samples_per_second': 129.883, 'eval_steps_per_second': 43.294, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1647825241088867, 'eval_f1': 0.4217687074829932, 'eval_runtime': 0.0228, 'eval_samples_per_second': 131.696, 'eval_steps_per_second': 43.899, 'epoch': 3.0}
{'train_runtime': 0.8678, 'train_samples_per_second': 51.858, 'train_steps_per_second': 6.914, 'train_loss': 1.3748857180277507, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-large-cased (medium) with Train Size 15, Split 3...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.22175931930542, 'eval_f1': 0.42517006802721086, 'eval_runtime': 0.0687, 'eval_samples_per_second': 43.666, 'eval_steps_per_second': 14.555, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1355350017547607, 'eval_f1': 0.42517006802721086, 'eval_runtime': 0.0686, 'eval_samples_per_second': 43.73, 'eval_steps_per_second': 14.577, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0601528882980347, 'eval_f1': 0.42517006802721086, 'eval_runtime': 0.0709, 'eval_samples_per_second': 42.294, 'eval_steps_per_second': 14.098, 'epoch': 3.0}
{'train_runtime': 2.563, 'train_samples_per_second': 17.557, 'train_steps_per_second': 2.341, 'train_loss': 1.269650141398112, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning roberta-large (large) with Train Size 15, Split 3...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.1418628692626953, 'eval_f1': 0.4157706093189964, 'eval_runtime': 0.0713, 'eval_samples_per_second': 42.052, 'eval_steps_per_second': 14.017, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.8537271022796631, 'eval_f1': 0.4157706093189964, 'eval_runtime': 0.0709, 'eval_samples_per_second': 42.316, 'eval_steps_per_second': 14.105, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7610248923301697, 'eval_f1': 0.4157706093189964, 'eval_runtime': 0.0727, 'eval_samples_per_second': 41.247, 'eval_steps_per_second': 13.749, 'epoch': 3.0}
{'train_runtime': 2.4096, 'train_samples_per_second': 18.676, 'train_steps_per_second': 2.49, 'train_loss': 1.111615498860677, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-base-cased (small) with Train Size 15, Split 4...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.088685393333435, 'eval_f1': 0.5982142857142857, 'eval_runtime': 0.0289, 'eval_samples_per_second': 103.861, 'eval_steps_per_second': 34.62, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6651760935783386, 'eval_f1': 0.5982142857142857, 'eval_runtime': 0.0284, 'eval_samples_per_second': 105.477, 'eval_steps_per_second': 35.159, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6086443066596985, 'eval_f1': 0.5982142857142857, 'eval_runtime': 0.0281, 'eval_samples_per_second': 106.74, 'eval_steps_per_second': 35.58, 'epoch': 3.0}
{'train_runtime': 0.9556, 'train_samples_per_second': 47.09, 'train_steps_per_second': 6.279, 'train_loss': 1.4080123901367188, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-large-cased (medium) with Train Size 15, Split 4...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.7158446907997131, 'eval_f1': 0.5952380952380952, 'eval_runtime': 0.0901, 'eval_samples_per_second': 33.303, 'eval_steps_per_second': 11.101, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5135597586631775, 'eval_f1': 0.5952380952380952, 'eval_runtime': 0.0886, 'eval_samples_per_second': 33.87, 'eval_steps_per_second': 11.29, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4958893358707428, 'eval_f1': 0.5982142857142857, 'eval_runtime': 0.0892, 'eval_samples_per_second': 33.635, 'eval_steps_per_second': 11.212, 'epoch': 3.0}
{'train_runtime': 2.8722, 'train_samples_per_second': 15.667, 'train_steps_per_second': 2.089, 'train_loss': 1.2432244618733723, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning roberta-large (large) with Train Size 15, Split 4...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5539509654045105, 'eval_f1': 0.5891472868217055, 'eval_runtime': 0.0926, 'eval_samples_per_second': 32.392, 'eval_steps_per_second': 10.797, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4912218749523163, 'eval_f1': 0.5891472868217055, 'eval_runtime': 0.0929, 'eval_samples_per_second': 32.306, 'eval_steps_per_second': 10.769, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4482291042804718, 'eval_f1': 0.5891472868217055, 'eval_runtime': 0.0917, 'eval_samples_per_second': 32.714, 'eval_steps_per_second': 10.905, 'epoch': 3.0}
{'train_runtime': 2.707, 'train_samples_per_second': 16.623, 'train_steps_per_second': 2.216, 'train_loss': 1.0809468428293865, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-base-cased (small) with Train Size 15, Split 5...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.2189196348190308, 'eval_f1': 0.480257116620753, 'eval_runtime': 0.0373, 'eval_samples_per_second': 80.495, 'eval_steps_per_second': 26.832, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.0136332511901855, 'eval_f1': 0.48117539026629935, 'eval_runtime': 0.038, 'eval_samples_per_second': 78.977, 'eval_steps_per_second': 26.326, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.995426595211029, 'eval_f1': 0.48117539026629935, 'eval_runtime': 0.0367, 'eval_samples_per_second': 81.661, 'eval_steps_per_second': 27.22, 'epoch': 3.0}
{'train_runtime': 1.5428, 'train_samples_per_second': 29.168, 'train_steps_per_second': 3.889, 'train_loss': 1.3602409362792969, 'epoch': 3.0}


  0%|          | 0/154 [00:00<?, ?it/s]


Fine-tuning bert-large-cased (medium) with Train Size 15, Split 5...


Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/1230 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/6 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 9.74 GiB of which 62.19 MiB is free. Including non-PyTorch memory, this process has 8.89 GiB memory in use. Of the allocated memory 8.60 GiB is allocated by PyTorch, and 25.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)