In [10]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn

In [11]:
MAX_LEN = 512  # BERT's max sequence length
DATA_DIR = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/Batch 2"

In [12]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return Dataset.from_list(data)


In [13]:
def split_long_sentence(tokens, tags, max_len=MAX_LEN - 2):  # -2 for [CLS] and [SEP]
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_tags = tags[i:i+max_len]
        chunks.append({'tokens': chunk_tokens, 'tags': chunk_tags})
    return chunks

In [14]:
def preprocess_dataset(dataset):
    split_data = []
    for example in dataset:
        split_sentences = split_long_sentence(example['tokens'], example['tags'])
        split_data.extend(split_sentences)
    return Dataset.from_list(split_data)

In [15]:
def tokenize_and_align_labels(example, tokenizer, label2id):
    tokenized = tokenizer(example['tokens'], truncation=True, max_length=512, is_split_into_words=True)
    word_ids = tokenized.word_ids()

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))
        else:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))

        previous_word_idx = word_idx

    tokenized['labels'] = aligned_labels
    return tokenized

In [16]:
def compute_metrics(p, id2label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for i in range(len(labels)):
        label_sequence = []
        prediction_sequence = []

        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Exclude padding tokens
                label_sequence.append(id2label.get(labels[i][j], "O"))
                prediction_sequence.append(id2label.get(predictions[i][j], "O"))

        true_labels.append(label_sequence)
        true_predictions.append(prediction_sequence)

    report = classification_report(true_labels, true_predictions, output_dict=True)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "accuracy": accuracy
    }

In [17]:
def detect_batches():
    batch_numbers = set()
    for file in os.listdir(DATA_DIR):
        if file.startswith("combined_train_") and file.endswith(".jsonl"):
            batch_num = file.split("_")[-1].replace(".jsonl", "")
            if batch_num.isdigit():
                batch_numbers.add(int(batch_num))
    return sorted(batch_numbers)

In [18]:
def train_and_evaluate(batch_num):
    dataset_prefix = f"combined_{batch_num}"

    train_file = os.path.join(DATA_DIR, f"combined_train_{batch_num}.jsonl")
    dev_file = os.path.join(DATA_DIR, f"combined_dev_{batch_num}.jsonl")
    test_file = os.path.join(DATA_DIR, f"combined_test_{batch_num}.jsonl")

    if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file):
        print(f"🚨 Skipping batch {batch_num}, files not found.")
        return

    print(f"🚀 Processing batch {batch_num}...")

    
    model_path = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num-1}"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Load label2id from Batch 1 to ensure consistent label space
    with open(os.path.join(model_path, "label2id.json"), "r") as f:
        old_label2id = json.load(f)
    id2label = {v: k for k, v in old_label2id.items()}
    print(f"✅ Loaded label2id from Batch {batch_num - 1}: {old_label2id}")

    train_dataset = preprocess_dataset(load_jsonl(train_file))
    dev_dataset = preprocess_dataset(load_jsonl(dev_file))
    test_dataset = preprocess_dataset(load_jsonl(test_file))
    print(f"After splitting: Train={len(train_dataset)}, Dev={len(dev_dataset)}, Test={len(test_dataset)}")

    unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
    label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
    id2label = {v: k for k, v in label2id.items()}

    train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    dev_dataset = dev_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))

    # Load the number of labels from batch 1
    old_num_labels = len(old_label2id)  # Should match previous training setup
    new_num_labels = len(label2id)  # Ensure this matches the new dataset

    # Load the previously trained model from batch 1
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=old_num_labels
    )

    # Extract the old classifier layer
    old_classifier = model.classifier

    # Create a new classifier layer with updated label count
    new_classifier = nn.Linear(old_classifier.in_features, new_num_labels)

    # Transfer weights from the old classifier to the new one (for common labels)
    with torch.no_grad():
        num_common_labels = min(old_num_labels, new_num_labels)
        new_classifier.weight[:num_common_labels, :] = old_classifier.weight[:num_common_labels, :]
        new_classifier.bias[:num_common_labels] = old_classifier.bias[:num_common_labels]

    # Assign the updated classifier to the model
    model.classifier = new_classifier

    # Save updated model before continuing training
    model.save_pretrained("/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated")
    print(f"✅ Model updated to support new label set while keeping batch {batch_num - 1} training.")

    model_checkpoint = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num}"
    # Create the directory if it does not exist
    os.makedirs(model_checkpoint, exist_ok=True)
    print(f"Directory ensured: {model_checkpoint}")
    model_path_2 = "/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated"
    model = AutoModelForTokenClassification.from_pretrained(model_path_2, num_labels=len(label2id))

    training_args = TrainingArguments(
        output_dir="BioBERT-finetuned-mtl",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        learning_rate=3e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=False,
        save_total_limit=2,
        gradient_accumulation_steps=8,  # This simulates batch size 4 * 4 = 16
        fp16=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, id2label)
    )

    trainer.train()

    metrics = trainer.evaluate(test_dataset)
    print("Test Set Performance:")
    print(metrics)
    
    model.save_pretrained(model_checkpoint)
    tokenizer.save_pretrained(model_checkpoint)

    print(f"✅ Model trained and saved for batch {batch_num}")

    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=2)

        # Save label2id to file
    with open(f"{model_checkpoint}/label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")

    # Save label2id to file
    with open("label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")
    print("✅ Saved label2id mapping to model directory.")

    # results = classification_report(labels, predictions, output_dict=True)
    # with open(f"{model_checkpoint}_evaluation.json", "w") as f:
    #     json.dump(results, f)

    # print(f"📊 Evaluation results saved for batch {batch_num}")

In [19]:
batch_numbers = detect_batches()
print(f"📝 Detected batches: {batch_numbers}")

for batch_num in batch_numbers:
    train_and_evaluate(batch_num)

print("🎉 All detected batches processed successfully!")

📝 Detected batches: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
🚀 Processing batch 21...
✅ Loaded label2id from Batch 20: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 20 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch21


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.404564,0.70455,0.718675,0.69097,0.925895


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.3382920026779175, 'eval_f1': 0.6870533642691415, 'eval_precision': 0.7527964205816555, 'eval_recall': 0.6318709457152611, 'eval_accuracy': 0.9234769139991149, 'eval_runtime': 13.9482, 'eval_samples_per_second': 4419.432, 'eval_steps_per_second': 69.113, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 21
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 22...
✅ Loaded label2id from Batch 21: {'B-Chemical': 0, 'B-Organism': 1, 'B-Protein': 2, 'B-Regulon-operon': 3, 'I-Chemical': 4, 'I-Organism': 5, 'I-Protein': 6, 'I-Regulon-operon': 7, 'O': 8}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 21 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch22


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.431468,0.709128,0.707398,0.710867,0.918845


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.3785388767719269, 'eval_f1': 0.7443956733030255, 'eval_precision': 0.8120564343736639, 'eval_recall': 0.687142753780479, 'eval_accuracy': 0.9175345194329345, 'eval_runtime': 10.501, 'eval_samples_per_second': 5870.199, 'eval_steps_per_second': 91.801, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 22
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 23...
✅ Loaded label2id from Batch 22: {'B-Chemical': 0, 'B-Organism': 1, 'B-Protein': 2, 'B-Regulon-operon': 3, 'E-Chemical': 4, 'E-Organism': 5, 'E-Protein': 6, 'E-Regulon-operon': 7, 'I-Chemical': 8, 'I-Organism': 9, 'I-Protein': 10, 'O': 11, 'S-Chemical': 12, 'S-Organism': 13, 'S-Protein': 14, 'S-Regulon-operon': 15}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 22 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch23


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.2235,0.482499,0.773748,0.350548,0.964447


Test Set Performance:
{'eval_loss': 0.17072421312332153, 'eval_f1': 0.559647038859664, 'eval_precision': 0.7770970782280867, 'eval_recall': 0.43728453990983823, 'eval_accuracy': 0.9643998623199095, 'eval_runtime': 10.1284, 'eval_samples_per_second': 6086.176, 'eval_steps_per_second': 95.178, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 23
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 24...
✅ Loaded label2id from Batch 23: {'B-Organism': 0, 'I-Organism': 1, 'O': 2}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 23 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch24


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.263307,0.600872,0.799578,0.48127,0.96158


Test Set Performance:
{'eval_loss': 0.2173120230436325, 'eval_f1': 0.7407260464193612, 'eval_precision': 0.8391011235955056, 'eval_recall': 0.6629971590909091, 'eval_accuracy': 0.9596671092098146, 'eval_runtime': 10.1688, 'eval_samples_per_second': 6061.945, 'eval_steps_per_second': 94.799, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 24
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 25...
✅ Loaded label2id from Batch 24: {'B-Organism': 0, 'E-Organism': 1, 'I-Organism': 2, 'O': 3, 'S-Organism': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 24 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch25


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.017044,0.693587,0.719212,0.669725,0.995947


Test Set Performance:
{'eval_loss': 0.016075801104307175, 'eval_f1': 0.7047387606318348, 'eval_precision': 0.7512953367875648, 'eval_recall': 0.6636155606407322, 'eval_accuracy': 0.9964218892970788, 'eval_runtime': 8.7404, 'eval_samples_per_second': 6037.621, 'eval_steps_per_second': 94.39, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 25
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 26...
✅ Loaded label2id from Batch 25: {'B-Cellular_component': 0, 'I-Cellular_component': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 25 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch26


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.020405,0.640693,0.627119,0.654867,0.995167


Test Set Performance:
{'eval_loss': 0.017448056489229202, 'eval_f1': 0.7106842737094837, 'eval_precision': 0.7688311688311689, 'eval_recall': 0.6607142857142857, 'eval_accuracy': 0.9964218892970788, 'eval_runtime': 8.7582, 'eval_samples_per_second': 6025.314, 'eval_steps_per_second': 94.197, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 26
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 27...
✅ Loaded label2id from Batch 26: {'B-Cellular_component': 0, 'E-Cellular_component': 1, 'I-Cellular_component': 2, 'O': 3, 'S-Cellular_component': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 26 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch27


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.125807,0.581958,0.584024,0.579906,0.957856


Test Set Performance:
{'eval_loss': 0.1265675574541092, 'eval_f1': 0.5994991898659597, 'eval_precision': 0.5983534254630991, 'eval_recall': 0.6006493506493507, 'eval_accuracy': 0.9567075918768693, 'eval_runtime': 8.7376, 'eval_samples_per_second': 6039.537, 'eval_steps_per_second': 94.42, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 27
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 28...
✅ Loaded label2id from Batch 27: {'B-Cell': 0, 'I-Cell': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 27 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch28


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.151064,0.597486,0.613077,0.582668,0.95466


Test Set Performance:
{'eval_loss': 0.15368005633354187, 'eval_f1': 0.5949862338791481, 'eval_precision': 0.6002923976608188, 'eval_recall': 0.5897730537201954, 'eval_accuracy': 0.9522008112205181, 'eval_runtime': 8.791, 'eval_samples_per_second': 6002.858, 'eval_steps_per_second': 93.846, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 28
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 29...
✅ Loaded label2id from Batch 28: {'B-Cell': 0, 'E-Cell': 1, 'I-Cell': 2, 'O': 3, 'S-Cell': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 28 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch29


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.119994,0.61925,0.722543,0.541796,0.970276


Test Set Performance:
{'eval_loss': 0.10238754004240036, 'eval_f1': 0.5798718743542055, 'eval_precision': 0.6418115279048491, 'eval_recall': 0.5288352808141726, 'eval_accuracy': 0.9709107793999153, 'eval_runtime': 8.6862, 'eval_samples_per_second': 6075.249, 'eval_steps_per_second': 94.978, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 29
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 30...
✅ Loaded label2id from Batch 29: {'B-Simple_chemical': 0, 'I-Simple_chemical': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 29 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch30


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.149271,0.653452,0.779968,0.562251,0.968353


Test Set Performance:
{'eval_loss': 0.1390119194984436, 'eval_f1': 0.648207171314741, 'eval_precision': 0.7656470588235295, 'eval_recall': 0.5620034542314335, 'eval_accuracy': 0.9706376411783183, 'eval_runtime': 8.7576, 'eval_samples_per_second': 6025.754, 'eval_steps_per_second': 94.204, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 30
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 31...
✅ Loaded label2id from Batch 30: {'B-Simple_chemical': 0, 'E-Simple_chemical': 1, 'I-Simple_chemical': 2, 'O': 3, 'S-Simple_chemical': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 30 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch31


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.229169,0.710831,0.780961,0.652258,0.921324


Test Set Performance:
{'eval_loss': 0.1965653896331787, 'eval_f1': 0.7310701817744071, 'eval_precision': 0.7832086664947124, 'eval_recall': 0.6854401805869075, 'eval_accuracy': 0.934597052838589, 'eval_runtime': 8.79, 'eval_samples_per_second': 6003.545, 'eval_steps_per_second': 93.857, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 31
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 32...
✅ Loaded label2id from Batch 31: {'B-Gene_or_gene_product': 0, 'I-Gene_or_gene_product': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 31 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch32


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.345635,0.742909,0.833371,0.670163,0.902253


Test Set Performance:
{'eval_loss': 0.2820952236652374, 'eval_f1': 0.7694593303974107, 'eval_precision': 0.7792584631918323, 'eval_recall': 0.7599035841542654, 'eval_accuracy': 0.91605096759215, 'eval_runtime': 8.7881, 'eval_samples_per_second': 6004.836, 'eval_steps_per_second': 93.877, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 32
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 33...
✅ Loaded label2id from Batch 32: {'B-Gene_or_gene_product': 0, 'E-Gene_or_gene_product': 1, 'I-Gene_or_gene_product': 2, 'O': 3, 'S-Gene_or_gene_product': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 32 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch33


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.674437,0.6324,0.641254,0.623786,0.823239


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.6371275782585144, 'eval_f1': 0.6318556408070475, 'eval_precision': 0.6372581800812037, 'eval_recall': 0.626543934626403, 'eval_accuracy': 0.8310093822979119, 'eval_runtime': 9.3076, 'eval_samples_per_second': 5669.664, 'eval_steps_per_second': 88.637, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 33
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 34...
✅ Loaded label2id from Batch 33: {'B-Amino_acid': 0, 'B-Anatomical_system': 1, 'B-Cancer': 2, 'B-Cell': 3, 'B-Cellular_component': 4, 'B-Developing_anatomical_structure': 5, 'B-Gene_or_gene_product': 6, 'B-Immaterial_anatomical_entity': 7, 'B-Multi-tissue_structure': 8, 'B-Organ': 9, 'B-Organism': 10, 'B-Organism_subdivision': 11, 'B-Organism_substance': 12, 'B-Pathological_formation': 13, 'B-Simple_chemical': 14, 'B-Tissue': 15, 'I-Amino_acid': 16, 'I-Anatomical_system': 17, 'I-Cancer': 18, 'I-Cell': 19, 'I

Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 33 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch34


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.849866,0.671527,0.715021,0.633022,0.796575


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.7754917740821838, 'eval_f1': 0.6850002280293702, 'eval_precision': 0.7212484993997599, 'eval_recall': 0.6522211125103131, 'eval_accuracy': 0.8136796962495562, 'eval_runtime': 9.4314, 'eval_samples_per_second': 5595.26, 'eval_steps_per_second': 87.474, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 34
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 35...
✅ Loaded label2id from Batch 34: {'B-Amino_acid': 0, 'B-Anatomical_system': 1, 'B-Cancer': 2, 'B-Cell': 3, 'B-Cellular_component': 4, 'B-Developing_anatomical_structure': 5, 'B-Gene_or_gene_product': 6, 'B-Immaterial_anatomical_entity': 7, 'B-Multi-tissue_structure': 8, 'B-Organ': 9, 'B-Organism': 10, 'B-Organism_subdivision': 11, 'B-Organism_substance': 12, 'B-Pathological_formation': 13, 'B-Simple_chemical': 14, 'B-Tissue': 15, 'E-Amino_acid': 16, 'E-Anatomical_system': 17, 'E-Cancer': 18, 'E-Cell': 19, 'E

Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 34 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch35


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.029292,0.779949,0.780612,0.779287,0.992101


Test Set Performance:
{'eval_loss': 0.031636957079172134, 'eval_f1': 0.7373219373219374, 'eval_precision': 0.7261503928170595, 'eval_recall': 0.7488425925925926, 'eval_accuracy': 0.9922838452398837, 'eval_runtime': 8.2867, 'eval_samples_per_second': 6368.172, 'eval_steps_per_second': 99.557, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 35
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 36...
✅ Loaded label2id from Batch 35: {'B-Organism': 0, 'I-Organism': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 35 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch36


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.03956,0.816598,0.83959,0.79483,0.991504


Test Set Performance:
{'eval_loss': 0.038125406950712204, 'eval_f1': 0.7765089722675367, 'eval_precision': 0.7880794701986755, 'eval_recall': 0.7652733118971061, 'eval_accuracy': 0.9921882468623247, 'eval_runtime': 8.3062, 'eval_samples_per_second': 6353.217, 'eval_steps_per_second': 99.324, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 36
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 37...
✅ Loaded label2id from Batch 36: {'B-Organism': 0, 'E-Organism': 1, 'I-Organism': 2, 'O': 3, 'S-Organism': 4}
After splitting: Train=75068, Dev=81071, Test=104643


Map:   0%|          | 0/75068 [00:00<?, ? examples/s]

Map:   0%|          | 0/81071 [00:00<?, ? examples/s]

Map:   0%|          | 0/104643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 36 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch37


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.246165,0.675911,0.80802,0.580931,0.925192
1,No log,0.242448,0.667994,0.791221,0.577978,0.923065


Test Set Performance:
{'eval_loss': 0.26181691884994507, 'eval_f1': 0.6565110919043502, 'eval_precision': 0.7569969271655178, 'eval_recall': 0.5795765244484008, 'eval_accuracy': 0.9268153701453394, 'eval_runtime': 17.4302, 'eval_samples_per_second': 6003.543, 'eval_steps_per_second': 93.86, 'epoch': 1.9957374254049447}
✅ Model trained and saved for batch 37
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 38...
✅ Loaded label2id from Batch 37: {'B-Protein': 0, 'I-Protein': 1, 'O': 2}
After splitting: Train=75068, Dev=81071, Test=104643


Map:   0%|          | 0/75068 [00:00<?, ? examples/s]

Map:   0%|          | 0/81071 [00:00<?, ? examples/s]

Map:   0%|          | 0/104643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 37 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch38


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.309494,0.699785,0.805956,0.618331,0.924497
1,No log,0.318969,0.672706,0.812554,0.573927,0.920137


Test Set Performance:
{'eval_loss': 0.32413554191589355, 'eval_f1': 0.6563930841249231, 'eval_precision': 0.7499219237976265, 'eval_recall': 0.5836067565925386, 'eval_accuracy': 0.9239181924906803, 'eval_runtime': 17.7416, 'eval_samples_per_second': 5898.165, 'eval_steps_per_second': 92.213, 'epoch': 1.9957374254049447}
✅ Model trained and saved for batch 38
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 39...
✅ Loaded label2id from Batch 38: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 38 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch39


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.035732,0.80427,0.798587,0.810036,0.992774


Test Set Performance:
{'eval_loss': 0.01997540518641472, 'eval_f1': 0.8114704087858451, 'eval_precision': 0.8407079646017699, 'eval_recall': 0.7841981132075472, 'eval_accuracy': 0.9952293905212002, 'eval_runtime': 7.5446, 'eval_samples_per_second': 6123.094, 'eval_steps_per_second': 95.698, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 39
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 40...
✅ Loaded label2id from Batch 39: {'B-Cellular_component': 0, 'I-Cellular_component': 1, 'O': 2}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 39 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch40


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.042256,0.813389,0.803306,0.823729,0.99173


Test Set Performance:
{'eval_loss': 0.02628585882484913, 'eval_f1': 0.8385624643468339, 'eval_precision': 0.8526682134570766, 'eval_recall': 0.8249158249158249, 'eval_accuracy': 0.9943228261031105, 'eval_runtime': 7.523, 'eval_samples_per_second': 6140.645, 'eval_steps_per_second': 95.973, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 40
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🎉 All detected batches processed successfully!
