In [1]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn

In [2]:
MAX_LEN = 512  # BERT's max sequence length
DATA_DIR = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files"

In [3]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return Dataset.from_list(data)


In [4]:
def split_long_sentence(tokens, tags, max_len=MAX_LEN - 2):  # -2 for [CLS] and [SEP]
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_tags = tags[i:i+max_len]
        chunks.append({'tokens': chunk_tokens, 'tags': chunk_tags})
    return chunks

In [5]:
def preprocess_dataset(dataset):
    split_data = []
    for example in dataset:
        split_sentences = split_long_sentence(example['tokens'], example['tags'])
        split_data.extend(split_sentences)
    return Dataset.from_list(split_data)

In [6]:
def tokenize_and_align_labels(example, tokenizer, label2id):
    tokenized = tokenizer(example['tokens'], truncation=True, max_length=512, is_split_into_words=True)
    word_ids = tokenized.word_ids()

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))
        else:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))

        previous_word_idx = word_idx

    tokenized['labels'] = aligned_labels
    return tokenized

In [7]:
def compute_metrics(p, id2label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for i in range(len(labels)):
        label_sequence = []
        prediction_sequence = []

        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Exclude padding tokens
                label_sequence.append(id2label.get(labels[i][j], "O"))
                prediction_sequence.append(id2label.get(predictions[i][j], "O"))

        true_labels.append(label_sequence)
        true_predictions.append(prediction_sequence)

    report = classification_report(true_labels, true_predictions, output_dict=True)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "accuracy": accuracy
    }

In [8]:
def detect_batches():
    batch_numbers = set()
    for file in os.listdir(DATA_DIR):
        if file.startswith("combined_train_") and file.endswith(".jsonl"):
            batch_num = file.split("_")[-1].replace(".jsonl", "")
            if batch_num.isdigit():
                batch_numbers.add(int(batch_num))
    return sorted(batch_numbers)

In [9]:
def train_and_evaluate(batch_num):
    dataset_prefix = f"combined_{batch_num}"

    train_file = os.path.join(DATA_DIR, f"combined_train_{batch_num}.jsonl")
    dev_file = os.path.join(DATA_DIR, f"combined_dev_{batch_num}.jsonl")
    test_file = os.path.join(DATA_DIR, f"combined_test_{batch_num}.jsonl")

    if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file):
        print(f"🚨 Skipping batch {batch_num}, files not found.")
        return

    print(f"🚀 Processing batch {batch_num}...")

    
    model_path = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num-1}"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Load label2id from Batch 1 to ensure consistent label space
    with open(os.path.join(model_path, "label2id.json"), "r") as f:
        old_label2id = json.load(f)
    id2label = {v: k for k, v in old_label2id.items()}
    print(f"✅ Loaded label2id from Batch {batch_num - 1}: {old_label2id}")

    train_dataset = preprocess_dataset(load_jsonl(train_file))
    dev_dataset = preprocess_dataset(load_jsonl(dev_file))
    test_dataset = preprocess_dataset(load_jsonl(test_file))
    print(f"After splitting: Train={len(train_dataset)}, Dev={len(dev_dataset)}, Test={len(test_dataset)}")

    unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
    label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
    id2label = {v: k for k, v in label2id.items()}

    train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    dev_dataset = dev_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))

    # Load the number of labels from batch 1
    old_num_labels = len(old_label2id)  # Should match previous training setup
    new_num_labels = len(label2id)  # Ensure this matches the new dataset

    # Load the previously trained model from batch 1
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=old_num_labels
    )

    # Extract the old classifier layer
    old_classifier = model.classifier

    # Create a new classifier layer with updated label count
    new_classifier = nn.Linear(old_classifier.in_features, new_num_labels)

    # Transfer weights from the old classifier to the new one (for common labels)
    with torch.no_grad():
        num_common_labels = min(old_num_labels, new_num_labels)
        new_classifier.weight[:num_common_labels, :] = old_classifier.weight[:num_common_labels, :]
        new_classifier.bias[:num_common_labels] = old_classifier.bias[:num_common_labels]

    # Assign the updated classifier to the model
    model.classifier = new_classifier

    # Save updated model before continuing training
    model.save_pretrained("/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated")
    print(f"✅ Model updated to support new label set while keeping batch {batch_num - 1} training.")

    model_checkpoint = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num}"
    # Create the directory if it does not exist
    os.makedirs(model_checkpoint, exist_ok=True)
    print(f"Directory ensured: {model_checkpoint}")
    model_path_2 = "/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated"
    model = AutoModelForTokenClassification.from_pretrained(model_path_2, num_labels=len(label2id))

    training_args = TrainingArguments(
        output_dir="BioBERT-finetuned-mtl",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        learning_rate=3e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=False,
        save_total_limit=2,
        gradient_accumulation_steps=8,  # This simulates batch size 4 * 4 = 16
        fp16=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, id2label)
    )

    trainer.train()

    metrics = trainer.evaluate(test_dataset)
    print("Test Set Performance:")
    print(metrics)
    
    model.save_pretrained(model_checkpoint)
    tokenizer.save_pretrained(model_checkpoint)

    print(f"✅ Model trained and saved for batch {batch_num}")

    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=2)

        # Save label2id to file
    with open(f"{model_checkpoint}/label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")

    # Save label2id to file
    with open("label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")
    print("✅ Saved label2id mapping to model directory.")

    # results = classification_report(labels, predictions, output_dict=True)
    # with open(f"{model_checkpoint}_evaluation.json", "w") as f:
    #     json.dump(results, f)

    # print(f"📊 Evaluation results saved for batch {batch_num}")

In [10]:
batch_numbers = detect_batches()
print(f"📝 Detected batches: {batch_numbers}")

for batch_num in batch_numbers:
    train_and_evaluate(batch_num)

print("🎉 All detected batches processed successfully!")

📝 Detected batches: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
🚀 Processing batch 2...
✅ Loaded label2id from Batch 1: {'B-Anatomy': 0, 'I-Anatomy': 1, 'O': 2}
After splitting: Train=153823, Dev=58785, Test=99976


Map:   0%|          | 0/153823 [00:00<?, ? examples/s]

Map:   0%|          | 0/58785 [00:00<?, ? examples/s]

Map:   0%|          | 0/99976 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 1 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch2


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.1766,0.172605,0.764258,0.762046,0.766483,0.944408


Test Set Performance:
{'eval_loss': 0.20966263115406036, 'eval_f1': 0.790495138706408, 'eval_precision': 0.8374960229080496, 'eval_recall': 0.7484893722897562, 'eval_accuracy': 0.9342969776609724, 'eval_runtime': 16.0853, 'eval_samples_per_second': 6215.351, 'eval_steps_per_second': 97.169, 'epoch': 1.995008319467554}
✅ Model trained and saved for batch 2
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 3...
✅ Loaded label2id from Batch 2: {'B-Anatomy': 0, 'E-Anatomy': 1, 'I-Anatomy': 2, 'O': 3, 'S-Anatomy': 4}
After splitting: Train=355405, Dev=71042, Test=143465


Map:   0%|          | 0/355405 [00:00<?, ? examples/s]

Map:   0%|          | 0/71042 [00:00<?, ? examples/s]

Map:   0%|          | 0/143465 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 2 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch3


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.2436,0.246997,0.615185,0.708498,0.543591,0.904448


Test Set Performance:
{'eval_loss': 0.2478257417678833, 'eval_f1': 0.6257952833794204, 'eval_precision': 0.715468051200329, 'eval_recall': 0.5560971711682915, 'eval_accuracy': 0.9054865179297635, 'eval_runtime': 23.2633, 'eval_samples_per_second': 6167.014, 'eval_steps_per_second': 96.375, 'epoch': 1.9981994958588405}
✅ Model trained and saved for batch 3
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 4...
✅ Loaded label2id from Batch 3: {'B-GENE': 0, 'I-GENE': 1, 'O': 2}
After splitting: Train=355405, Dev=71042, Test=143465


Map:   0%|          | 0/355405 [00:00<?, ? examples/s]

Map:   0%|          | 0/71042 [00:00<?, ? examples/s]

Map:   0%|          | 0/143465 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 3 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch4


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.2687,0.328689,0.640655,0.761125,0.55311,0.887403


Test Set Performance:
{'eval_loss': 0.3303717076778412, 'eval_f1': 0.6452529967115732, 'eval_precision': 0.7689623786407767, 'eval_recall': 0.5558317189955774, 'eval_accuracy': 0.8863058406860979, 'eval_runtime': 23.6322, 'eval_samples_per_second': 6070.751, 'eval_steps_per_second': 94.871, 'epoch': 1.9981994958588405}
✅ Model trained and saved for batch 4
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 5...
✅ Loaded label2id from Batch 4: {'B-GENE': 0, 'E-GENE': 1, 'I-GENE': 2, 'O': 3, 'S-GENE': 4}
After splitting: Train=891948, Dev=886324, Test=766033


Map:   0%|          | 0/891948 [00:00<?, ? examples/s]

Map:   0%|          | 0/886324 [00:00<?, ? examples/s]

Map:   0%|          | 0/766033 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 4 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch5


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.1177,0.161155,0.670918,0.770898,0.593895,0.944228


Test Set Performance:
{'eval_loss': 0.1528538167476654, 'eval_f1': 0.681120329589038, 'eval_precision': 0.7938453375779281, 'eval_recall': 0.5964282866937735, 'eval_accuracy': 0.9469068440590976, 'eval_runtime': 141.9032, 'eval_samples_per_second': 5398.278, 'eval_steps_per_second': 84.353, 'epoch': 1.9993542369232977}
✅ Model trained and saved for batch 5
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 6...
✅ Loaded label2id from Batch 5: {'B-Chemical': 0, 'I-Chemical': 1, 'O': 2}
After splitting: Train=891948, Dev=886324, Test=766033


Map:   0%|          | 0/891948 [00:00<?, ? examples/s]

Map:   0%|          | 0/886324 [00:00<?, ? examples/s]

Map:   0%|          | 0/766033 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 5 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch6


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.1263,0.213384,0.738789,0.844049,0.656872,0.937243


Test Set Performance:
{'eval_loss': 0.20661300420761108, 'eval_f1': 0.7411235240084364, 'eval_precision': 0.8556413963077943, 'eval_recall': 0.6536410952704231, 'eval_accuracy': 0.9392072536178971, 'eval_runtime': 150.2596, 'eval_samples_per_second': 5098.062, 'eval_steps_per_second': 79.662, 'epoch': 1.9993542369232977}
✅ Model trained and saved for batch 6
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 7...
✅ Loaded label2id from Batch 6: {'B-Chemical': 0, 'E-Chemical': 1, 'I-Chemical': 2, 'O': 3, 'S-Chemical': 4}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 6 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch7


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.100787,0.880105,0.923149,0.840897,0.971145
1,No log,0.106084,0.88127,0.927243,0.83964,0.971176


Test Set Performance:
{'eval_loss': 0.10571897029876709, 'eval_f1': 0.8676674858425046, 'eval_precision': 0.9132366171839856, 'eval_recall': 0.8264298799104417, 'eval_accuracy': 0.9708850794844379, 'eval_runtime': 19.9792, 'eval_samples_per_second': 6243.986, 'eval_steps_per_second': 97.601, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 7
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 8...
✅ Loaded label2id from Batch 7: {'B-Chemical': 0, 'I-Chemical': 1, 'O': 2}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 7 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch8


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.144749,0.90105,0.945474,0.860613,0.968304
1,No log,0.149506,0.897754,0.936893,0.861754,0.966536


Test Set Performance:
{'eval_loss': 0.14484098553657532, 'eval_f1': 0.9035407182599898, 'eval_precision': 0.9451822847769723, 'eval_recall': 0.8654134974080713, 'eval_accuracy': 0.9686057031406046, 'eval_runtime': 20.1857, 'eval_samples_per_second': 6180.132, 'eval_steps_per_second': 96.603, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 8
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 9...
✅ Loaded label2id from Batch 8: {'B-Chemical': 0, 'E-Chemical': 1, 'I-Chemical': 2, 'O': 3, 'S-Chemical': 4}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 8 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch9


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.153595,0.716795,0.74933,0.686969,0.948536
1,No log,0.158268,0.700827,0.737971,0.667242,0.946476


Test Set Performance:
{'eval_loss': 0.1503576934337616, 'eval_f1': 0.7106342263531985, 'eval_precision': 0.7422371332714683, 'eval_recall': 0.6816125860373649, 'eval_accuracy': 0.9491135105920504, 'eval_runtime': 20.0068, 'eval_samples_per_second': 6235.39, 'eval_steps_per_second': 97.467, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 9
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 10...
✅ Loaded label2id from Batch 9: {'B-Disease': 0, 'I-Disease': 1, 'O': 2}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 9 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch10


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.191461,0.798998,0.852864,0.751532,0.94206
1,No log,0.185178,0.797884,0.824453,0.772974,0.941483


Test Set Performance:
{'eval_loss': 0.18831795454025269, 'eval_f1': 0.7977997664556573, 'eval_precision': 0.8532833760599488, 'eval_recall': 0.7490911189335796, 'eval_accuracy': 0.94497186027658, 'eval_runtime': 20.1639, 'eval_samples_per_second': 6186.786, 'eval_steps_per_second': 96.707, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 10
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 11...
✅ Loaded label2id from Batch 10: {'B-Disease': 0, 'E-Disease': 1, 'I-Disease': 2, 'O': 3, 'S-Disease': 4}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 10 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch11


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.313149,0.799268,0.81894,0.780518,0.914011
1,No log,0.286993,0.800184,0.831109,0.771478,0.916008


Test Set Performance:
{'eval_loss': 0.2766253352165222, 'eval_f1': 0.7948037311108803, 'eval_precision': 0.8238947303695505, 'eval_recall': 0.7676970235196379, 'eval_accuracy': 0.9183536793128972, 'eval_runtime': 20.2293, 'eval_samples_per_second': 6166.806, 'eval_steps_per_second': 96.395, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 11
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 12...
✅ Loaded label2id from Batch 11: {'B-Chemical': 0, 'B-Disease': 1, 'I-Chemical': 2, 'I-Disease': 3, 'O': 4}
After splitting: Train=118170, Dev=117453, Test=124750


Map:   0%|          | 0/118170 [00:00<?, ? examples/s]

Map:   0%|          | 0/117453 [00:00<?, ? examples/s]

Map:   0%|          | 0/124750 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 11 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch12


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.400449,0.846761,0.877104,0.818447,0.90529
1,No log,0.357101,0.852024,0.876485,0.828892,0.907312


Test Set Performance:
{'eval_loss': 0.34355494379997253, 'eval_f1': 0.848222849654837, 'eval_precision': 0.8707735833356447, 'eval_recall': 0.8268106399789308, 'eval_accuracy': 0.908278600885902, 'eval_runtime': 20.8219, 'eval_samples_per_second': 5991.299, 'eval_steps_per_second': 93.652, 'epoch': 1.9962100703844072}
✅ Model trained and saved for batch 12
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 13...
✅ Loaded label2id from Batch 12: {'B-Chemical': 0, 'B-Disease': 1, 'E-Chemical': 2, 'E-Disease': 3, 'I-Chemical': 4, 'I-Disease': 5, 'O': 6, 'S-Chemical': 7, 'S-Disease': 8}
After splitting: Train=227742, Dev=44185, Test=74624


Map:   0%|          | 0/227742 [00:00<?, ? examples/s]

Map:   0%|          | 0/44185 [00:00<?, ? examples/s]

Map:   0%|          | 0/74624 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 12 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch13


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.204734,0.704141,0.76509,0.652186,0.926113
1,0.179700,0.206224,0.705732,0.788829,0.638474,0.92875


Test Set Performance:
{'eval_loss': 0.2000012993812561, 'eval_f1': 0.6980943699266398, 'eval_precision': 0.7969276511397423, 'eval_recall': 0.62107051826678, 'eval_accuracy': 0.9280095328515239, 'eval_runtime': 11.76, 'eval_samples_per_second': 6345.597, 'eval_steps_per_second': 99.15, 'epoch': 1.998033155380725}
✅ Model trained and saved for batch 13
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 14...
✅ Loaded label2id from Batch 13: {'B-Protein': 0, 'I-Protein': 1, 'O': 2}
After splitting: Train=227742, Dev=44185, Test=74624


Map:   0%|          | 0/227742 [00:00<?, ? examples/s]

Map:   0%|          | 0/44185 [00:00<?, ? examples/s]

Map:   0%|          | 0/74624 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 13 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch14


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.268861,0.715453,0.778575,0.661798,0.92062
1,0.166900,0.258083,0.717163,0.794504,0.653543,0.922924


Test Set Performance:
{'eval_loss': 0.25088897347450256, 'eval_f1': 0.7222404730617608, 'eval_precision': 0.8058279116649867, 'eval_recall': 0.6543641640002976, 'eval_accuracy': 0.9249675456782469, 'eval_runtime': 11.821, 'eval_samples_per_second': 6312.833, 'eval_steps_per_second': 98.638, 'epoch': 1.998033155380725}
✅ Model trained and saved for batch 14
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 15...
✅ Loaded label2id from Batch 14: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=161576, Dev=54760, Test=116105


Map:   0%|          | 0/161576 [00:00<?, ? examples/s]

Map:   0%|          | 0/54760 [00:00<?, ? examples/s]

Map:   0%|          | 0/116105 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 14 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch15


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.1537,0.193179,0.727205,0.815716,0.656022,0.940038


Test Set Performance:
{'eval_loss': 0.21118026971817017, 'eval_f1': 0.7038694245110454, 'eval_precision': 0.807359586830213, 'eval_recall': 0.6238962334746819, 'eval_accuracy': 0.9324187708656604, 'eval_runtime': 19.4892, 'eval_samples_per_second': 5957.417, 'eval_steps_per_second': 93.129, 'epoch': 1.9948514851485148}
✅ Model trained and saved for batch 15
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 16...
✅ Loaded label2id from Batch 15: {'B-Protein': 0, 'I-Protein': 1, 'O': 2}
After splitting: Train=161576, Dev=54760, Test=116105


Map:   0%|          | 0/161576 [00:00<?, ? examples/s]

Map:   0%|          | 0/54760 [00:00<?, ? examples/s]

Map:   0%|          | 0/116105 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 15 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch16


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.133,0.267814,0.735359,0.808396,0.674426,0.932239


Test Set Performance:
{'eval_loss': 0.30205225944519043, 'eval_f1': 0.7202415332108165, 'eval_precision': 0.8160023795359905, 'eval_recall': 0.6445958646616541, 'eval_accuracy': 0.9252539145154465, 'eval_runtime': 19.8134, 'eval_samples_per_second': 5859.914, 'eval_steps_per_second': 91.605, 'epoch': 1.9948514851485148}
✅ Model trained and saved for batch 16
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 17...
✅ Loaded label2id from Batch 16: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 16 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch17


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.026199,0.718644,0.672304,0.771845,0.991367


Test Set Performance:
{'eval_loss': 0.03281928971409798, 'eval_f1': 0.4505716207128447, 'eval_precision': 0.48833819241982507, 'eval_recall': 0.418227215980025, 'eval_accuracy': 0.9889610070315189, 'eval_runtime': 9.5133, 'eval_samples_per_second': 6479.637, 'eval_steps_per_second': 101.331, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 17
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 18...
✅ Loaded label2id from Batch 17: {'B-Chemical': 0, 'I-Chemical': 1, 'O': 2}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 17 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch18


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.035827,0.735912,0.686598,0.792857,0.99173


Test Set Performance:
{'eval_loss': 0.03882693871855736, 'eval_f1': 0.5609123898392949, 'eval_precision': 0.52321083172147, 'eval_recall': 0.6044692737430167, 'eval_accuracy': 0.9888380783793087, 'eval_runtime': 9.5226, 'eval_samples_per_second': 6473.336, 'eval_steps_per_second': 101.233, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 18
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 19...
✅ Loaded label2id from Batch 18: {'B-Chemical': 0, 'E-Chemical': 1, 'I-Chemical': 2, 'O': 3, 'S-Chemical': 4}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 18 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch19


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.14011,0.710589,0.727939,0.694047,0.942996


Test Set Performance:
{'eval_loss': 0.11624313145875931, 'eval_f1': 0.8125822767269452, 'eval_precision': 0.7898706896551724, 'eval_recall': 0.8366386075046369, 'eval_accuracy': 0.9659856419334218, 'eval_runtime': 9.6263, 'eval_samples_per_second': 6403.604, 'eval_steps_per_second': 100.142, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 19
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 20...
✅ Loaded label2id from Batch 19: {'B-Protein': 0, 'I-Protein': 1, 'O': 2}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 19 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch20


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.163144,0.720217,0.740934,0.700626,0.945105


Test Set Performance:
{'eval_loss': 0.163540318608284, 'eval_f1': 0.8083643253105046, 'eval_precision': 0.8118393234672304, 'eval_recall': 0.8049189491335942, 'eval_accuracy': 0.9633795545065644, 'eval_runtime': 10.0069, 'eval_samples_per_second': 6160.062, 'eval_steps_per_second': 96.334, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 20
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🎉 All detected batches processed successfully!
