In [1]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn

In [2]:
MAX_LEN = 512  # BERT's max sequence length
DATA_DIR = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/Batch 2"

In [3]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return Dataset.from_list(data)


In [4]:
def split_long_sentence(tokens, tags, max_len=MAX_LEN - 2):  # -2 for [CLS] and [SEP]
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_tags = tags[i:i+max_len]
        chunks.append({'tokens': chunk_tokens, 'tags': chunk_tags})
    return chunks

In [5]:
def preprocess_dataset(dataset):
    split_data = []
    for example in dataset:
        split_sentences = split_long_sentence(example['tokens'], example['tags'])
        split_data.extend(split_sentences)
    return Dataset.from_list(split_data)

In [6]:
def tokenize_and_align_labels(example, tokenizer, label2id):
    tokenized = tokenizer(example['tokens'], truncation=True, max_length=512, is_split_into_words=True)
    word_ids = tokenized.word_ids()

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))
        else:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))

        previous_word_idx = word_idx

    tokenized['labels'] = aligned_labels
    return tokenized

In [7]:
def compute_metrics(p, id2label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for i in range(len(labels)):
        label_sequence = []
        prediction_sequence = []

        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Exclude padding tokens
                label_sequence.append(id2label.get(labels[i][j], "O"))
                prediction_sequence.append(id2label.get(predictions[i][j], "O"))

        true_labels.append(label_sequence)
        true_predictions.append(prediction_sequence)

    report = classification_report(true_labels, true_predictions, output_dict=True)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "accuracy": accuracy
    }

In [8]:
def detect_batches():
    batch_numbers = set()
    for file in os.listdir(DATA_DIR):
        if file.startswith("combined_train_") and file.endswith(".jsonl"):
            batch_num = file.split("_")[-1].replace(".jsonl", "")
            if batch_num.isdigit():
                batch_numbers.add(int(batch_num))
    return sorted(batch_numbers)

In [9]:
def train_and_evaluate(batch_num):
    dataset_prefix = f"combined_{batch_num}"

    train_file = os.path.join(DATA_DIR, f"combined_train_{batch_num}.jsonl")
    dev_file = os.path.join(DATA_DIR, f"combined_dev_{batch_num}.jsonl")
    test_file = os.path.join(DATA_DIR, f"combined_test_{batch_num}.jsonl")

    if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file):
        print(f"🚨 Skipping batch {batch_num}, files not found.")
        return

    print(f"🚀 Processing batch {batch_num}...")

    
    model_path = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch{batch_num-1}"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Load label2id from Batch 1 to ensure consistent label space
    with open(os.path.join(model_path, "label2id.json"), "r") as f:
        old_label2id = json.load(f)
    id2label = {v: k for k, v in old_label2id.items()}
    print(f"✅ Loaded label2id from Batch {batch_num - 1}: {old_label2id}")

    train_dataset = preprocess_dataset(load_jsonl(train_file))
    dev_dataset = preprocess_dataset(load_jsonl(dev_file))
    test_dataset = preprocess_dataset(load_jsonl(test_file))
    print(f"After splitting: Train={len(train_dataset)}, Dev={len(dev_dataset)}, Test={len(test_dataset)}")

    unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
    label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
    id2label = {v: k for k, v in label2id.items()}

    train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    dev_dataset = dev_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))

    # Load the number of labels from batch 1
    old_num_labels = len(old_label2id)  # Should match previous training setup
    new_num_labels = len(label2id)  # Ensure this matches the new dataset

    # Load the previously trained model from batch 1
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=old_num_labels
    )

    # Extract the old classifier layer
    old_classifier = model.classifier

    # Create a new classifier layer with updated label count
    new_classifier = nn.Linear(old_classifier.in_features, new_num_labels)

    # Transfer weights from the old classifier to the new one (for common labels)
    with torch.no_grad():
        num_common_labels = min(old_num_labels, new_num_labels)
        new_classifier.weight[:num_common_labels, :] = old_classifier.weight[:num_common_labels, :]
        new_classifier.bias[:num_common_labels] = old_classifier.bias[:num_common_labels]

    # Assign the updated classifier to the model
    model.classifier = new_classifier

    # Save updated model before continuing training
    model.save_pretrained("/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/PubMedBERT-updated")
    print(f"✅ Model updated to support new label set while keeping batch {batch_num - 1} training.")

    model_checkpoint = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch{batch_num}"
    # Create the directory if it does not exist
    os.makedirs(model_checkpoint, exist_ok=True)
    print(f"Directory ensured: {model_checkpoint}")
    model_path_2 = "/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/PubMedBERT-updated"
    model = AutoModelForTokenClassification.from_pretrained(model_path_2, num_labels=len(label2id))

    training_args = TrainingArguments(
        output_dir="PubMedBERT-finetuned-mtl",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        learning_rate=3e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=False,
        save_total_limit=2,
        gradient_accumulation_steps=8,  # This simulates batch size 4 * 4 = 16
        fp16=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, id2label)
    )

    trainer.train()

    metrics = trainer.evaluate(test_dataset)
    print("Test Set Performance:")
    print(metrics)
    
    model.save_pretrained(model_checkpoint)
    tokenizer.save_pretrained(model_checkpoint)

    print(f"✅ Model trained and saved for batch {batch_num}")

    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=2)

        # Save label2id to file
    with open(f"{model_checkpoint}/label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")

    # Save label2id to file
    with open("label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")
    print("✅ Saved label2id mapping to model directory.")

    # results = classification_report(labels, predictions, output_dict=True)
    # with open(f"{model_checkpoint}_evaluation.json", "w") as f:
    #     json.dump(results, f)

    # print(f"📊 Evaluation results saved for batch {batch_num}")

In [10]:
batch_numbers = detect_batches()
print(f"📝 Detected batches: {batch_numbers}")

for batch_num in batch_numbers:
    train_and_evaluate(batch_num)

print("🎉 All detected batches processed successfully!")

📝 Detected batches: [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
🚀 Processing batch 21...
✅ Loaded label2id from Batch 20: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 20 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch21


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.240032,0.784922,0.792816,0.777183,0.951812


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.24300816655158997, 'eval_f1': 0.7457048717345257, 'eval_precision': 0.7755476685840167, 'eval_recall': 0.7180736543909348, 'eval_accuracy': 0.9439249588332157, 'eval_runtime': 8.751, 'eval_samples_per_second': 7044.125, 'eval_steps_per_second': 110.159, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 21
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 22...
✅ Loaded label2id from Batch 21: {'B-Chemical': 0, 'B-Organism': 1, 'B-Protein': 2, 'B-Regulon-operon': 3, 'I-Chemical': 4, 'I-Organism': 5, 'I-Protein': 6, 'I-Regulon-operon': 7, 'O': 8}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 21 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch22


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.266698,0.77801,0.781171,0.774875,0.948564


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.31734344363212585, 'eval_f1': 0.7585497835497835, 'eval_precision': 0.8013949233935513, 'eval_recall': 0.7200534209985617, 'eval_accuracy': 0.9319264705882353, 'eval_runtime': 8.8343, 'eval_samples_per_second': 6977.698, 'eval_steps_per_second': 109.12, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 22
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 23...
✅ Loaded label2id from Batch 22: {'B-Chemical': 0, 'B-Organism': 1, 'B-Protein': 2, 'B-Regulon-operon': 3, 'E-Chemical': 4, 'E-Organism': 5, 'E-Protein': 6, 'E-Regulon-operon': 7, 'I-Chemical': 8, 'I-Organism': 9, 'I-Protein': 10, 'O': 11, 'S-Chemical': 12, 'S-Organism': 13, 'S-Protein': 14, 'S-Regulon-operon': 15}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 22 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch23


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.153291,0.496046,0.708419,0.381637,0.970425


Test Set Performance:
{'eval_loss': 0.13775230944156647, 'eval_f1': 0.5350958853099982, 'eval_precision': 0.6175333046841427, 'eval_recall': 0.4720762155059133, 'eval_accuracy': 0.9618913196894848, 'eval_runtime': 8.6592, 'eval_samples_per_second': 7118.797, 'eval_steps_per_second': 111.327, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 23
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 24...
✅ Loaded label2id from Batch 23: {'B-Organism': 0, 'I-Organism': 1, 'O': 2}
After splitting: Train=81856, Dev=22917, Test=61643


Map:   0%|          | 0/81856 [00:00<?, ? examples/s]

Map:   0%|          | 0/22917 [00:00<?, ? examples/s]

Map:   0%|          | 0/61643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 23 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch24


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.195784,0.477652,0.654479,0.37605,0.967546


Test Set Performance:
{'eval_loss': 0.1808229684829712, 'eval_f1': 0.6801535735380979, 'eval_precision': 0.7985436893203883, 'eval_recall': 0.592335390946502, 'eval_accuracy': 0.9577158315690426, 'eval_runtime': 8.6414, 'eval_samples_per_second': 7133.473, 'eval_steps_per_second': 111.556, 'epoch': 1.9882720875684128}
✅ Model trained and saved for batch 24
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 25...
✅ Loaded label2id from Batch 24: {'B-Organism': 0, 'E-Organism': 1, 'I-Organism': 2, 'O': 3, 'S-Organism': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 24 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch25


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.013473,0.538462,0.636364,0.466667,0.996065


Test Set Performance:
{'eval_loss': 0.011795414611697197, 'eval_f1': 0.6814159292035398, 'eval_precision': 0.7897435897435897, 'eval_recall': 0.5992217898832685, 'eval_accuracy': 0.9971174287464418, 'eval_runtime': 7.2695, 'eval_samples_per_second': 7259.235, 'eval_steps_per_second': 113.488, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 25
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 26...
✅ Loaded label2id from Batch 25: {'B-Cellular_component': 0, 'I-Cellular_component': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 25 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch26


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.01542,0.668966,0.625806,0.718519,0.995757


Test Set Performance:
{'eval_loss': 0.012548900209367275, 'eval_f1': 0.7173489278752436, 'eval_precision': 0.736, 'eval_recall': 0.6996197718631179, 'eval_accuracy': 0.9970093323244333, 'eval_runtime': 7.2241, 'eval_samples_per_second': 7304.867, 'eval_steps_per_second': 114.201, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 26
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 27...
✅ Loaded label2id from Batch 26: {'B-Cellular_component': 0, 'E-Cellular_component': 1, 'I-Cellular_component': 2, 'O': 3, 'S-Cellular_component': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 26 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch27


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.098214,0.651529,0.721026,0.594252,0.967629


Test Set Performance:
{'eval_loss': 0.0937170535326004, 'eval_f1': 0.6692493946731236, 'eval_precision': 0.7430107526881721, 'eval_recall': 0.6088105726872247, 'eval_accuracy': 0.9687601340395633, 'eval_runtime': 7.0283, 'eval_samples_per_second': 7508.371, 'eval_steps_per_second': 117.383, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 27
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 28...
✅ Loaded label2id from Batch 27: {'B-Cell': 0, 'I-Cell': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 27 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch28


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.116769,0.653654,0.707965,0.607083,0.966671


Test Set Performance:
{'eval_loss': 0.111765556037426, 'eval_f1': 0.6862838158519222, 'eval_precision': 0.7461300309597523, 'eval_recall': 0.6353251318101933, 'eval_accuracy': 0.9676251216084748, 'eval_runtime': 7.2313, 'eval_samples_per_second': 7297.58, 'eval_steps_per_second': 114.087, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 28
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 29...
✅ Loaded label2id from Batch 28: {'B-Cell': 0, 'E-Cell': 1, 'I-Cell': 2, 'O': 3, 'S-Cell': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 28 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch29


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.083072,0.673493,0.783069,0.590818,0.978237


Test Set Performance:
{'eval_loss': 0.08391667157411575, 'eval_f1': 0.6143067846607669, 'eval_precision': 0.7572727272727273, 'eval_recall': 0.5167493796526055, 'eval_accuracy': 0.9795697762404064, 'eval_runtime': 6.9364, 'eval_samples_per_second': 7607.867, 'eval_steps_per_second': 118.938, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 29
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 30...
✅ Loaded label2id from Batch 29: {'B-Simple_chemical': 0, 'I-Simple_chemical': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 29 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch30


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.100028,0.703142,0.816352,0.617507,0.978682


Test Set Performance:
{'eval_loss': 0.10680578649044037, 'eval_f1': 0.6139173436948075, 'eval_precision': 0.7510803802938635, 'eval_recall': 0.519115890083632, 'eval_accuracy': 0.9778402334882715, 'eval_runtime': 6.991, 'eval_samples_per_second': 7548.426, 'eval_steps_per_second': 118.009, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 30
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 31...
✅ Loaded label2id from Batch 30: {'B-Simple_chemical': 0, 'E-Simple_chemical': 1, 'I-Simple_chemical': 2, 'O': 3, 'S-Simple_chemical': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 30 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch31


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.191038,0.705824,0.81539,0.622216,0.931905


Test Set Performance:
{'eval_loss': 0.15393517911434174, 'eval_f1': 0.7381409377570607, 'eval_precision': 0.7783346183500386, 'eval_recall': 0.7018946636537459, 'eval_accuracy': 0.9420783338738155, 'eval_runtime': 7.1289, 'eval_samples_per_second': 7402.411, 'eval_steps_per_second': 115.726, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 31
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 32...
✅ Loaded label2id from Batch 31: {'B-Gene_or_gene_product': 0, 'I-Gene_or_gene_product': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 31 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch32


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.255951,0.71029,0.831801,0.619754,0.919826


Test Set Performance:
{'eval_loss': 0.2079063057899475, 'eval_f1': 0.7365728900255755, 'eval_precision': 0.7577572128470332, 'eval_recall': 0.7165408373369938, 'eval_accuracy': 0.931214643461968, 'eval_runtime': 7.1065, 'eval_samples_per_second': 7425.762, 'eval_steps_per_second': 116.091, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 32
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 33...
✅ Loaded label2id from Batch 32: {'B-Gene_or_gene_product': 0, 'E-Gene_or_gene_product': 1, 'I-Gene_or_gene_product': 2, 'O': 3, 'S-Gene_or_gene_product': 4}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 32 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch33


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.518682,0.676276,0.6757,0.676852,0.851663


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.5061236619949341, 'eval_f1': 0.6654605977277662, 'eval_precision': 0.662536023054755, 'eval_recall': 0.6684111062654455, 'eval_accuracy': 0.8596367960220517, 'eval_runtime': 7.3749, 'eval_samples_per_second': 7155.44, 'eval_steps_per_second': 111.865, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 33
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 34...
✅ Loaded label2id from Batch 33: {'B-Amino_acid': 0, 'B-Anatomical_system': 1, 'B-Cancer': 2, 'B-Cell': 3, 'B-Cellular_component': 4, 'B-Developing_anatomical_structure': 5, 'B-Gene_or_gene_product': 6, 'B-Immaterial_anatomical_entity': 7, 'B-Multi-tissue_structure': 8, 'B-Organ': 9, 'B-Organism': 10, 'B-Organism_subdivision': 11, 'B-Organism_substance': 12, 'B-Pathological_formation': 13, 'B-Simple_chemical': 14, 'B-Tissue': 15, 'I-Amino_acid': 16, 'I-Anatomical_system': 17, 'I-Cancer': 18, 'I-Cell': 19, 'I

Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 33 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch34


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.637491,0.685915,0.690947,0.680955,0.837525


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Performance:
{'eval_loss': 0.6044006943702698, 'eval_f1': 0.6725081363327491, 'eval_precision': 0.6709484050524513, 'eval_recall': 0.6740751362202466, 'eval_accuracy': 0.8475397726249032, 'eval_runtime': 7.7054, 'eval_samples_per_second': 6848.569, 'eval_steps_per_second': 107.068, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 34
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 35...
✅ Loaded label2id from Batch 34: {'B-Amino_acid': 0, 'B-Anatomical_system': 1, 'B-Cancer': 2, 'B-Cell': 3, 'B-Cellular_component': 4, 'B-Developing_anatomical_structure': 5, 'B-Gene_or_gene_product': 6, 'B-Immaterial_anatomical_entity': 7, 'B-Multi-tissue_structure': 8, 'B-Organ': 9, 'B-Organism': 10, 'B-Organism_subdivision': 11, 'B-Organism_substance': 12, 'B-Pathological_formation': 13, 'B-Simple_chemical': 14, 'B-Tissue': 15, 'E-Amino_acid': 16, 'E-Anatomical_system': 17, 'E-Cancer': 18, 'E-Cell': 19, 

Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 34 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch35


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.028438,0.789598,0.774942,0.804819,0.992677


Test Set Performance:
{'eval_loss': 0.027232171967625618, 'eval_f1': 0.79879427279578, 'eval_precision': 0.8166409861325116, 'eval_recall': 0.7817109144542773, 'eval_accuracy': 0.9944690664072352, 'eval_runtime': 7.0447, 'eval_samples_per_second': 7490.894, 'eval_steps_per_second': 117.11, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 35
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 36...
✅ Loaded label2id from Batch 35: {'B-Organism': 0, 'I-Organism': 1, 'O': 2}
After splitting: Train=83467, Dev=27599, Test=52771


Map:   0%|          | 0/83467 [00:00<?, ? examples/s]

Map:   0%|          | 0/27599 [00:00<?, ? examples/s]

Map:   0%|          | 0/52771 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 35 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch36


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.034432,0.806936,0.793182,0.821176,0.992711


Test Set Performance:
{'eval_loss': 0.03220110386610031, 'eval_f1': 0.8032424465733234, 'eval_precision': 0.8220211161387632, 'eval_recall': 0.7853025936599424, 'eval_accuracy': 0.9937844557345152, 'eval_runtime': 7.1811, 'eval_samples_per_second': 7348.636, 'eval_steps_per_second': 114.886, 'epoch': 1.993103448275862}
✅ Model trained and saved for batch 36
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 37...
✅ Loaded label2id from Batch 36: {'B-Organism': 0, 'E-Organism': 1, 'I-Organism': 2, 'O': 3, 'S-Organism': 4}
After splitting: Train=75068, Dev=81071, Test=104643


Map:   0%|          | 0/75068 [00:00<?, ? examples/s]

Map:   0%|          | 0/81071 [00:00<?, ? examples/s]

Map:   0%|          | 0/104643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 36 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch37


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.199508,0.681725,0.778543,0.606323,0.941632
1,No log,0.205671,0.67841,0.784014,0.597877,0.941202


Test Set Performance:
{'eval_loss': 0.190134197473526, 'eval_f1': 0.6503453310124094, 'eval_precision': 0.7675281643472498, 'eval_recall': 0.5642049883086516, 'eval_accuracy': 0.9430461142752159, 'eval_runtime': 15.7248, 'eval_samples_per_second': 6654.661, 'eval_steps_per_second': 104.04, 'epoch': 1.9957374254049447}
✅ Model trained and saved for batch 37
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 38...
✅ Loaded label2id from Batch 37: {'B-Protein': 0, 'I-Protein': 1, 'O': 2}
After splitting: Train=75068, Dev=81071, Test=104643


Map:   0%|          | 0/75068 [00:00<?, ? examples/s]

Map:   0%|          | 0/81071 [00:00<?, ? examples/s]

Map:   0%|          | 0/104643 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 37 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch38


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,No log,0.254859,0.655415,0.770374,0.570311,0.935779
1,No log,0.236614,0.680435,0.773811,0.607167,0.937813


Test Set Performance:
{'eval_loss': 0.2262895554304123, 'eval_f1': 0.6402175119298635, 'eval_precision': 0.7525436994521263, 'eval_recall': 0.5570683661645423, 'eval_accuracy': 0.9398178525498019, 'eval_runtime': 15.9052, 'eval_samples_per_second': 6579.163, 'eval_steps_per_second': 102.859, 'epoch': 1.9957374254049447}
✅ Model trained and saved for batch 38
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 39...
✅ Loaded label2id from Batch 38: {'B-Protein': 0, 'E-Protein': 1, 'I-Protein': 2, 'O': 3, 'S-Protein': 4}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 38 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch39


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.025405,0.79292,0.8,0.785965,0.994954


Test Set Performance:
{'eval_loss': 0.016089800745248795, 'eval_f1': 0.7991452991452991, 'eval_precision': 0.7840670859538784, 'eval_recall': 0.8148148148148148, 'eval_accuracy': 0.9956865848832608, 'eval_runtime': 6.5221, 'eval_samples_per_second': 7082.991, 'eval_steps_per_second': 110.7, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 39
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 40...
✅ Loaded label2id from Batch 39: {'B-Cellular_component': 0, 'I-Cellular_component': 1, 'O': 2}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 39 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/PubMedBERT/pubmedbert-finetuned-batch40


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.030218,0.787346,0.797153,0.777778,0.993944


Test Set Performance:
{'eval_loss': 0.019268926233053207, 'eval_f1': 0.8144220572640509, 'eval_precision': 0.8067226890756303, 'eval_recall': 0.8222698072805139, 'eval_accuracy': 0.9953502176493866, 'eval_runtime': 6.4472, 'eval_samples_per_second': 7165.327, 'eval_steps_per_second': 111.987, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 40
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🎉 All detected batches processed successfully!
