In [1]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score
import torch
import torch.nn as nn

In [2]:
MAX_LEN = 512  # BERT's max sequence length
DATA_DIR = "/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/New_Json_Files/Batch 3"

In [3]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return Dataset.from_list(data)


In [4]:
def split_long_sentence(tokens, tags, max_len=MAX_LEN - 2):  # -2 for [CLS] and [SEP]
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_tags = tags[i:i+max_len]
        chunks.append({'tokens': chunk_tokens, 'tags': chunk_tags})
    return chunks

In [5]:
def preprocess_dataset(dataset):
    split_data = []
    for example in dataset:
        split_sentences = split_long_sentence(example['tokens'], example['tags'])
        split_data.extend(split_sentences)
    return Dataset.from_list(split_data)

In [6]:
def tokenize_and_align_labels(example, tokenizer, label2id):
    tokenized = tokenizer(example['tokens'], truncation=True, max_length=512, is_split_into_words=True)
    word_ids = tokenized.word_ids()

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))
        else:
            aligned_labels.append(label2id.get(example['tags'][word_idx], -100))

        previous_word_idx = word_idx

    tokenized['labels'] = aligned_labels
    return tokenized

In [7]:
def compute_metrics(p, id2label):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for i in range(len(labels)):
        label_sequence = []
        prediction_sequence = []

        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Exclude padding tokens
                label_sequence.append(id2label.get(labels[i][j], "O"))
                prediction_sequence.append(id2label.get(predictions[i][j], "O"))

        true_labels.append(label_sequence)
        true_predictions.append(prediction_sequence)

    report = classification_report(true_labels, true_predictions, output_dict=True)
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "accuracy": accuracy
    }

In [8]:
def detect_batches():
    batch_numbers = set()
    for file in os.listdir(DATA_DIR):
        if file.startswith("combined_train_") and file.endswith(".jsonl"):
            batch_num = file.split("_")[-1].replace(".jsonl", "")
            if batch_num.isdigit():
                batch_numbers.add(int(batch_num))
    return sorted(batch_numbers)

In [9]:
def train_and_evaluate(batch_num):
    dataset_prefix = f"combined_{batch_num}"

    train_file = os.path.join(DATA_DIR, f"combined_train_{batch_num}.jsonl")
    dev_file = os.path.join(DATA_DIR, f"combined_dev_{batch_num}.jsonl")
    test_file = os.path.join(DATA_DIR, f"combined_test_{batch_num}.jsonl")

    if not os.path.exists(train_file) or not os.path.exists(dev_file) or not os.path.exists(test_file):
        print(f"🚨 Skipping batch {batch_num}, files not found.")
        return

    print(f"🚀 Processing batch {batch_num}...")

    
    model_path = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num-1}"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Load label2id from Batch 1 to ensure consistent label space
    with open(os.path.join(model_path, "label2id.json"), "r") as f:
        old_label2id = json.load(f)
    id2label = {v: k for k, v in old_label2id.items()}
    print(f"✅ Loaded label2id from Batch {batch_num - 1}: {old_label2id}")

    train_dataset = preprocess_dataset(load_jsonl(train_file))
    dev_dataset = preprocess_dataset(load_jsonl(dev_file))
    test_dataset = preprocess_dataset(load_jsonl(test_file))
    print(f"After splitting: Train={len(train_dataset)}, Dev={len(dev_dataset)}, Test={len(test_dataset)}")

    unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
    label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
    id2label = {v: k for k, v in label2id.items()}

    train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    dev_dataset = dev_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))
    test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label2id))

    # Load the number of labels from batch 1
    old_num_labels = len(old_label2id)  # Should match previous training setup
    new_num_labels = len(label2id)  # Ensure this matches the new dataset

    # Load the previously trained model from batch 1
    model = AutoModelForTokenClassification.from_pretrained(
        model_path, num_labels=old_num_labels
    )

    # Extract the old classifier layer
    old_classifier = model.classifier

    # Create a new classifier layer with updated label count
    new_classifier = nn.Linear(old_classifier.in_features, new_num_labels)

    # Transfer weights from the old classifier to the new one (for common labels)
    with torch.no_grad():
        num_common_labels = min(old_num_labels, new_num_labels)
        new_classifier.weight[:num_common_labels, :] = old_classifier.weight[:num_common_labels, :]
        new_classifier.bias[:num_common_labels] = old_classifier.bias[:num_common_labels]

    # Assign the updated classifier to the model
    model.classifier = new_classifier

    # Save updated model before continuing training
    model.save_pretrained("/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated")
    print(f"✅ Model updated to support new label set while keeping batch {batch_num - 1} training.")

    model_checkpoint = f"/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch{batch_num}"
    # Create the directory if it does not exist
    os.makedirs(model_checkpoint, exist_ok=True)
    print(f"Directory ensured: {model_checkpoint}")
    model_path_2 = "/media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/BioBERT-updated"
    model = AutoModelForTokenClassification.from_pretrained(model_path_2, num_labels=len(label2id))

    training_args = TrainingArguments(
        output_dir="BioBERT-finetuned-mtl",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        learning_rate=3e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=False,
        save_total_limit=2,
        gradient_accumulation_steps=8,  # This simulates batch size 4 * 4 = 16
        fp16=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda p: compute_metrics(p, id2label)
    )

    trainer.train()

    metrics = trainer.evaluate(test_dataset)
    print("Test Set Performance:")
    print(metrics)
    
    model.save_pretrained(model_checkpoint)
    tokenizer.save_pretrained(model_checkpoint)

    print(f"✅ Model trained and saved for batch {batch_num}")

    predictions, labels, _ = trainer.predict(test_dataset)
    predictions = np.argmax(predictions, axis=2)

        # Save label2id to file
    with open(f"{model_checkpoint}/label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")

    # Save label2id to file
    with open("label2id.json", "w") as f:
        json.dump(label2id, f)
    print("✅ Saved new_label2id mapping.")
    print("✅ Saved label2id mapping to model directory.")

    # results = classification_report(labels, predictions, output_dict=True)
    # with open(f"{model_checkpoint}_evaluation.json", "w") as f:
    #     json.dump(results, f)

    # print(f"📊 Evaluation results saved for batch {batch_num}")

In [10]:
batch_numbers = detect_batches()
print(f"📝 Detected batches: {batch_numbers}")

for batch_num in batch_numbers:
    train_and_evaluate(batch_num)

print("🎉 All detected batches processed successfully!")

📝 Detected batches: [41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67]
🚀 Processing batch 41...
✅ Loaded label2id from Batch 40: {'B-Cellular_component': 0, 'E-Cellular_component': 1, 'I-Cellular_component': 2, 'O': 3, 'S-Cellular_component': 4}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 40 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch41


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.109912,0.60166,0.721393,0.516014,0.965845


Test Set Performance:
{'eval_loss': 0.11798430979251862, 'eval_f1': 0.6156217882836589, 'eval_precision': 0.6575192096597146, 'eval_recall': 0.578743961352657, 'eval_accuracy': 0.9649263602181699, 'eval_runtime': 7.6458, 'eval_samples_per_second': 6041.998, 'eval_steps_per_second': 94.431, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 41
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 42...
✅ Loaded label2id from Batch 41: {'B-Simple_chemical': 0, 'I-Simple_chemical': 1, 'O': 2}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 41 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch42


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.160458,0.654442,0.793712,0.556751,0.961725


Test Set Performance:
{'eval_loss': 0.15749002993106842, 'eval_f1': 0.6608748666361836, 'eval_precision': 0.705958971019212, 'eval_recall': 0.6212034383954155, 'eval_accuracy': 0.9592343246095085, 'eval_runtime': 7.6742, 'eval_samples_per_second': 6019.683, 'eval_steps_per_second': 94.082, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 42
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 43...
✅ Loaded label2id from Batch 42: {'B-Simple_chemical': 0, 'E-Simple_chemical': 1, 'I-Simple_chemical': 2, 'O': 3, 'S-Simple_chemical': 4}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 42 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch43


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.254831,0.776897,0.781089,0.772749,0.913961


Test Set Performance:
{'eval_loss': 0.24930286407470703, 'eval_f1': 0.7937293729372938, 'eval_precision': 0.8241240469459437, 'eval_recall': 0.7654969364207846, 'eval_accuracy': 0.9175026379538396, 'eval_runtime': 7.7264, 'eval_samples_per_second': 5979.011, 'eval_steps_per_second': 93.446, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 43
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 44...
✅ Loaded label2id from Batch 43: {'B-Gene_or_gene_product': 0, 'I-Gene_or_gene_product': 1, 'O': 2}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 43 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch44


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.344404,0.774938,0.789895,0.760538,0.899597


Test Set Performance:
{'eval_loss': 0.3327817916870117, 'eval_f1': 0.7954769315629802, 'eval_precision': 0.8318226763348715, 'eval_recall': 0.7621744054360136, 'eval_accuracy': 0.9008426590574702, 'eval_runtime': 7.7096, 'eval_samples_per_second': 5991.99, 'eval_steps_per_second': 93.649, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 44
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 45...
✅ Loaded label2id from Batch 44: {'B-Gene_or_gene_product': 0, 'E-Gene_or_gene_product': 1, 'I-Gene_or_gene_product': 2, 'O': 3, 'S-Gene_or_gene_product': 4}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 44 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch45


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.5199,0.710035,0.736795,0.685151,0.872348


Test Set Performance:
{'eval_loss': 0.4710746705532074, 'eval_f1': 0.7273921971252567, 'eval_precision': 0.7539160045402952, 'eval_recall': 0.7026712509918012, 'eval_accuracy': 0.8764248666161368, 'eval_runtime': 8.1703, 'eval_samples_per_second': 5654.11, 'eval_steps_per_second': 88.368, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 45
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 46...
✅ Loaded label2id from Batch 45: {'B-Cellular_component': 0, 'B-Complex': 1, 'B-Gene_or_gene_product': 2, 'B-Simple_chemical': 3, 'I-Cellular_component': 4, 'I-Complex': 5, 'I-Gene_or_gene_product': 6, 'I-Simple_chemical': 7, 'O': 8}
After splitting: Train=69244, Dev=23646, Test=46196


Map:   0%|          | 0/69244 [00:00<?, ? examples/s]

Map:   0%|          | 0/23646 [00:00<?, ? examples/s]

Map:   0%|          | 0/46196 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 45 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch46


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,No log,0.623919,0.728112,0.758161,0.700353,0.855228


Test Set Performance:
{'eval_loss': 0.5868105888366699, 'eval_f1': 0.7470878079595704, 'eval_precision': 0.7767036200283718, 'eval_recall': 0.7196475513581929, 'eval_accuracy': 0.8534783836402277, 'eval_runtime': 7.8213, 'eval_samples_per_second': 5906.434, 'eval_steps_per_second': 92.312, 'epoch': 1.990757855822551}
✅ Model trained and saved for batch 46
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 47...
✅ Loaded label2id from Batch 46: {'B-Cellular_component': 0, 'B-Complex': 1, 'B-Gene_or_gene_product': 2, 'B-Simple_chemical': 3, 'E-Cellular_component': 4, 'E-Complex': 5, 'E-Gene_or_gene_product': 6, 'E-Simple_chemical': 7, 'I-Cellular_component': 8, 'I-Complex': 9, 'I-Gene_or_gene_product': 10, 'I-Simple_chemical': 11, 'O': 12, 'S-Cellular_component': 13, 'S-Complex': 14, 'S-Gene_or_gene_product': 15, 'S-Simple_chemical': 16}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 46 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch47


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0305,0.033178,0.569626,0.573503,0.565801,0.992006
1,0.0198,0.033769,0.585708,0.612305,0.561325,0.992651


Test Set Performance:
{'eval_loss': 0.051833271980285645, 'eval_f1': 0.6111702937148075, 'eval_precision': 0.7035739313244569, 'eval_recall': 0.5402206080172182, 'eval_accuracy': 0.9897561345621421, 'eval_runtime': 36.5891, 'eval_samples_per_second': 5381.498, 'eval_steps_per_second': 84.096, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 47
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 48...
✅ Loaded label2id from Batch 47: {'B-GO': 0, 'I-GO': 1, 'O': 2}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 47 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch48


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0444,0.042073,0.584017,0.59507,0.573367,0.991549
1,0.0197,0.040128,0.596745,0.638916,0.559796,0.992149


Test Set Performance:
{'eval_loss': 0.06329484283924103, 'eval_f1': 0.6097033374536466, 'eval_precision': 0.7635448916408669, 'eval_recall': 0.5074588477366255, 'eval_accuracy': 0.9886183486934426, 'eval_runtime': 39.7204, 'eval_samples_per_second': 4957.251, 'eval_steps_per_second': 77.466, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 48
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 49...
✅ Loaded label2id from Batch 48: {'B-GO': 0, 'E-GO': 1, 'I-GO': 2, 'O': 3, 'S-GO': 4}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 48 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch49


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0283,0.024631,0.779456,0.796843,0.762812,0.992396
1,0.0189,0.025315,0.752212,0.780367,0.726018,0.993146


Test Set Performance:
{'eval_loss': 0.02628129906952381, 'eval_f1': 0.801029391846133, 'eval_precision': 0.8575986078886311, 'eval_recall': 0.7514612452350699, 'eval_accuracy': 0.9913338643000721, 'eval_runtime': 38.0035, 'eval_samples_per_second': 5181.202, 'eval_steps_per_second': 80.966, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 49
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 50...
✅ Loaded label2id from Batch 49: {'B-CL': 0, 'I-CL': 1, 'O': 2}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 49 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch50


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0241,0.029633,0.77709,0.751497,0.804487,0.992996
1,0.0202,0.030512,0.756884,0.794781,0.722436,0.992809


Test Set Performance:
{'eval_loss': 0.030288292095065117, 'eval_f1': 0.846243072560897, 'eval_precision': 0.8806330472103004, 'eval_recall': 0.8144381046886628, 'eval_accuracy': 0.9930898471574316, 'eval_runtime': 36.6137, 'eval_samples_per_second': 5377.872, 'eval_steps_per_second': 84.039, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 50
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 51...
✅ Loaded label2id from Batch 50: {'B-CL': 0, 'E-CL': 1, 'I-CL': 2, 'O': 3, 'S-CL': 4}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 50 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch51


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.032,0.030538,0.829636,0.838764,0.820704,0.993678
1,0.0152,0.030882,0.825773,0.877095,0.780124,0.993836


Test Set Performance:
{'eval_loss': 0.03280822932720184, 'eval_f1': 0.7859929769801015, 'eval_precision': 0.7697745510126098, 'eval_recall': 0.8029095257074532, 'eval_accuracy': 0.9914514355065044, 'eval_runtime': 34.7911, 'eval_samples_per_second': 5659.611, 'eval_steps_per_second': 88.442, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 51
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 52...
✅ Loaded label2id from Batch 51: {'B-CHEBI': 0, 'I-CHEBI': 1, 'O': 2}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 51 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch52


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0273,0.041521,0.831369,0.893597,0.777244,0.993214
1,0.0166,0.038294,0.83416,0.903316,0.77484,0.993139


Test Set Performance:
{'eval_loss': 0.037297721952199936, 'eval_f1': 0.8160016539177176, 'eval_precision': 0.8713024282560706, 'eval_recall': 0.7673017107309487, 'eval_accuracy': 0.9920392915386658, 'eval_runtime': 36.7866, 'eval_samples_per_second': 5352.606, 'eval_steps_per_second': 83.645, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 52
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 53...
✅ Loaded label2id from Batch 52: {'B-CHEBI': 0, 'E-CHEBI': 1, 'I-CHEBI': 2, 'O': 3, 'S-CHEBI': 4}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 52 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch53


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.088,0.151327,0.663435,0.773988,0.580517,0.964223
1,0.0578,0.16829,0.668959,0.767055,0.593108,0.964575


Test Set Performance:
{'eval_loss': 0.19940562546253204, 'eval_f1': 0.559775938571923, 'eval_precision': 0.6851552268700408, 'eval_recall': 0.47318578420940627, 'eval_accuracy': 0.9440929950316683, 'eval_runtime': 35.0524, 'eval_samples_per_second': 5617.417, 'eval_steps_per_second': 87.783, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 53
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 54...
✅ Loaded label2id from Batch 53: {'B-GGP': 0, 'I-GGP': 1, 'O': 2}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 53 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch54


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.0665,0.21952,0.702198,0.818961,0.614576,0.964418
1,0.0546,0.215806,0.690932,0.80988,0.60245,0.963121


Test Set Performance:
{'eval_loss': 0.30478954315185547, 'eval_f1': 0.5424990456801119, 'eval_precision': 0.7023887973640857, 'eval_recall': 0.44190505804311775, 'eval_accuracy': 0.9378389653733834, 'eval_runtime': 36.8081, 'eval_samples_per_second': 5349.472, 'eval_steps_per_second': 83.596, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 54
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 55...
✅ Loaded label2id from Batch 54: {'B-GGP': 0, 'E-GGP': 1, 'I-GGP': 2, 'O': 3, 'S-GGP': 4}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 54 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch55


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.2285,0.396327,0.74298,0.772982,0.715219,0.919867
1,0.1382,0.366279,0.751679,0.791374,0.715775,0.923692


Test Set Performance:
{'eval_loss': 0.41799476742744446, 'eval_f1': 0.6895043731778425, 'eval_precision': 0.7589046956893786, 'eval_recall': 0.6317335945151812, 'eval_accuracy': 0.9036788409754618, 'eval_runtime': 43.9143, 'eval_samples_per_second': 4483.82, 'eval_steps_per_second': 70.068, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 55
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 56...
✅ Loaded label2id from Batch 55: {'B-CHEBI': 0, 'B-CL': 1, 'B-GGP': 2, 'B-GO': 3, 'B-SO': 4, 'B-Taxon': 5, 'I-CHEBI': 6, 'I-CL': 7, 'I-GGP': 8, 'I-GO': 9, 'I-SO': 10, 'I-Taxon': 11, 'O': 12}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 55 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch56


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
0,0.2194,0.485998,0.771259,0.834147,0.717189,0.92073
1,0.1389,0.449271,0.771028,0.820402,0.727261,0.920137


Test Set Performance:
{'eval_loss': 0.5946175456047058, 'eval_f1': 0.7040060248281225, 'eval_precision': 0.7973530706581554, 'eval_recall': 0.6302248706015398, 'eval_accuracy': 0.8964121818940342, 'eval_runtime': 72.9964, 'eval_samples_per_second': 2697.448, 'eval_steps_per_second': 42.153, 'epoch': 1.999080036798528}
✅ Model trained and saved for batch 56
✅ Saved new_label2id mapping.
✅ Saved new_label2id mapping.
✅ Saved label2id mapping to model directory.
🚀 Processing batch 57...
✅ Loaded label2id from Batch 56: {'B-CHEBI': 0, 'B-CL': 1, 'B-GGP': 2, 'B-GO': 3, 'B-SO': 4, 'B-Taxon': 5, 'E-CHEBI': 6, 'E-CL': 7, 'E-GGP': 8, 'E-GO': 9, 'E-SO': 10, 'E-Taxon': 11, 'I-CHEBI': 12, 'I-CL': 13, 'I-GGP': 14, 'I-GO': 15, 'I-SO': 16, 'I-Taxon': 17, 'O': 18, 'S-CHEBI': 19, 'S-CL': 20, 'S-GGP': 21, 'S-GO': 22, 'S-SO': 23, 'S-Taxon': 24}
After splitting: Train=278226, Dev=100166, Test=196904


Map:   0%|          | 0/278226 [00:00<?, ? examples/s]

Map:   0%|          | 0/100166 [00:00<?, ? examples/s]

Map:   0%|          | 0/196904 [00:00<?, ? examples/s]

✅ Model updated to support new label set while keeping batch 56 training.
Directory ensured: /media/smartdragon/Windows-SSD/Users/sriva/Documents/NLP/BioBERT/biobert-finetuned-batch57


  trainer = Trainer(


OSError: [Errno 5] Input/output error: 'BioBERT-finetuned-mtl'