
# Fine-tuning PubMedBERT on MTL-Bioinformatics-2016 with Sentence Splitting

This notebook:
- Loads datasets (`combined_train.jsonl`, `combined_dev.jsonl`, `combined_test.jsonl`)
- Splits long sequences ( > 512 tokens) into smaller chunks before tokenization
- Fine-tunes BioBERT using Hugging Face Transformers
- Uses `Trainer` API for efficient training and evaluation

This fix prevents long sequence errors by ensuring that no input exceeds BioBERT's max length.


In [1]:
import os
import json
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score

# Load BioBERT model
model_checkpoint = "dmis-lab/biobert-base-cased-v1.2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

MAX_LEN = 512

In [2]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return Dataset.from_list(data)

train_dataset = load_jsonl('/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/Batch 1/combined_train_1.jsonl')
dev_dataset = load_jsonl('/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/Batch 1/combined_dev_1.jsonl')
test_dataset = load_jsonl('/media/smartdragon/WORK/6th Semester/22AIE315 - Natural Language Processing/Project/Batch 1/combined_test_1.jsonl')

print(f"Train examples: {len(train_dataset)}")
print(f"Dev examples: {len(dev_dataset)}")
print(f"Test examples: {len(test_dataset)}")


Train examples: 153823
Dev examples: 58785
Test examples: 99976


In [3]:
# Generate label2id mapping
unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2label = {v: k for k, v in label2id.items()}
print(f"Label2ID Mapping: {label2id}")

# Save label2id to file
with open("label2id.json", "w") as f:
    json.dump(label2id, f)
print("✅ Saved label2id mapping.")

Label2ID Mapping: {'B-Anatomy': 0, 'I-Anatomy': 1, 'O': 2}
✅ Saved label2id mapping.


In [4]:

def split_long_sentence(tokens, tags, max_len=MAX_LEN - 2):  # -2 for [CLS] and [SEP]
    chunks = []
    for i in range(0, len(tokens), max_len):
        chunk_tokens = tokens[i:i+max_len]
        chunk_tags = tags[i:i+max_len]
        chunks.append({'tokens': chunk_tokens, 'tags': chunk_tags})
    return chunks

# Apply sentence splitting before tokenization
def preprocess_dataset(dataset):
    split_data = []
    for example in dataset:
        split_sentences = split_long_sentence(example['tokens'], example['tags'])
        split_data.extend(split_sentences)
    return Dataset.from_list(split_data)

train_dataset = preprocess_dataset(train_dataset)
dev_dataset = preprocess_dataset(dev_dataset)
test_dataset = preprocess_dataset(test_dataset)

print(f"After splitting: Train={len(train_dataset)}, Dev={len(dev_dataset)}, Test={len(test_dataset)}")


After splitting: Train=153823, Dev=58785, Test=99976


In [5]:

# Extract label list from training set
unique_tags = set(tag for example in train_dataset['tags'] for tag in example)
label2id = {tag: i for i, tag in enumerate(sorted(unique_tags))}
id2label = {v: k for k, v in label2id.items()}
print(f"Label map: {label2id}")

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example['tokens'], truncation=True, max_length=512, is_split_into_words=True)
    word_ids = tokenized.word_ids()

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(-100)
        elif word_idx != previous_word_idx:
            aligned_labels.append(label2id[example['tags'][word_idx]])
        else:
            aligned_labels.append(label2id[example['tags'][word_idx]])

        previous_word_idx = word_idx

    tokenized['labels'] = aligned_labels
    return tokenized

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_and_align_labels)
dev_dataset = dev_dataset.map(tokenize_and_align_labels)
test_dataset = test_dataset.map(tokenize_and_align_labels)


Label map: {'B-Anatomy': 0, 'I-Anatomy': 1, 'O': 2}


Map:   0%|          | 0/153823 [00:00<?, ? examples/s]

Map:   0%|          | 0/58785 [00:00<?, ? examples/s]

Map:   0%|          | 0/99976 [00:00<?, ? examples/s]

In [6]:
# import torch
# import torch.nn as nn
# from transformers import AutoModelForTokenClassification

# old_label2id = label2id
# # Load the number of labels from batch 1
# old_num_labels = len(old_label2id)  # Should match previous training setup
# new_num_labels = len(label2id)  # Ensure this matches the new dataset

# # Load the previously trained model from batch 1
# model = AutoModelForTokenClassification.from_pretrained(
#     model_path, num_labels=old_num_labels
# )

# # Extract the old classifier layer
# old_classifier = model.classifier

# # Create a new classifier layer with updated label count
# new_classifier = nn.Linear(old_classifier.in_features, new_num_labels)

# # Transfer weights from the old classifier to the new one (for common labels)
# with torch.no_grad():
#     num_common_labels = min(old_num_labels, new_num_labels)
#     new_classifier.weight[:num_common_labels, :] = old_classifier.weight[:num_common_labels, :]
#     new_classifier.bias[:num_common_labels] = old_classifier.bias[:num_common_labels]

# # Assign the updated classifier to the model
# model.classifier = new_classifier

# # Save updated model before continuing training
# model.save_pretrained("BioBERT-updated")
# print("✅ Model updated to support new label set while keeping batch 1 training.")


In [7]:
from transformers import Adafactor

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label2id))
data_collator = DataCollatorForTokenClassification(tokenizer)

training_args = TrainingArguments(
    output_dir="BioBERT-finetuned-mtl1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=3e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    save_total_limit=2,
    gradient_accumulation_steps=8,  # This simulates batch size 4 * 4 = 16
    fp16=True,
)

def compute_metrics(p):
    predictions, labels = p

    # Convert predictions to label IDs
    predictions = np.argmax(predictions, axis=2)

    true_labels = []
    true_predictions = []

    for i in range(len(labels)):
        label_sequence = []
        prediction_sequence = []

        for j in range(len(labels[i])):
            if labels[i][j] != -100:  # Exclude padding tokens
                label_sequence.append(id2label[labels[i][j]])
                prediction_sequence.append(id2label[predictions[i][j]])

        true_labels.append(label_sequence)
        true_predictions.append(prediction_sequence)

    # Classification report for F1, Precision, Recall
    report = classification_report(true_labels, true_predictions, output_dict=True)

    # Overall token-level accuracy
    accuracy = accuracy_score(true_labels, true_predictions)

    return {
        "f1": report["micro avg"]["f1-score"],
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "accuracy": accuracy
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # optimizers=(Adafactor(model.parameters(), scale_parameter=True, relative_step=True), None)
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [8]:

trainer.train()


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.1473,0.131489,0.708837,0.713054,0.704669,0.956488


TrainOutput(global_step=600, training_loss=0.14226804415384928, metrics={'train_runtime': 298.5404, 'train_samples_per_second': 1030.5, 'train_steps_per_second': 2.01, 'total_flos': 1079846168588748.0, 'train_loss': 0.14226804415384928, 'epoch': 1.995008319467554})

In [9]:

metrics = trainer.evaluate(test_dataset)
print("Test Set Performance:")
print(metrics)


Test Set Performance:
{'eval_loss': 0.15424507856369019, 'eval_f1': 0.7271707785966036, 'eval_precision': 0.75311100049776, 'eval_recall': 0.7029580300449125, 'eval_accuracy': 0.9463610919322628, 'eval_runtime': 16.3671, 'eval_samples_per_second': 6108.354, 'eval_steps_per_second': 95.496, 'epoch': 1.995008319467554}


In [13]:
model.save_pretrained("biobert-finetuned-batch1")
tokenizer.save_pretrained("biobert-finetuned-batch1")

('biobert-finetuned-batch1/tokenizer_config.json',
 'biobert-finetuned-batch1/special_tokens_map.json',
 'biobert-finetuned-batch1/vocab.txt',
 'biobert-finetuned-batch1/added_tokens.json',
 'biobert-finetuned-batch1/tokenizer.json')

In [14]:
!mv label2id.json biobert-finetuned-batch1/label2id.json
print("✅ Saved label2id mapping to model directory.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Saved label2id mapping to model directory.
