# NER Fine Tuning

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


Install environment and dependencies

In [None]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=a79fdc95086d653ead70b1aec692e4bd3ff17580f6914e1af549d8994d80da5d
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
!pip install --upgrade transformers datasets

import transformers
print(transformers.__version__)

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's dependency resolver doe

In [None]:
# === 1. Imports & Paths ===
import json, os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
import torch
from seqeval.metrics import classification_report, f1_score, recall_score, precision_score

DRIVE_ROOT = "/content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC"
MODEL_OUTDIR = DRIVE_ROOT

train_path = f"{DRIVE_ROOT}/train.jsonl"
valid_path = f"{DRIVE_ROOT}/valid.jsonl"


Set up NER Fine-Tuning:

In [None]:
# === 2. Label map & Tokenizer ===
label_map = {'B-SYM': 0, 'I-SYM': 1, 'O': 2}
id2label = {v:k for k,v in label_map.items()}
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1", use_fast=True)

def load_jsonl(filename):
    with open(filename, 'r', encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def align_and_encode(examples):
    batch_tokens = examples["tokens"]
    batch_tags = examples["tags"]

    tokenized_inputs = tokenizer(
        batch_tokens,
        is_split_into_words=True,
        padding='max_length',      # pad to max_length
        truncation=True,
        max_length=512,
        return_tensors=None,       # Return as dict of lists, not tensors yet
    )

    labels = []
    for i, tags in enumerate(batch_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx >= len(tags):
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_map.get(tags[word_idx], label_map["O"]))
            else:
                label_ids.append(label_map.get(tags[word_idx], label_map["O"]) if tags[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        # Pad labels to max_length (512)
        while len(label_ids) < 512:
            label_ids.append(-100)
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


train_data = load_jsonl(train_path)
valid_data = load_jsonl(valid_path)

train_ds = Dataset.from_list(train_data)
valid_ds = Dataset.from_list(valid_data)

# Then apply your align_and_encode preprocessing
train_ds = train_ds.map(align_and_encode, batched=True)
train_ds = train_ds.remove_columns(["tokens", "tags", "text"])

valid_ds = valid_ds.map(align_and_encode, batched=True)
valid_ds = valid_ds.remove_columns(["tokens", "tags", "text"])

# Set format for PyTorch tensors
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/98059 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Load Model:

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# === 3. Model with Dropout ===
# Load config and set dropout parameters
config = AutoConfig.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=3,
    hidden_dropout_prob=0.3,  # increase dropout
    attention_probs_dropout_prob=0.3
)

# Instantiate model with the updated config
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    config=config
)

# === 4. Metrics: Macro F1, Recall, Per-label ===
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids

    true_tags = []
    pred_tags = []

    for label_seq, pred_seq in zip(labels, preds):
        true_tag_seq = []
        pred_tag_seq = []
        for l, p in zip(label_seq, pred_seq):
            if l == -100:  # skip ignore tokens
                continue
            true_tag_seq.append(id2label[l])
            pred_tag_seq.append(id2label[p])
        true_tags.append(true_tag_seq)
        pred_tags.append(pred_tag_seq)

    macro_f1 = f1_score(true_tags, pred_tags, average='macro')
    macro_recall = recall_score(true_tags, pred_tags, average='macro')
    macro_prec = precision_score(true_tags, pred_tags, average='macro')
    report = classification_report(true_tags, pred_tags, output_dict=True)

    return {
        'macro_f1': macro_f1,
        'macro_recall': macro_recall,
        'macro_precision': macro_prec,
        'label_f1': {lbl: report[lbl]['f1-score'] for lbl in label_map.keys() if lbl in report},
        'label_precision': {lbl: report[lbl]['precision'] for lbl in label_map.keys() if lbl in report},
        'label_recall': {lbl: report[lbl]['recall'] for lbl in label_map.keys() if lbl in report},
    }


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Set Training Arguments:

In [None]:
# === 5. TrainingArguments ===
training_args = TrainingArguments(
    output_dir=MODEL_OUTDIR,
    per_device_train_batch_size=8,
    fp16=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    logging_dir=f"{DRIVE_ROOT}/logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=1000,                 # Log every 1000 steps
    load_best_model_at_end=True,        # ✅ ensures best checkpoint is restored
    metric_for_best_model="eval_loss",  # ✅ selects the metric to compare
    greater_is_better=False,            # ✅ lower loss is better
    save_total_limit=2,                 # keep only last 2 checkpoints
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    report_to="none",
    seed=42
)

# === 6. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


Train and Save:

In [None]:
# === 8. Train and Save ===
train_result = trainer.train()
trainer.save_model(f"{DRIVE_ROOT}/checkpoint-biobert-ner-best")
tokenizer.save_pretrained(f"{DRIVE_ROOT}/tokenizer_biobert_v1.1")
with open(f"{DRIVE_ROOT}/logs/train_results.txt","w") as f:
    f.write(str(train_result))

print("Training finished. Best checkpoint and logs saved to Drive.")

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

Resume from checkpoint if interrupted:

In [None]:
# Path to your last checkpoint directory; adjust as needed
last_checkpoint_path = f"{DRIVE_ROOT}/checkpoint-12258"

# Resume training from the last checkpoint if exists
train_result = trainer.train(resume_from_checkpoint=last_checkpoint_path)
trainer.save_model(f"{DRIVE_ROOT}/checkpoint-biobert-ner-best")
tokenizer.save_pretrained(f"{DRIVE_ROOT}/tokenizer_biobert_v1.1")

Epoch,Training Loss,Validation Loss,Macro F1,Macro Recall,Macro Precision,Label F1,Label Precision,Label Recall
2,0.0032,0.007777,0.937843,0.970593,0.907231,{},{},{}
3,0.0022,0.005491,0.954128,0.973099,0.935883,{},{},{}


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/logs/train_results.txt'

In [None]:
with open(f"{DRIVE_ROOT}/logs/train_results.txt","w") as f:
    f.write(str(train_result))
print("Training finished. Best checkpoint and logs saved to Drive.")

Training finished. Best checkpoint and logs saved to Drive.


Export model weights in zip:

In [None]:
# Zip the model checkpoint folder into Colab's local /content/ directory
!zip -r /content/model_checkpoint.zip "/content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC"

from google.colab import files
files.download("/content/model_checkpoint.zip")

  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/ (stored 0%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/1_fine_tune_symptom_model_colab.ipynb (deflated 87%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/validation.jsonl (deflated 68%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/auto_labeled.jsonl (deflated 83%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/train.jsonl (deflated 86%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/valid.jsonl (deflated 77%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/BIO tagging (deflated 82%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/tokenizer_biobert_v1.1/ (stored 0%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/tokenizer_biobert_v1.1/tokenizer.json (deflated 70%)
  adding: content/drive/MyDrive/Colab Notebooks/CS425 GenAI for NLC/tokenizer_biobert_