In [11]:
!pip install transformers datasets seqeval -q

In [None]:
import json
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

In [None]:
with open('labels.json', 'r') as f:
    label_list = json.load(f)

# Add BIO prefixes
unique_labels = ['O'] + [f'B-{l}' for l in label_list] + [f'I-{l}' for l in label_list]
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(unique_labels)
print(label2id)
print(id2label)

In [20]:
def load_ner_data(path):
    with open(path, 'r') as f:
        raw = json.load(f)

    data = []
    for entry in raw:
        text = entry["sentence"]
        tokens = list(text)
        labels = ["O"] * len(tokens)

        for ent in entry.get("entities", []):
            start, end = ent["pos"]
            entity_type = ent["type"]
            if end > len(labels): continue  # Skip corrupt spans
            labels[start] = f"B-{entity_type}"
            for i in range(start+1, end):
                labels[i] = f"I-{entity_type}"

        data.append({"tokens": tokens, "labels": labels})
    return data

train_data = load_ner_data('train.json')
dev_data = load_ner_data('dev.json')
test_data = load_ner_data('test.json')


train_ds = Dataset.from_pandas(pd.DataFrame(train_data))
dev_ds = Dataset.from_pandas(pd.DataFrame(dev_data))
test_ds = Dataset.from_pandas(pd.DataFrame(test_data))

['O', 'B-cell line', 'B-protein', 'B-RNA', 'B-DNA', 'B-cell type', 'I-cell line', 'I-protein', 'I-RNA', 'I-DNA', 'I-cell type']
{'O': 0, 'B-cell line': 1, 'B-protein': 2, 'B-RNA': 3, 'B-DNA': 4, 'B-cell type': 5, 'I-cell line': 6, 'I-protein': 7, 'I-RNA': 8, 'I-DNA': 9, 'I-cell type': 10}
{0: 'O', 1: 'B-cell line', 2: 'B-protein', 3: 'B-RNA', 4: 'B-DNA', 5: 'B-cell type', 6: 'I-cell line', 7: 'I-protein', 8: 'I-RNA', 9: 'I-DNA', 10: 'I-cell type'}


In [21]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
model = AutoModelForTokenClassification.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext", num_labels=len(label2id))

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label2id))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def tokenize_and_align(example):
    tokenized = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)
    word_ids = tokenized.word_ids()
    aligned = []

    for word_id in word_ids:
        if word_id is None:
            aligned.append(-100)
        else:
            label = example["labels"][word_id]
            aligned.append(label2id[label])

    tokenized["labels"] = aligned
    return tokenized

train_ds = train_ds.map(tokenize_and_align)
dev_ds = dev_ds.map(tokenize_and_align)
test_ds = test_ds.map(tokenize_and_align)

Map:   0%|          | 0/15023 [00:00<?, ? examples/s]

Map:   0%|          | 0/1669 [00:00<?, ? examples/s]

Map:   0%|          | 0/1854 [00:00<?, ? examples/s]

In [25]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="tensorboard",  # or "wandb"
    per_device_train_batch_size=16,  # reduce if OOM
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    load_best_model_at_end=True,
    fp16=True,  # Enables Automatic Mixed Precision (for faster training on CUDA)
)

def compute_metrics(p):
    pred, true = np.argmax(p.predictions, axis=-1), p.label_ids
    pred_out, true_out = [], []

    for p_seq, t_seq in zip(pred, true):
        temp_pred, temp_true = [], []
        for p_id, t_id in zip(p_seq, t_seq):
            if t_id != -100:
                temp_pred.append(id2label[p_id])
                temp_true.append(id2label[t_id])
        pred_out.append(temp_pred)
        true_out.append(temp_true)

    return {
        "accuracy": accuracy_score(true_out, pred_out),
        "f1": f1_score(true_out, pred_out),
        "precision": precision_score(true_out, pred_out),
        "recall": recall_score(true_out, pred_out)
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

results = trainer.evaluate(eval_dataset=test_ds)
print("Test Evaluation:", results)

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1827,0.270093,0.917,0.510204,0.468904,0.559482
2,0.1243,0.281469,0.920221,0.539523,0.479927,0.616019
3,0.0832,0.302709,0.923079,0.545288,0.491377,0.612485


Test Evaluation: {'eval_loss': 0.2886393368244171, 'eval_accuracy': 0.9113767881796552, 'eval_f1': 0.49442959001782527, 'eval_precision': 0.454619954927269, 'eval_recall': 0.5418803418803418, 'eval_runtime': 5.329, 'eval_samples_per_second': 347.905, 'eval_steps_per_second': 21.768, 'epoch': 3.0}


In [24]:
model.save_pretrained("./genia_ner_biomedical_model")
tokenizer.save_pretrained("./genia_ner_biomedical_model")

('./genia_ner_biomedical_model/tokenizer_config.json',
 './genia_ner_biomedical_model/special_tokens_map.json',
 './genia_ner_biomedical_model/vocab.txt',
 './genia_ner_biomedical_model/added_tokens.json',
 './genia_ner_biomedical_model/tokenizer.json')

In [27]:
!pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m100.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m109.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h