In [1]:
!pip install transformers datasets seqeval evaluate

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset, DatasetDict
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score
import random
from sklearn.model_selection import train_test_split

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=c625ac50c0f53027bee628661d301288d3a3caeaf96f33f59fd7e84ac8ad3f86
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval, evaluate
Successfully installed evaluate-0.4.6 seqeval-1.2.2


In [2]:
# -------------------------------
# STEP 1: Detect labels from files
# -------------------------------
def collect_labels(files):
    labels = set()
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split()
                labels.add(parts[-1])  # last col = label
    return sorted(list(labels))

label_list = collect_labels(["/content/sample_data/project1.conll"])#, "/content/sample_data/gold.conll"])
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}
print("Detected labels:", label_list)

# -------------------------------
# STEP 2: Reader for your format
# -------------------------------
def read_conll(filepath):
    examples = []
    tokens, tags = [], []

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    examples.append({"tokens": tokens, "ner_tags": [label2id[tag] for tag in tags]})
                    tokens, tags = [], []
                continue

            parts = line.split()
            token, tag = parts[0], parts[-1]   # first col = token, last col = label
            tokens.append(token)
            tags.append(tag)

        if tokens:  # last sentence
            examples.append({"tokens": tokens, "ner_tags": [label2id[tag] for tag in tags]})

    return examples

# -------------------------------
# STEP 3: Build dataset
# -------------------------------
silver_data = read_conll("/content/sample_data/project1.conll")   # weak silver labels
#gold_data   = read_conll("/content/sample_data/gold.conll")     # gold labels

#print("Gold examples:", len(gold_data))
print("Silver examples:", len(silver_data))

random.shuffle(silver_data)
#random.shuffle(gold_data)

# assume you already have silver_data = list of samples (~9000+)

# First split: train vs temp (val+test)
train_data, temp_data = train_test_split(
    silver_data,
    test_size=0.15,   # 15% goes to val+test
    random_state=42,  # reproducibility
    shuffle=True
)

# Second split: val vs test
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,    # half of 15% → 7.5% test, 7.5% val
    random_state=42,
    shuffle=True
)

print(f"Train: {len(train_data)}")
print(f"Val:   {len(val_data)}")
print(f"Test:  {len(test_data)}")


#train_data = silver_data + gold_data[:350]  # mix weak + gold
#val_data   = gold_data[350:425]  # ~75 examples
#test_data  = gold_data[425:]     # ~75 examples

dataset = DatasetDict({
   "train": Dataset.from_list(train_data),
   "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data),
})

#print(dataset)



Detected labels: ['B-ADE', 'B-DRUG', 'I-ADE', 'I-DRUG', 'O']
Silver examples: 18425
Train: 15661
Val:   1382
Test:  1382


In [3]:
from collections import Counter

def print_label_counts(dataset, split_name, key="ner_tags"):
    """
    Print label counts for a dataset split.
    - dataset: HuggingFace Dataset
    - split_name: "train", "validation", or "test"
    - key: column containing labels ("ner_tags" before tokenization, "labels" after)
    """
    counts = Counter()
    for labels in dataset[split_name][key]:
        counts.update(labels)

    # remove padding ignore index (-100) if present
    if -100 in counts:
        del counts[-100]

    print(f"\n🔹 {split_name} counts (from '{key}'):")
    for label_id, cnt in sorted(counts.items()):
        print(f"  {label_id:>2} : {cnt}")


In [4]:
from datasets import Dataset, DatasetDict
from collections import Counter
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer
)
import evaluate
import numpy as np

In [None]:
# -----------------------------
# 3️⃣ Load your dataset
# -----------------------------
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data),
})

label_column = "ner_tags"

# Original 5-label scheme
#orig_label_list = ["O", "B-ADE", "I-ADE", "B-DRUG", "I-DRUG"]

In [5]:
for split in ["train", "validation", "test"]:
    print_label_counts(dataset, split, key="ner_tags")



🔹 train counts (from 'ner_tags'):
   0 : 160609
   1 : 19189
   2 : 87646
   3 : 8335
   4 : 1851776

🔹 validation counts (from 'ner_tags'):
   0 : 5540
   1 : 1639
   2 : 1269
   3 : 679
   4 : 167814

🔹 test counts (from 'ner_tags'):
   0 : 5586
   1 : 1602
   2 : 1195
   3 : 660
   4 : 158591


In [6]:
import torch
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from collections import Counter

# ------------------------
# 1. Define labels
# ------------------------
label_list = ["B-ADE", "B-DRUG","I-ADE","I-DRUG","O"]
num_labels = len(label_list)
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}

# ------------------------
# 2. Compute class weights from dataset
# ------------------------
def compute_class_weights(dataset, label_column="ner_tags"):
    counts = Counter()
    for split in ["train"]:
        for seq in dataset[split][label_column]:
            counts.update(seq)
    total = sum(counts.values())
    weights = []
    for i in range(num_labels):
        # weight = total / (num_labels * class_count)
        weights.append(total / (num_labels * counts[i]) if counts[i] > 0 else 1.0)
    return torch.tensor(weights, dtype=torch.float)

class_weights = compute_class_weights(dataset)
print("Class weights:", class_weights)

Class weights: tensor([ 2.6494, 22.1747,  4.8549, 51.0511,  0.2298])


In [7]:
weights = torch.tensor([ 2.6494, 22.1747,  4.8549, 51.0511,  0.2298])
weights = weights / weights.max()  # normalize so max = 1
print("weights",weights)

weights tensor([0.0519, 0.4344, 0.0951, 1.0000, 0.0045])


In [8]:
print("I-DRUG / O weight ratio:", weights[3]/weights[4])


I-DRUG / O weight ratio: tensor(222.1545)


In [9]:
model_name = "dmis-lab/biobert-base-cased-v1.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
num_labels = len(label_list)  # e.g., 5 labels
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    return_dict=True  # ensures outputs.logits exists
)


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Freeze all parameters
for param in model.bert.parameters():
    param.requires_grad = False

# Unfreeze the last 4 encoder layers
for layer in model.bert.encoder.layer[-4:]:
    for param in layer.parameters():
        param.requires_grad = True

# Classifier head is always trainable
for param in model.classifier.parameters():
    param.requires_grad = True


for name, param in model.named_parameters():
    print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # for subword tokens: keep same label if not O, else O
                label_ids.append(label[word_idx] if label[word_idx] != label2id["O"] else label2id["O"])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/15661 [00:00<?, ? examples/s]

Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

Map:   0%|          | 0/1382 [00:00<?, ? examples/s]

In [12]:
# Normalized weights tensor
weights = torch.tensor([0.0519, 0.4344, 0.0951, 1.0000, 0.0045]).to("cuda")

# Overwrite model's forward via Trainer
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fct = nn.CrossEntropyLoss(weight=weights, ignore_index=-100)
    loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
    return (loss, outputs) if return_outputs else loss


In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer)


In [14]:
from seqeval.metrics import classification_report
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    report = classification_report(true_labels, true_preds, output_dict=True)
    return {
        "precision": report["micro avg"]["precision"],
        "recall": report["micro avg"]["recall"],
        "f1": report["micro avg"]["f1-score"]
    }

In [15]:
training_args = TrainingArguments(
    output_dir="./ner_biobert",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    logging_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=[],
    push_to_hub=False
)


In [16]:
from transformers import Trainer

class WeightedTrainer(Trainer):
    def __init__(self, weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = weights.to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.CrossEntropyLoss(weight=self.weights, ignore_index=-100)
        loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss



In [17]:
trainer = WeightedTrainer(
    weights=weights,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  super().__init__(*args, **kwargs)


In [18]:
trainer.train()

metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)


Step,Training Loss,Validation Loss,Precision,Recall,F1
500,0.0652,0.028726,0.754336,0.989322,0.855995
1000,0.0327,0.015939,0.863754,0.995225,0.924841
1500,0.0143,0.011493,0.919909,0.995623,0.95627
2000,0.0114,0.011373,0.898931,0.998077,0.945913
2500,0.0072,0.009939,0.926613,0.997347,0.96068
3000,0.0182,0.009903,0.954753,0.997811,0.975807
3500,0.0063,0.007644,0.948122,0.997546,0.972206
4000,0.0045,0.009916,0.961541,0.998209,0.979532
4500,0.0053,0.008835,0.964618,0.998077,0.981062


{'eval_loss': 0.0056439051404595375, 'eval_precision': 0.9626585101637672, 'eval_recall': 0.9974909210960713, 'eval_f1': 0.9797652247227446, 'eval_runtime': 43.0514, 'eval_samples_per_second': 32.101, 'eval_steps_per_second': 2.021, 'epoch': 5.0}


In [20]:
save_path = "biobert-ner-final"

# Save model
model.save_pretrained(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to '{save_path}'")


✅ Model and tokenizer saved to 'biobert-ner-final'


In [19]:
from seqeval.metrics import classification_report

preds_output = trainer.predict(tokenized_datasets["validation"])
preds = np.argmax(preds_output.predictions, axis=2)

true_labels = [[id2label[l] for l in label if l != -100] for label in preds_output.label_ids]
true_preds = [
    [id2label[p] for (p, l) in zip(pred, label) if l != -100]
    for pred, label in zip(preds, preds_output.label_ids)
]

print(classification_report(true_labels, true_preds, digits=3))


              precision    recall  f1-score   support

         ADE      0.961     0.998     0.979     10221
        DRUG      0.962     0.999     0.980      4857

   micro avg      0.962     0.998     0.980     15078
   macro avg      0.962     0.998     0.980     15078
weighted avg      0.962     0.998     0.980     15078



In [21]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import re

# -----------------------------
# 1️⃣ Load model & tokenizer
# -----------------------------
model_path = "biobert-ner-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -----------------------------
# 2️⃣ Input sentence
# -----------------------------
sentence = "moderna shot was administered and got fever, headache"

# Split into words/punctuation
tokens = re.findall(r"\w+|[^\w\s]", sentence)

# -----------------------------
# 3️⃣ Tokenize
# -----------------------------
encoded = tokenizer(
    tokens,
    is_split_into_words=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
).to(device)

# -----------------------------
# 4️⃣ Predict
# -----------------------------
with torch.no_grad():
    outputs = model(**encoded)
    predictions = torch.argmax(outputs.logits, dim=-1)[0].cpu().numpy()

# -----------------------------
# 5️⃣ Align predictions to words
# -----------------------------
word_ids = encoded.word_ids(batch_index=0)

label_list = ["B-ADE","B-DRUG", "I-ADE",  "I-DRUG", "O"]
id2label = {i: l for i, l in enumerate(label_list)}

pred_labels = []
previous_word_idx = None
for idx, word_idx in enumerate(word_ids):
    if word_idx is None or word_idx == previous_word_idx:
        continue
    pred_labels.append((tokens[word_idx], id2label[predictions[idx]]))
    previous_word_idx = word_idx

print("🔹 Token-level predictions:")
print(pred_labels)

import string

entities = {"DRUG": [], "ADE": []}
current_entity = None
current_words = []

for word, label in pred_labels:
    # Skip punctuation-only tokens
    if all(ch in string.punctuation for ch in word):
        continue

    if label.startswith("B-"):
        if current_entity and current_words:
            entities[current_entity].append(" ".join(current_words))
        current_entity = label.split("-")[1]
        current_words = [word]

    elif label.startswith("I-") and current_entity == label.split("-")[1]:
        current_words.append(word)

    elif label.startswith("B-") and current_entity == label.split("-")[1]:
        # Handle consecutive B-XXX (merge case)
        current_words.append(word)

    else:
        if current_entity and current_words:
            entities[current_entity].append(" ".join(current_words))
        current_entity = None
        current_words = []

# Add last
if current_entity and current_words:
    entities[current_entity].append(" ".join(current_words))

print("\n🔹 Entity-level predictions:")
for ent_type, ent_list in entities.items():
    print(f"{ent_type}: {', '.join(ent_list) if ent_list else 'None'}")

🔹 Token-level predictions:
[('moderna', 'B-DRUG'), ('shot', 'O'), ('was', 'O'), ('administered', 'O'), ('and', 'O'), ('got', 'O'), ('fever', 'B-ADE'), (',', 'O'), ('headache', 'B-ADE')]

🔹 Entity-level predictions:
DRUG: moderna
ADE: fever, headache


In [22]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import re

# -----------------------------
# 1️⃣ Load model & tokenizer
# -----------------------------
model_path = "biobert-ner-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
model.eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -----------------------------
# 2️⃣ Post-processing dictionary
# -----------------------------
POSTPROCESS_DICT = {
    "DRUG": {
        "pfizer", "moderna", "astrazeneca", "covaxin",
        "janssen", "johnson", "johnson and johnson", "biontech"
    },
    "ADE": {
        "fever", "headache", "dizziness", "nausea",
        "rash", "fatigue", "chills", "itching", "sweating",
        "chest pain"
    }
}

def normalize(text):
    """Lowercase, remove punctuation, collapse spaces."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def postprocess_entities(text, entities):
    """Dictionary-based fuzzy mapping."""
    new_entities = {"DRUG": list(entities["DRUG"]), "ADE": list(entities["ADE"])}
    text_norm = normalize(text)

    for ent_type, vocab in POSTPROCESS_DICT.items():
        for word in vocab:
            word_norm = normalize(word)
            if word_norm in text_norm and not any(word_norm in normalize(e) for e in new_entities[ent_type]):
                new_entities[ent_type].append(word)
    return new_entities

def clean_entities(entities):
    """DRUG/ADE cleanup for unrealistic spans."""
    cleaned = {"DRUG": [], "ADE": []}

    # Clean ADE
    for ade in entities.get("ADE", []):
        ade = ade.strip("., ")
        if ade and ade.lower() not in ["and", "reported", "later", "severe"]:
            cleaned["ADE"].append(ade)

    # Clean DRUG
    for drug in entities.get("DRUG", []):
        drug = re.sub(r"\band\b.*", "", drug)
        drug = drug.strip("., ")
        if re.search(r"[A-Z]", drug) and len(drug.split()) <= 5:
            cleaned["DRUG"].append(drug)

    return cleaned

# -----------------------------
# 3️⃣ Predict function
# -----------------------------
def predict_entities(sentences):
    id2label = {0:"B-ADE", 1:"B-DRUG", 2:"I-ADE", 3:"I-DRUG", 4:"O"}

    for sent in sentences:
        print("\n==============================")
        print("Sentence:", sent)

        # Tokenize
        tokens = re.findall(r"\w+|[^\w\s]", sent)
        encoded = tokenizer(
            tokens,
            is_split_into_words=True,
            truncation=True,
            max_length=512,
            return_tensors=None
        )
        inputs = {k: torch.tensor([v]).to(device) for k, v in encoded.items()}

        # Predict
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)[0].cpu().numpy()

        # Align tokens with predictions
        word_ids = encoded.word_ids(batch_index=0)
        pred_labels = []
        prev_word_idx = None
        for idx, word_idx in enumerate(word_ids):
            if word_idx is None or word_idx == prev_word_idx:
                continue
            pred_labels.append((tokens[word_idx], predictions[idx]))
            prev_word_idx = word_idx

        pred_labels_named = [(word, id2label[label]) for word, label in pred_labels]
        print("🔹 Token-level predictions:")
        print(pred_labels_named)

        # Merge contiguous entities
        entities = {"DRUG": [], "ADE": []}
        current_entity = None
        current_words = []

        for word, label in pred_labels_named:
            if label in ["B-DRUG", "I-DRUG"]:
                if current_entity == "DRUG":
                    current_words.append(word)
                else:
                    if current_entity and current_words:
                        entities[current_entity].append(" ".join(current_words))
                    current_entity = "DRUG"
                    current_words = [word]
            elif label in ["B-ADE", "I-ADE"]:
                if current_entity == "ADE":
                    current_words.append(word)
                else:
                    if current_entity and current_words:
                        entities[current_entity].append(" ".join(current_words))
                    current_entity = "ADE"
                    current_words = [word]
            else:
                if current_entity and current_words:
                    entities[current_entity].append(" ".join(current_words))
                current_entity = None
                current_words = []

        if current_entity and current_words:
            entities[current_entity].append(" ".join(current_words))

        print("\n🔹 Entity-level predictions (raw model):")
        for ent_type, ent_list in entities.items():
            print(f"{ent_type}: {', '.join(ent_list) if ent_list else 'None'}")

        # Post-process
        entities_clean = clean_entities(entities)
        entities_post = postprocess_entities(sent, entities_clean)

        print("\n🔹 Entity-level predictions (post-processed, fuzzy match):")
        for ent_type, ent_list in entities_post.items():
            print(f"{ent_type}: {', '.join(ent_list) if ent_list else 'None'}")


# -----------------------------
# 4️⃣ Example usage
# -----------------------------
sentences = [
    "After taking AstraZeneca vaccine, the patient experienced nausea and chest pain.",
    "He was given Covaxin but developed rash and severe itching.",
    "The subject reported fatigue, dizziness, and fever following the Pfizer booster.",
    "Moderna shot was administered without immediate side effects.",
    "Patient got Pfizer-BioNTech vaccine and later reported severe dizziness, fatigue, and rash."
]

predict_entities(sentences)



Sentence: After taking AstraZeneca vaccine, the patient experienced nausea and chest pain.
🔹 Token-level predictions:
[('After', 'O'), ('taking', 'O'), ('AstraZeneca', 'O'), ('vaccine', 'O'), (',', 'O'), ('the', 'O'), ('patient', 'O'), ('experienced', 'O'), ('nausea', 'B-ADE'), ('and', 'O'), ('chest', 'B-ADE'), ('pain', 'I-ADE'), ('.', 'I-ADE')]

🔹 Entity-level predictions (raw model):
DRUG: None
ADE: nausea, chest pain .

🔹 Entity-level predictions (post-processed, fuzzy match):
DRUG: astrazeneca
ADE: nausea, chest pain

Sentence: He was given Covaxin but developed rash and severe itching.
🔹 Token-level predictions:
[('He', 'O'), ('was', 'O'), ('given', 'O'), ('Covaxin', 'B-DRUG'), ('but', 'O'), ('developed', 'O'), ('rash', 'B-ADE'), ('and', 'O'), ('severe', 'O'), ('itching', 'O'), ('.', 'O')]

🔹 Entity-level predictions (raw model):
DRUG: Covaxin
ADE: rash

🔹 Entity-level predictions (post-processed, fuzzy match):
DRUG: Covaxin
ADE: rash, itching

Sentence: The subject reported fati

In [23]:
# Zip the last checkpoint folder
!zip -r checkpoint-4895.zip /content/ner_biobert/checkpoint-4895

# Download the zip
from google.colab import files
files.download("checkpoint-4895.zip")

  adding: content/ner_biobert/checkpoint-4895/ (stored 0%)
  adding: content/ner_biobert/checkpoint-4895/optimizer.pt (deflated 8%)
  adding: content/ner_biobert/checkpoint-4895/tokenizer.json (deflated 70%)
  adding: content/ner_biobert/checkpoint-4895/model.safetensors (deflated 7%)
  adding: content/ner_biobert/checkpoint-4895/config.json (deflated 51%)
  adding: content/ner_biobert/checkpoint-4895/rng_state.pth (deflated 26%)
  adding: content/ner_biobert/checkpoint-4895/tokenizer_config.json (deflated 74%)
  adding: content/ner_biobert/checkpoint-4895/special_tokens_map.json (deflated 42%)
  adding: content/ner_biobert/checkpoint-4895/training_args.bin (deflated 53%)
  adding: content/ner_biobert/checkpoint-4895/trainer_state.json (deflated 76%)
  adding: content/ner_biobert/checkpoint-4895/scheduler.pt (deflated 62%)
  adding: content/ner_biobert/checkpoint-4895/vocab.txt (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:

# Zip the last checkpoint folder
!zip -r biobert-ner-final.zip /content/biobert-ner-final

# Download the zip
from google.colab import files
files.download("biobert-ner-final.zip")


  adding: content/biobert-ner-final/ (stored 0%)
  adding: content/biobert-ner-final/tokenizer.json (deflated 70%)
  adding: content/biobert-ner-final/model.safetensors (deflated 7%)
  adding: content/biobert-ner-final/config.json (deflated 51%)
  adding: content/biobert-ner-final/tokenizer_config.json (deflated 74%)
  adding: content/biobert-ner-final/special_tokens_map.json (deflated 42%)
  adding: content/biobert-ner-final/vocab.txt (deflated 49%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')