In [1]:
# -----------------------------
# 0️⃣ Imports
# -----------------------------
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# -----------------------------
# 1️⃣ Load CSV + Snorkel weak labels
# -----------------------------
df = pd.read_csv("/content/sample_data/dataset_with_entities_and_weaklabels.csv")

# Replace invalid weak labels (-1) with Moderate (1)
df["severity_id"] = df["weak_label_id"].apply(lambda x: 1 if x == -1 else int(x))

# Optional sanity check
print("Severity distribution after replacement:")
print(df["severity_id"].value_counts())

# -----------------------------
# 2️⃣ Save JSONL for classifier
# -----------------------------
def save_jsonl(filename, df):
    records = df.to_dict(orient="records")
    with open(filename, "w") as f:
        for rec in records:
            f.write(json.dumps({
                "tokens": rec["symptom_combined"].split(),  # or SYMPTOM_TEXT
                "severity_id": rec["severity_id"]
            }) + "\n")

# Train/val/test split
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["severity_id"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["severity_id"])

save_jsonl("train.jsonl", train_df)
save_jsonl("val.jsonl", val_df)
save_jsonl("test.jsonl", test_df)

print("✅ JSONL splits saved.")


Severity distribution after replacement:
severity_id
1    14296
0     3601
2      525
Name: count, dtype: int64
✅ JSONL splits saved.


In [3]:
# -----------------------------
# 3️⃣ Prepare Hugging Face Dataset
# -----------------------------
def prepare_severity_dataset(jsonl_file):
    texts, labels = [], []
    with open(jsonl_file, "r") as f:
        for line in f:
            item = json.loads(line)
            text = " ".join(item["tokens"])
            severity = item["severity_id"]
            if severity in [0,1,2]:
                texts.append(text)
                labels.append(severity)
    return Dataset.from_dict({"text": texts, "label": labels})

train_ds = prepare_severity_dataset("train.jsonl")
val_ds   = prepare_severity_dataset("val.jsonl")
test_ds  = prepare_severity_dataset("test.jsonl")


In [4]:
from collections import Counter

label_counts = Counter(train_ds["label"])
print("Label distribution (train):", label_counts)

label_counts_val = Counter(val_ds["label"])
print("Label distribution (val):", label_counts_val)


Label distribution (train): Counter({1: 10007, 0: 2521, 2: 367})
Label distribution (val): Counter({1: 2144, 0: 540, 2: 79})


In [5]:
# -----------------------------
# 4️⃣ Tokenization
# -----------------------------
sev_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize_fn(batch):
    return sev_tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds   = val_ds.map(tokenize_fn, batched=True)
test_ds  = test_ds.map(tokenize_fn, batched=True)

# Remove raw text column
train_ds = train_ds.remove_columns(["text"])
val_ds   = val_ds.remove_columns(["text"])
test_ds  = test_ds.remove_columns(["text"])


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/12895 [00:00<?, ? examples/s]

Map:   0%|          | 0/2763 [00:00<?, ? examples/s]

Map:   0%|          | 0/2764 [00:00<?, ? examples/s]

In [6]:
# -----------------------------
# 5️⃣ Define model
# -----------------------------
num_labels = 3  # MILD, MODERATE, SEVERE
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=num_labels
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# -----------------------------
# 6️⃣ Metrics
# -----------------------------
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}


In [8]:
# -----------------------------
# 7️⃣ TrainingArguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./bioBERT_severity_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    report_to=[],
    push_to_hub=False
)


In [9]:
# -----------------------------
# 8️⃣ Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=sev_tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [10]:
# -----------------------------
# 9️⃣ Train
# -----------------------------
trainer.train()

# -----------------------------
# 🔟 Evaluate on test set
# -----------------------------
metrics = trainer.evaluate(test_ds)
print("Test set metrics:", metrics)

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3282,0.184866,0.942454,0.939937
2,0.1664,0.193222,0.942454,0.94133
3,0.1352,0.211521,0.944263,0.942922


Test set metrics: {'eval_loss': 0.23074541985988617, 'eval_accuracy': 0.9417510853835022, 'eval_f1': 0.9408317048810853, 'eval_runtime': 18.9737, 'eval_samples_per_second': 145.675, 'eval_steps_per_second': 9.118, 'epoch': 3.0}


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os


MODEL_PATH = "/content/bioBERT_severity_model/checkpoint-2418"
#print(f"Using model from {MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
from transformers import pipeline
import torch

id2label = {0: "Severe", 1:"Moderate", 2:"Mild" }

clf = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    return_all_scores=True
)

def predict_severity(text):
    preds = clf(text)
    pred_idx = torch.tensor([d["score"] for d in preds[0]]).argmax().item()
    label = id2label.get(pred_idx, "Unknown")
    confidence = preds[0][pred_idx]["score"]
    return label, confidence, preds

examples = [
    "Patient developed high fever and severe headache after vaccination.",
    "Mild pain in the arm for one day.",
    "Critical condition and hospitalized due to allergic reaction.",
    "Moderna vaccine was administered with no immediate side effects.",
    "Slight fatigue for two days, now recovered.",
    "pulmonary embolism, blood clot left lung. put on eliqust 10 mgs a day.",
    "vaccinated pt admitted for covid infection. no vaccine info available",
    "vaccinated pt admitted to inpatient faciility with covid infection"
]

for text in examples:
    label, conf, raw = predict_severity(text)
    print(f"🩸 Text: {text}")
    print(f"→ Predicted: {label} (confidence: {conf:.3f})\n")



Device set to use cuda:0


🩸 Text: Patient developed high fever and severe headache after vaccination.
→ Predicted: Severe (confidence: 0.998)

🩸 Text: Mild pain in the arm for one day.
→ Predicted: Mild (confidence: 0.997)

🩸 Text: Critical condition and hospitalized due to allergic reaction.
→ Predicted: Severe (confidence: 0.997)

🩸 Text: Moderna vaccine was administered with no immediate side effects.
→ Predicted: Moderate (confidence: 0.971)

🩸 Text: Slight fatigue for two days, now recovered.
→ Predicted: Moderate (confidence: 0.938)

🩸 Text: pulmonary embolism, blood clot left lung. put on eliqust 10 mgs a day.
→ Predicted: Severe (confidence: 0.988)

🩸 Text: vaccinated pt admitted for covid infection. no vaccine info available
→ Predicted: Severe (confidence: 0.997)

🩸 Text: vaccinated pt admitted to inpatient faciility with covid infection
→ Predicted: Severe (confidence: 0.997)



In [15]:
# -----------------------------
# 1️⃣1️⃣ Save final model
# -----------------------------
save_path = "bioBERT_severity_model_final"

# Save model
model.save_pretrained(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to '{save_path}'")

✅ Model and tokenizer saved to 'bioBERT_severity_model_final'


In [16]:
# Zip the last checkpoint folder
!zip -r bioBERT_severity_model_final.zip /content/bioBERT_severity_model_final

# Download the zip
from google.colab import files
files.download("bioBERT_severity_model_final.zip")

  adding: content/bioBERT_severity_model_final/ (stored 0%)
  adding: content/bioBERT_severity_model_final/tokenizer.json (deflated 70%)
  adding: content/bioBERT_severity_model_final/config.json (deflated 52%)
  adding: content/bioBERT_severity_model_final/vocab.txt (deflated 49%)
  adding: content/bioBERT_severity_model_final/model.safetensors (deflated 7%)
  adding: content/bioBERT_severity_model_final/tokenizer_config.json (deflated 73%)
  adding: content/bioBERT_severity_model_final/special_tokens_map.json (deflated 80%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>