In [1]:
!pip uninstall -y transformers datasets accelerate evaluate huggingface_hub
!pip install --no-cache-dir torch torchvision torchaudio
!pip install --no-cache-dir transformers==4.57.1 datasets accelerate evaluate

Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: datasets 4.4.1
Uninstalling datasets-4.4.1:
  Successfully uninstalled datasets-4.4.1
Found existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0
Found existing installation: evaluate 0.4.6
Uninstalling evaluate-0.4.6:
  Successfully uninstalled evaluate-0.4.6
Found existing installation: huggingface-hub 0.36.0
Uninstalling huggingface-hub-0.36.0:
  Successfully uninstalled huggingface-hub-0.36.0
Collecting transformers==4.57.1
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m161.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.

In [2]:

import transformers
print("Transformers version:", transformers.__version__)
print("Transformers file:", transformers.__file__)


Transformers version: 4.57.1
Transformers file: /usr/local/lib/python3.12/dist-packages/transformers/__init__.py


In [3]:
# from transformers import TrainingArguments
# print("TrainingArguments object:", TrainingArguments)
# help(TrainingArguments)

In [4]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

CUDA available: True
GPU: Tesla T4


In [14]:
from torch import nn
from transformers import Trainer
from transformers import DataCollatorWithPadding
import evaluate
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
import pandas as pd
from datasets import load_dataset, DatasetDict


In [15]:
# 1️⃣ Load the ADE dataset
dataset = load_dataset("SetFit/ade_corpus_v2_classification")

# 2️⃣ Load your mislabeled data
mislabelled_df = pd.read_csv("ade_mislabeled_candidates.csv")
mislabelled_texts = set(mislabelled_df["text"].tolist())

# 3️⃣ Define a filter function
def filter_mislabelled(example):
    return example["text"] not in mislabelled_texts

# 4️⃣ Apply the filter to every split
filtered_dataset = {}
for split in dataset.keys():
    filtered_dataset[split] = dataset[split].filter(filter_mislabelled)
    print(f"{split}: {len(dataset[split])} → {len(filtered_dataset[split])} samples after filtering")

Repo card metadata block was not found. Setting CardData to empty.


train: 17637 → 14170 samples after filtering
test: 5879 → 5768 samples after filtering


In [18]:
# ✅ GPU check
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

# Load dataset
dataset =  DatasetDict(filtered_dataset)


# Tokenizer + Model
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# def tokenize(batch):
#     return tokenizer(batch["text"], truncation=True, padding=True)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)


dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Define metrics

# acc = evaluate.load("accuracy")
# def compute_metrics(p):
#     preds = torch.argmax(torch.tensor(p.predictions), dim=-1)
#     return acc.compute(predictions=preds, references=p.label_ids)

acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(p):
    preds = torch.argmax(torch.tensor(p.predictions), dim=-1)
    return {
        "accuracy": acc.compute(predictions=preds, references=p.label_ids)["accuracy"],
        "precision": precision.compute(predictions=preds, references=p.label_ids)["precision"],
        "recall": recall.compute(predictions=preds, references=p.label_ids)["recall"],
        "f1": f1.compute(predictions=preds, references=p.label_ids)["f1"],
    }


# Training setup
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    report_to="none"
)


#### weighted loss section ####

# 1️⃣ compute class weights from training labels
train_labels = dataset["train"]["label"]
class_counts = torch.bincount(torch.tensor(train_labels))
weights = 1.0 / class_counts.float()
weights = weights / weights.sum()
print("Class weights:", weights)

# 2️⃣ subclass Trainer to apply weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=weights.to(model.device))
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1)
        )

        return (loss, outputs) if return_outputs else loss

# 3️⃣ use the new trainer instead of plain Trainer
weighted_trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,   # your padding collator
)


GPU: Tesla T4


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/14170 [00:00<?, ? examples/s]

Map:   0%|          | 0/5768 [00:00<?, ? examples/s]

Class weights: tensor([0.3006, 0.6994])


In [19]:
# 4️⃣ train normally
weighted_trainer.train()

###############################

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=dataset["train"],
#     eval_dataset=dataset["test"],
#     compute_metrics=compute_metrics,
#     data_collator=data_collator,
# )

# trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1657,0.32177,0.918689,0.79087,0.952077,0.864019
2,0.0845,0.35385,0.937067,0.849825,0.932907,0.88943
3,0.0284,0.439418,0.947469,0.905006,0.900958,0.902978
4,0.0124,0.579083,0.929958,0.827781,0.936741,0.878897
5,0.0075,0.613452,0.944348,0.882534,0.916933,0.899405
6,0.0019,0.631532,0.941401,0.871143,0.920128,0.894966


TrainOutput(global_step=10632, training_loss=0.055776304469550814, metrics={'train_runtime': 5031.0942, 'train_samples_per_second': 16.899, 'train_steps_per_second': 2.113, 'total_flos': 1.11848509633536e+16, 'train_loss': 0.055776304469550814, 'epoch': 6.0})

In [None]:
weighted_trainer.save_model("bio_ae_detector")
tokenizer.save_pretrained("bio_ae_detector")


In [20]:
text = "The patient experienced severe dizziness and nausea after taking Drug ibuprofen."
inputs = tokenizer(text, return_tensors="pt").to("cuda")
with torch.no_grad():
    logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
print("Probabilities:", probs)
print("Predicted label:", torch.argmax(probs, dim=-1).item())


Probabilities: tensor([[2.5540e-06, 1.0000e+00]], device='cuda:0')
Predicted label: 1


In [22]:
samples = [
    "The patient experienced severe dizziness and nausea after taking Drug X.",
    "Patient was given Drug X and felt fine.",
    "Patient continued regular therapy without complications.",
    "RESULTS: Evidence of neurological improvement and rehabilitation potential after severe myelopathy due to intrathecal injection of doxorubicin."
]

for s in samples:
    inputs = tokenizer(s, return_tensors="pt").to("cuda")
    with torch.no_grad():
        probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)
    print(f"{s}\n→ label: {torch.argmax(probs).item()}, probs: {probs.cpu().numpy()}\n")


The patient experienced severe dizziness and nausea after taking Drug X.
→ label: 1, probs: [[0.08926277 0.9107373 ]]

Patient was given Drug X and felt fine.
→ label: 0, probs: [[9.9999833e-01 1.6480784e-06]]

Patient continued regular therapy without complications.
→ label: 0, probs: [[9.9999881e-01 1.1866414e-06]]

RESULTS: Evidence of neurological improvement and rehabilitation potential after severe myelopathy due to intrathecal injection of doxorubicin.
→ label: 1, probs: [[8.9846535e-06 9.9999106e-01]]



In [None]:
from datasets import load_dataset
from collections import Counter

ds = load_dataset("SetFit/ade_corpus_v2_classification")
train_labels = ds["train"]["label"]
test_labels = ds["test"]["label"]

print("Train label counts:", Counter(train_labels))
print("Test label counts:", Counter(test_labels))



In [None]:
!zip -r bio_ae_detector.zip bio_ae_detector

In [None]:
from google.colab import files
files.download("bio_ae_detector.zip")