In [56]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report

from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

import matplotlib.pyplot as plt
import seaborn as sns

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification
)


In [57]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
device


device(type='cpu')

In [58]:
# Name of the fine-tuned checkpoint to evaluate. It is reused in the checkpoint
# directory and in every output file name / plot title further down.
# Other names this notebook has been pointed at: "bert-base", "SciBERT-base".
MODEL_NAME = "RoBERTa-base"

# Directory the tokenizer and model weights are loaded from.
MODEL_DIR = f"../Models/BERT_{MODEL_NAME}"


In [59]:
# Held-out test split; adjust the path if the file lives elsewhere.
test_df = pd.read_csv("../Models/Features/BERT/bert_test.csv")

# Plain Python lists: texts for the tokenizer, labels for scoring.
X_test, y_test = test_df["text"].tolist(), test_df["label"].tolist()

print("Test samples:", len(X_test))


Test samples: 5986


In [60]:
# Tokenizer and classifier are both restored from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR, num_labels=2)

# Move the weights to the selected device, then switch to evaluation mode.
# `eval()` is the cell's last expression, so the module summary is displayed.
model = model.to(device)
model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [61]:
class BertTestDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by index.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=..., ...)``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return its tensors together with the label.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            (the label as a ``torch.long`` tensor).
        """
        tokenizer_kwargs = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # A single text is encoded, so the tensors presumably carry a leading
        # batch axis of size 1; squeeze(0) removes it so that the DataLoader
        # can add its own batch axis.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [62]:
# Wrap the test split (default max_len) and serve it in batches of 8.
# No shuffle argument is passed to the loader.
test_dataset = BertTestDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)


In [63]:
# Accumulators for the predicted and true class ids over the whole test set.
all_preds, all_labels = [], []

# Scoring only: no gradients are needed.
with torch.no_grad():
    for batch in test_loader:
        # Move every tensor of the batch (inputs and labels) to the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Labels are deliberately not passed to the model; only logits are used.
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )

        # Predicted class = index of the largest logit in each row.
        preds = outputs.logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())


In [64]:
# Summary record for this model: every score compares the true labels
# (first argument) against the predictions (second argument).
metrics = {
    "model": MODEL_NAME,
    "accuracy": accuracy_score(all_labels, all_preds),
    "precision": precision_score(all_labels, all_preds),
    "recall": recall_score(all_labels, all_preds),
    "f1_score": f1_score(all_labels, all_preds),
}

# Also expose the individual scores as plain variables for the plotting cell.
accuracy, precision, recall, f1 = (
    metrics[key] for key in ("accuracy", "precision", "recall", "f1_score")
)

metrics


{'model': 'RoBERTa-base',
 'accuracy': 0.9929836284664216,
 'precision': 0.9946470391435263,
 'recall': 0.9913304434811604,
 'f1_score': 0.9929859719438878}

In [65]:
# One-row table holding this model's summary metrics.
metrics_df = pd.DataFrame([metrics])

metrics_path = f"../Models/Evaluation/metrics/BERT_{MODEL_NAME}_metrics.csv"

# to_csv does not create missing parent directories; make sure the folder
# exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

metrics_df.to_csv(metrics_path, index=False)


In [66]:
# Pin the class order to [0, 1] so the matrix is always 2x2 and lines up with
# the two tick labels below, even if one class never occurs in the labels or
# predictions (otherwise the matrix would shrink to 1x1).
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Drug", "Drug"],
    yticklabels=["No Drug", "Drug"]
)

# Rows of the matrix are the true labels, columns the predictions.
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix - {MODEL_NAME}")
plt.tight_layout()

cm_path = f"../Models/Evaluation/confusion_matrices/BERT_{MODEL_NAME}_confusion_matrix.png"

# savefig fails if the target folder is missing; create it on demand.
os.makedirs(os.path.dirname(cm_path), exist_ok=True)

plt.savefig(cm_path, dpi=300)

# The figure is only written to disk; close it so it is not kept open.
plt.close()


In [67]:
values = [accuracy, precision, recall, f1]
labels = ["Accuracy", "Precision", "Recall", "F1"]

plt.figure(figsize=(6, 4))
bars = plt.bar(labels, values)

# Keep the zoomed-in 0.95 floor while every score clears it. If any score is
# lower, drop the floor below the smallest score so that no bar (or its value
# label) is clipped out of view.
lowest = min(values)
y_floor = 0.95 if lowest >= 0.95 else max(0.0, lowest - 0.05)

plt.ylim(y_floor, 1.0)
plt.ylabel("Score")
plt.title(f"Performance Metrics - {MODEL_NAME}")

# Annotate each bar with its value, centred just above the bar top.
for bar, val in zip(bars, values):
    plt.text(
        bar.get_x() + bar.get_width()/2,
        val,
        f"{val:.3f}",
        ha="center",
        va="bottom"
    )

plt.tight_layout()

plot_path = f"../Models/Evaluation/plots/BERT_{MODEL_NAME}_metrics.png"

# savefig fails if the target folder is missing; create it on demand.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

plt.savefig(plot_path, dpi=300)

# The figure is only written to disk; close it so it is not kept open.
plt.close()


In [68]:
# Per-class precision / recall / F1 / support, returned as a nested dict
# instead of the formatted text report.
report = classification_report(y_true=all_labels, y_pred=all_preds, output_dict=True)

# Transpose so that each class / average becomes a row and each metric a column.
report_df = pd.DataFrame(report).T
report_df


Unnamed: 0,precision,recall,f1-score,support
0,0.991325,0.994643,0.992981,2987.0
1,0.994647,0.99133,0.992986,2999.0
accuracy,0.992984,0.992984,0.992984,0.992984
macro avg,0.992986,0.992987,0.992984,5986.0
weighted avg,0.992989,0.992984,0.992984,5986.0


In [69]:
report_path = f"../Models/Evaluation/reports/BERT_{MODEL_NAME}_classification_report.csv"

# to_csv does not create missing parent directories; make sure the folder
# exists so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

# The index (class / average names) is kept as the first CSV column.
report_df.to_csv(report_path)