In [None]:
#load tsv data

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os


# Read both tab-separated splits of the drug review data.
drug_train_data = pd.read_csv("data/drugLibTrain_raw.tsv", delimiter="\t")
drug_test_data = pd.read_csv("data/drugLibTest_raw.tsv", delimiter="\t")

# Stack the train rows on top of the test rows and renumber them 0..N-1.
drug_data = pd.concat(
    (drug_train_data, drug_test_data),
    axis=0,
    ignore_index=True,
)

# Preview the first five rows as plain text.
print(drug_data.head(5))

   Unnamed: 0       urlDrugName  rating         effectiveness  \
0        2202         enalapril       4      Highly Effective   
1        3117  ortho-tri-cyclen       1      Highly Effective   
2        1146           ponstel      10      Highly Effective   
3        3947          prilosec       3  Marginally Effective   
4        1951            lyrica       2  Marginally Effective   

           sideEffects                               condition  \
0    Mild Side Effects  management of congestive heart failure   
1  Severe Side Effects                        birth prevention   
2      No Side Effects                        menstrual cramps   
3    Mild Side Effects                             acid reflux   
4  Severe Side Effects                            fibromyalgia   

                                      benefitsReview  \
0  slowed the progression of left ventricular dys...   
1  Although this type of birth control has more c...   
2  I was used to having cramps so badly that

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
missing_per_column = drug_data.isna().sum(axis=0)
print(missing_per_column)

Unnamed: 0            0
urlDrugName           0
rating                0
effectiveness         0
sideEffects           0
condition             1
benefitsReview       23
sideEffectsReview    98
commentsReview       13
dtype: int64


In [None]:
# Discard every row that has at least one missing field.
drug_data = drug_data.dropna(axis=0, how="any")

In [None]:
# Persist the cleaned frame as CSV so it can be inspected outside the notebook.
output_path = "data/cleaned_drug_data.csv"

# index=False keeps the pandas row index out of the file.
drug_data.to_csv(path_or_buf=output_path, index=False)

# Read the file straight back; as the cell's last expression it is displayed.
pd.read_csv(filepath_or_buffer=output_path)

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above
...,...,...,...,...,...,...,...,...,...
4008,690,accutane,7,Considerably Effective,Severe Side Effects,acne vulgaris,Detoxing effect by pushing out the system thro...,"Hairloss, extreme dry skin, itchiness, raises ...",Treatment period is 3 months/12 weeks. Dosage ...
4009,1071,proair-hfa,10,Highly Effective,No Side Effects,asthma,"The albuterol relieved the constriction, irrit...",I have experienced no side effects.,I use the albuterol as needed because of aller...
4010,681,accutane,8,Considerably Effective,Moderate Side Effects,serve acne,Serve Acne has turned to middle,"Painfull muscles, problems with seeing at night","This drug is highly teratogenic ,females must ..."
4011,2709,divigel,10,Highly Effective,No Side Effects,menopause,"My overall mood, sense of well being, energy l...",No side effects of any kind were noted or appa...,Divigel is a topically applied Bio-Identical H...


In [None]:
# Merge the three free-text review fields (benefits, side effects, comments)
# into a single "text" column and keep only the columns used afterwards.

# Drop rows without a drug name or rating, and rows whose drug name contains
# "unnamed" (case-insensitive). The trailing .copy() makes the filtered result
# an independent frame, so adding the "text" column below does not raise
# pandas' SettingWithCopyWarning.
drug_data = drug_data.dropna(subset=["urlDrugName", "rating"])
drug_data = drug_data[
    ~drug_data["urlDrugName"].str.lower().str.contains("unnamed", na=False)
].copy()

# Join the three review fields with single spaces, collapse whitespace runs
# and trim the ends. Missing fields are treated as empty strings.
drug_data["text"] = (
    drug_data[["benefitsReview", "sideEffectsReview", "commentsReview"]]
    .fillna("")
    .agg(" ".join, axis=1)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Keep only rows whose combined text is longer than 10 characters.
# (This filter used to be applied twice in a row; once is enough.)
drug_data = drug_data[drug_data["text"].str.len() > 10]

# Keep only what we need
drug_data = drug_data[["urlDrugName", "rating", "effectiveness", "sideEffects", "condition", "text"]]


In [None]:
# Report how many reviews pass the text-length and missing-value filters.
# NOTE(review): this cell reloads the raw TSV files and rebinds `drug_data`,
# so rows and columns removed by the earlier cleaning cells are present again
# from here on — confirm that this is intended.
import pandas as pd

# ignore_index=True renumbers the rows; without it each file keeps its own
# 0..n index and the combined frame carries duplicate index labels.
drug_data = pd.concat(
    [
        pd.read_csv("data/drugLibTrain_raw.tsv", sep="\t"),
        pd.read_csv("data/drugLibTest_raw.tsv", sep="\t"),
    ],
    ignore_index=True,
)

# Join the three review fields with spaces; missing fields count as empty.
drug_data["text"] = (
    drug_data[["benefitsReview", "sideEffectsReview", "commentsReview"]]
    .fillna("")
    .agg(" ".join, axis=1)
    .str.strip()
)

# Count on `drug_data` itself: the previous version referenced `df`, a name
# this notebook never defines, which raises NameError on a fresh kernel.
n_clean_reviews = (
    drug_data[drug_data["text"].str.len() > 10]
    .dropna(subset=["urlDrugName", "rating"])
    .shape[0]
)
print(f"Final dataset size after cleaning: {n_clean_reviews:,} reviews")



Final dataset size after cleaning: 4,142 reviews


In [None]:
#map out sentiment based on rating

def map_sentiment(r):
    """Bucket a numeric rating into a sentiment label.

    Ratings up to and including 3 map to "negative", ratings above 3 and up
    to 6 map to "neutral", and anything that passes neither test maps to
    "positive".
    """
    # Walk the upper bounds in ascending order; the first one that holds wins.
    for upper_bound, label in ((3, "negative"), (6, "neutral")):
        if r <= upper_bound:
            return label
    return "positive"

# Label each review by passing its rating through map_sentiment.
drug_data["sentiment"] = drug_data["rating"].map(map_sentiment)

# Share of reviews per sentiment class, rounded to three decimals.
sentiment_shares = drug_data["sentiment"].value_counts(normalize=True)
sentiment_dist = sentiment_shares.round(3)
print(sentiment_dist)

sentiment
positive    0.676
negative    0.181
neutral     0.143
Name: proportion, dtype: float64


In [None]:
#split x and y into train and test sets

from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews, keeping the sentiment proportions equal in
# both parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    drug_data.loc[:, "text"],
    drug_data.loc[:, "sentiment"],
    stratify=drug_data.loc[:, "sentiment"],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Optional: TF-IDF Baseline Models (for comparison only)
# Note: These are kept as optional baselines but are not the main modeling approach

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# The baselines are off by default; set this to True to train and score them.
# A real flag replaces the earlier trick of wrapping the code in a
# triple-quoted string, which left the code unchecked and made the notebook
# echo that string as the cell's output value.
RUN_TFIDF_BASELINES = False

print("=" * 60)
print("OPTIONAL BASELINE: TF-IDF Models")
print("=" * 60)
print("Note: Main pipeline uses ClinicalBERT embeddings → PCA → ANN")
print("=" * 60)

if RUN_TFIDF_BASELINES:
    # Two linear classifiers on identical word uni-/bi-gram TF-IDF features.
    pipelines = {
        "TFIDF+LinearSVC": Pipeline([
            ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
            ("clf", LinearSVC())
        ]),
        "TFIDF+LogReg": Pipeline([
            ("tfidf", TfidfVectorizer(ngram_range=(1,2), min_df=3, max_df=0.9, sublinear_tf=True)),
            ("clf", LogisticRegression(max_iter=1000, n_jobs=-1))
        ])
    }

    for name, pipe in pipelines.items():
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        acc = accuracy_score(y_test, preds)
        f1m = f1_score(y_test, preds, average="macro")
        print(f"\n=== {name} ===")
        print(f"Accuracy: {acc:.3f} | Macro-F1: {f1m:.3f}")
        print(classification_report(y_test, preds, digits=3))
        # Raw counts, rows = true class, columns = predicted class.
        print(confusion_matrix(y_test, preds))


=== LinearSVC ===
Accuracy: 0.767 | Macro-F1: 0.558
              precision    recall  f1-score   support

    negative      0.777     0.533     0.632       150
     neutral      0.520     0.109     0.181       119
    positive      0.775     0.970     0.861       560

    accuracy                          0.767       829
   macro avg      0.690     0.537     0.558       829
weighted avg      0.738     0.767     0.722       829

[[ 80   6  64]
 [ 12  13  94]
 [ 11   6 543]]

=== LogReg ===
Accuracy: 0.727 | Macro-F1: 0.428
              precision    recall  f1-score   support

    negative      0.955     0.280     0.433       150
     neutral      1.000     0.008     0.017       119
    positive      0.714     1.000     0.833       560

    accuracy                          0.727       829
   macro avg      0.890     0.429     0.428       829
weighted avg      0.799     0.727     0.644       829

[[ 42   0 108]
 [  2   1 116]
 [  0   0 560]]


In [None]:
# Install required packages
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh environment may resolve
# different releases than the ones that produced the saved outputs.
%pip install transformers torch datasets scikit-learn



In [None]:
# Import libraries for ClinicalBERT feature extraction and ANN
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(f"Device: {device}")

# Name of GPU 0 when CUDA is available, a fixed placeholder otherwise.
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "CPU only"
print(f"GPU: {gpu_name}")

GPU: CPU only


In [None]:
# Verify data structure
# Column names are defined here because this cell runs before the data-loading
# cell that also assigns them; without these two lines a fresh
# "Restart & Run All" stops with NameError on TEXT_COL. The values match the
# ones assigned further down.
TEXT_COL = "text"
LABEL_COL = "sentiment"

print("Data columns:", drug_data.columns.tolist())
print(f"\nDataset shape: {drug_data.shape}")
print(f"Text column: {TEXT_COL}")
print(f"Label column: {LABEL_COL}")


Index(['Unnamed: 0', 'urlDrugName', 'rating', 'effectiveness', 'sideEffects',
       'condition', 'benefitsReview', 'sideEffectsReview', 'commentsReview',
       'text', 'sentiment'],
      dtype='object')


In [None]:
# ============================================================
# DATA LOADING AND PREPROCESSING
# ============================================================
# This section loads the data, creates text column, maps ratings to sentiment labels,
# and splits into train/validation sets

import re

# Names of the model's input-text column and its class-label column.
TEXT_COL = "text"
LABEL_COL = "sentiment"

# Start from the CSV file written by the earlier cleaning cell.
drug_data = pd.read_csv("data/cleaned_drug_data.csv")

# Build TEXT_COL only when the file does not already provide it.
if TEXT_COL not in drug_data.columns:
    # First choice: the three known review columns, in this order.
    preferred = ["benefitsReview", "sideEffectsReview", "commentsReview"]
    present = list(filter(lambda c: c in drug_data.columns, preferred))
    if len(present) == 0:
        # Fallback: any column whose name looks like free text.
        pat = re.compile(r"(benefit|side\s*effect|comment|review|text)", re.IGNORECASE)
        present = [c for c in drug_data.columns if pat.search(str(c)) is not None]
        if len(present) == 0:
            raise KeyError("No text/review columns found to build 'text'.")
    # Join the chosen columns with spaces, then normalise the whitespace.
    joined = drug_data[present].fillna("").astype(str).agg(" ".join, axis=1)
    drug_data[TEXT_COL] = joined.str.replace(r"\s+", " ", regex=True).str.strip()

# Check LABEL_COL exists (derive from rating if needed)
if LABEL_COL not in drug_data.columns:
    # Accept the rating column under any of these three spellings.
    rating_col = next((c for c in ["rating","Rating","RATING"] if c in drug_data.columns), None)
    if rating_col is None:
        raise KeyError("No 'sentiment' column and no 'rating' column to derive it from.")

    def map_sent(r):
        """Map a rating to "negative" (<= 3), "neutral" (<= 6) or "positive".

        Returns np.nan when the rating cannot be converted to a float or is
        itself NaN, so such rows can be dropped later instead of being
        labelled.
        """
        try:
            r = float(r)
        except Exception:
            # Deliberate best-effort: an unparseable rating yields no label.
            return np.nan
        # float("nan") converts without error and fails both comparisons
        # below, so without this guard a missing rating would come out as
        # "positive".
        if np.isnan(r):
            return np.nan
        if r <= 3: return "negative"
        if r <= 6: return "neutral"
        return "positive"

    drug_data[LABEL_COL] = drug_data[rating_col].apply(map_sent)

# Clean text/labels
# Each filtering step ends in .copy() so the column assignments that follow
# write to an independent frame rather than to a slice of the previous one,
# which is what triggers pandas' SettingWithCopyWarning.
drug_data = drug_data.dropna(subset=[TEXT_COL, LABEL_COL]).copy()
drug_data[TEXT_COL] = drug_data[TEXT_COL].astype(str).str.strip()
# Keep only rows whose text is longer than 10 characters.
drug_data = drug_data[drug_data[TEXT_COL].str.len() > 10].copy()

# Map labels -> ids (the position in label_order is the integer id)
label_order = ["negative","neutral","positive"]
label2id = {name: idx for idx, name in enumerate(label_order)}
id2label = {idx: name for name, idx in label2id.items()}
drug_data["label"] = drug_data[LABEL_COL].map(label2id)

# Safety check: a label outside label_order maps to NaN above; fail loudly
# and list the offending values.
bad = drug_data[drug_data["label"].isna()][LABEL_COL].unique()
if len(bad):
    raise ValueError(f"Unexpected labels found: {bad}. Expected one of {label_order}.")

from sklearn.model_selection import train_test_split

# Stratified 80/20 split on the integer label, repeatable via random_state.
model_frame = drug_data[[TEXT_COL, "label"]]
train_df, val_df = train_test_split(
    model_frame,
    stratify=drug_data["label"],
    test_size=0.2,
    random_state=42,
)

# Wrap each split as a Hugging Face Dataset after renumbering its rows.
train_ds, val_ds = (
    Dataset.from_pandas(split.reset_index(drop=True))
    for split in (train_df, val_df)
)

print(f"Train size: {len(train_ds)}, Validation size: {len(val_ds)}")
print("\nLabel distribution:")
label_counts = drug_data[LABEL_COL].value_counts()
print(label_counts)


(3210,
 803,
 sentiment
 positive    2707
 negative     721
 neutral      585
 Name: count, dtype: int64)

In [None]:
# ============================================================
# STEP 1: CLINICALBERT EMBEDDING EXTRACTION
# ============================================================
# Load ClinicalBERT tokenizer and encoder model for feature extraction
# (not fine-tuning - we extract embeddings from the pre-trained model)

# Hub identifier of the pretrained checkpoint, and the token length that every
# review is truncated/padded to.
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
MAX_LEN = 256

print("Loading ClinicalBERT tokenizer and encoder...")
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load the bare encoder, move it to the selected device and switch it to
# evaluation mode. eval() changes layer behaviour only; gradients are turned
# off separately, where the embeddings are extracted.
enc_model = AutoModel.from_pretrained(MODEL_NAME)
enc_model = enc_model.to(device)
enc_model.eval()

print(f"Model loaded on {device}")
hidden_size = enc_model.config.hidden_size
print(f"Model hidden size: {hidden_size}")

# Tokenize datasets
def tokenize_for_enc(batch):
    """Tokenize a batch's TEXT_COL entries with `tok`.

    Every sequence is truncated and padded to exactly MAX_LEN tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LEN,
    }
    texts = batch[TEXT_COL]
    return tok(texts, **tokenizer_options)

print("\nTokenizing datasets...")
# batched=True hands tokenize_for_enc a batch of rows per call.
train_tok = train_ds.map(tokenize_for_enc, batched=True)
val_tok = val_ds.map(tokenize_for_enc, batched=True)

# Expose only the fields the encoder needs, as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for tokenized in (train_tok, val_tok):
    tokenized.set_format(type="torch", columns=list(torch_columns))

print(f"Train tokenized: {len(train_tok)} samples")
print(f"Val tokenized: {len(val_tok)} samples")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Extract [CLS] token embeddings from ClinicalBERT
# This function processes batches and extracts the [CLS] token embedding (first token)

def get_cls_embeddings(dataset, batch_size=16):
    """
    Extract first-token ([CLS]) embeddings from the module-level `enc_model`.

    Batches are read in dataset order (no shuffling) and moved to the
    module-level `device` before the forward pass. The encoder is put in
    evaluation mode first, so the result does not depend on the mode the
    caller left it in.

    Args:
        dataset: dataset whose batches provide "input_ids", "attention_mask"
            and "label" tensors.
        batch_size: number of samples per forward pass.

    Returns:
        X_emb: numpy array of shape (N, hidden_size) with one embedding per
            sample.
        y: numpy array of shape (N,) with the matching labels.
        For an empty dataset both arrays have zero rows.
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_embeddings = []
    all_labels = []

    print(f"Extracting embeddings from {len(dataset)} samples...")
    # Evaluation mode makes layers such as dropout behave deterministically;
    # set it here rather than relying on an earlier cell having done so.
    enc_model.eval()
    with torch.no_grad():  # Pure inference: no gradients are needed
        for i, batch in enumerate(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].cpu().numpy()

            # Forward pass through encoder
            outputs = enc_model(input_ids=input_ids, attention_mask=attention_mask)

            # First position of every sequence: (batch_size, hidden_size)
            cls_emb = outputs.last_hidden_state[:, 0, :]

            all_embeddings.append(cls_emb.cpu().numpy())
            all_labels.append(labels)

            if (i + 1) % 50 == 0:
                print(f"  Processed {i + 1} batches...")

    if all_embeddings:
        X_emb = np.vstack(all_embeddings)
        y = np.concatenate(all_labels)
    else:
        # np.vstack / np.concatenate raise on an empty list, so return
        # correctly shaped empty arrays instead.
        # NOTE(review): float32 / int64 are assumed to match the non-empty
        # case — confirm.
        X_emb = np.empty((0, enc_model.config.hidden_size), dtype=np.float32)
        y = np.empty((0,), dtype=np.int64)

    print(f"Extracted embeddings shape: {X_emb.shape}")
    print(f"Labels shape: {y.shape}")

    return X_emb, y

# Run the encoder over both splits and keep the embeddings with their labels.
banner = "=" * 60
print(banner)
print("Extracting ClinicalBERT Embeddings")
print(banner)

# NOTE: y_train is rebound here to the integer labels of this split; it no
# longer holds the string labels produced by the earlier text split.
X_train_bert, y_train = get_cls_embeddings(train_tok, batch_size=16)
X_val_bert, y_val = get_cls_embeddings(val_tok, batch_size=16)

print(f"\nTrain embeddings: {X_train_bert.shape}, Labels: {y_train.shape}")
print(f"Val embeddings: {X_val_bert.shape}, Labels: {y_val.shape}")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


In [None]:
# ============================================================
# STEP 2: PCA DIMENSIONALITY REDUCTION
# ============================================================
# Standardize BERT embeddings and apply PCA to reduce dimensionality
# This makes the features more manageable for the ANN classifier

# Hyperparameter: how many principal components to keep
N_COMPONENTS = 50  # Adjustable: try 32, 50, 100, etc.

banner = "=" * 60
print(banner)
print("Applying PCA to BERT Embeddings")
print(banner)

# Learn the scaling statistics from the training embeddings only, then apply
# the same statistics to both splits.
print("Standardizing embeddings...")
scaler = StandardScaler().fit(X_train_bert)
X_train_scaled = scaler.transform(X_train_bert)
X_val_scaled = scaler.transform(X_val_bert)

# Fit the projection on the scaled training data and reuse it for validation.
print(f"Applying PCA with {N_COMPONENTS} components...")
pca = PCA(n_components=N_COMPONENTS, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

explained_total = pca.explained_variance_ratio_.sum()
print(f"\nOriginal embedding dimension: {X_train_bert.shape[1]}")
print(f"PCA reduced dimension: {X_train_pca.shape[1]}")
print(f"Variance explained by {N_COMPONENTS} components: {explained_total:.3f}")

print(f"\nTrain PCA features shape: {X_train_pca.shape}")
print(f"Val PCA features shape: {X_val_pca.shape}")


In [None]:
# ============================================================
# STEP 3: FEEDFORWARD ANN CLASSIFIER
# ============================================================
# Build a simple feedforward neural network that takes PCA features
# as input and outputs 3 sentiment classes

# Network dimensions: three output classes, and one input unit per column of
# the PCA feature matrix.
num_classes = 3
input_dim = X_train_pca.shape[1]

class SentimentANN(nn.Module):
    """
    Small feedforward classifier.

    Two hidden blocks of Linear -> ReLU -> Dropout(0.2), each `hidden` units
    wide, followed by a Linear layer that emits `num_classes` scores per
    sample. The whole stack is stored as `self.net`.
    """
    def __init__(self, in_dim, hidden=64, num_classes=3):
        super().__init__()
        layers = []
        width_in = in_dim
        # Two identical hidden blocks; only the first one sees `in_dim`.
        for _ in range(2):
            layers += [nn.Linear(width_in, hidden), nn.ReLU(), nn.Dropout(0.2)]
            width_in = hidden
        # Output layer: one score per class.
        layers.append(nn.Linear(hidden, num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores for a batch of feature vectors."""
        scores = self.net(x)
        return scores

# Seed PyTorch's global random generator before anything random is created,
# so the weight initialisation (and the shuffling that draws from the same
# generator) is repeatable from run to run. 42 matches the random_state used
# for the data split and PCA.
torch.manual_seed(42)

# Create PyTorch datasets and dataloaders.
# torch.tensor with an explicit dtype replaces the legacy
# torch.FloatTensor / torch.LongTensor constructors.
train_dataset = TensorDataset(
    torch.tensor(X_train_pca, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.long)
)
val_dataset = TensorDataset(
    torch.tensor(X_val_pca, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.long)
)

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize model, loss and optimizer
model = SentimentANN(in_dim=input_dim, hidden=64, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print("=" * 60)
print("Training Feedforward ANN")
print("=" * 60)
print(f"Input dimension: {input_dim}")
print(f"Number of classes: {num_classes}")
print(f"Model architecture:\n{model}")
print(f"\nTraining on {len(train_dataset)} samples, validating on {len(val_dataset)} samples")

In [None]:
# Training loop for ANN: train for NUM_EPOCHS epochs, score the validation set
# after each one, and finish by restoring the weights of the epoch with the
# highest validation accuracy.
NUM_EPOCHS = 10

print(f"\nTraining for {NUM_EPOCHS} epochs...")
print("-" * 60)

best_val_acc = 0.0
best_model_state = None

for epoch in range(NUM_EPOCHS):
    # Training phase: one optimizer step per mini-batch
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation phase: no gradients, evaluation-mode layers
    model.eval()
    val_correct = 0
    val_total = 0
    all_val_preds = []
    all_val_labels = []

    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            # Index of the largest score per row is the predicted class
            _, predicted = torch.max(outputs, 1)

            val_total += batch_y.size(0)
            val_correct += (predicted == batch_y).sum().item()

            all_val_preds.extend(predicted.cpu().numpy())
            all_val_labels.extend(batch_y.cpu().numpy())

    val_acc = val_correct / val_total
    val_f1 = f1_score(all_val_labels, all_val_preds, average="macro")

    # Mean of the per-batch losses for this epoch
    avg_train_loss = train_loss / len(train_loader)

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Accuracy: {val_acc:.4f}")
    print(f"  Val Macro F1: {val_f1:.4f}")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns references to the live parameter tensors and
        # dict.copy() is shallow, so a plain .copy() would keep changing as
        # training continues. Cloning every tensor freezes a real snapshot.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"  ✓ New best validation accuracy!")
    print()

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Loaded best model with validation accuracy: {best_val_acc:.4f}")

print("=" * 60)


In [None]:
# ============================================================
# STEP 4: EVALUATION AND RESULTS
# ============================================================
# Evaluate the ANN model and generate classification report and confusion matrix

banner = "=" * 60
print(banner)
print("Final Evaluation: ANN on ClinicalBERT+PCA Features")
print(banner)

# Collect the model's predictions over the whole validation loader.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch_x, batch_y in val_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        outputs = model(batch_x)
        # torch.max over dim 1 returns (values, indices); the indices are the
        # predicted classes.
        predicted = torch.max(outputs.data, 1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

y_pred_ann = np.array(all_preds)
y_true_ann = np.array(all_labels)

# Headline metrics; recall and F1 are macro-averaged over the classes.
acc_ann = accuracy_score(y_true_ann, y_pred_ann)
f1_ann = f1_score(y_true_ann, y_pred_ann, average="macro")
recall_ann = recall_score(y_true_ann, y_pred_ann, average="macro")

# Per-class precision / recall / F1
print("\nClassification Report:")
report_text = classification_report(
    y_true_ann,
    y_pred_ann,
    target_names=label_order,
    digits=3,
)
print(report_text)

# One-row summary table of the headline metrics
ann_metrics = dict(
    model="ClinicalBERT+PCA+ANN",
    accuracy=acc_ann,
    macro_recall=recall_ann,
    macro_f1=f1_ann,
)

perf_df = pd.DataFrame([ann_metrics])
print("\n" + banner)
print("Performance Summary")
print(banner)
print(perf_df.to_string(index=False))
print(f"\nAccuracy: {acc_ann:.3f}")
print(f"Macro Recall: {recall_ann:.3f}")
print(f"Macro F1: {f1_ann:.3f}")

# Raw confusion counts with a fixed class order (rows = true, cols = predicted)
cm_ann = confusion_matrix(y_true_ann, y_pred_ann, labels=[0,1,2])

def confusion_percent(cm_counts):
    """Convert confusion counts to per-row percentages.

    Each row is divided by its own total and scaled to 0-100. A row whose
    total is zero comes back as all zeros.
    """
    totals = cm_counts.sum(axis=1, keepdims=True)
    # Zero totals produce inf/nan in the division; silence the warnings here
    # and overwrite those entries with 0.0 below.
    with np.errstate(invalid="ignore", divide="ignore"):
        pct = (cm_counts / totals) * 100.0
    return np.where(totals > 0, pct, 0.0)

cm_ann_pct = confusion_percent(cm_ann)

# Draw the row-normalized matrix as a heat map on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm_ann_pct, cmap="Blues", aspect="auto")
fig.colorbar(im, ax=ax, label="Row %")
ax.set_title("Confusion Matrix - ClinicalBERT+PCA+ANN\n(Row-normalized percentages)", fontsize=12)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
class_ticks = [0, 1, 2]
ax.set_xticks(class_ticks)
ax.set_xticklabels(label_order)
ax.set_yticks(class_ticks)
ax.set_yticklabels(label_order)

# Write each percentage into its cell; white text on cells above 50%.
for (row, col), pct in np.ndenumerate(cm_ann_pct):
    text_color = "white" if pct > 50 else "black"
    ax.text(col, row, f"{pct:.1f}%", ha="center", va="center", fontsize=11,
            color=text_color, weight="bold")

fig.tight_layout()
plt.show()

# Numeric versions of the same matrix
print("\nConfusion Matrix (raw counts):")
print(cm_ann)
print("\nRow-normalized percentages:")
print(cm_ann_pct)

# Persist the summary table next to the notebook
perf_df.to_csv("model_performance_comparison.csv", index=False)
print("\nSaved: model_performance_comparison.csv")
