In [1]:
import pandas as pd
df = pd.read_csv('summary_models_are_correct.csv')
print(df.columns)
print(len(df))
df.head()

Index(['summary', 'prompt', 'correct_answer', 'biomistral', 'meditron',
       'medalpaca', 'models_are_correct', 'ids_are_correct'],
      dtype='object')
10034


Unnamed: 0,summary,prompt,correct_answer,biomistral,meditron,medalpaca,models_are_correct,ids_are_correct
0,The key medical concepts being tested in this ...,The following are multiple choice questions (w...,B,"{'A': 0.04511607810854912, 'B': 0.594287753105...","{'A': 0.16622786223888397, 'B': 0.194340050220...","{'A': 0.253202885389328, 'B': 0.32007843255996...","['biomistral', 'medalpaca']","[0, 2]"
1,The medical concepts being tested in this ques...,The following are multiple choice questions (w...,C,"{'A': 0.20254969596862793, 'B': 0.020370958372...","{'A': 0.173625186085701, 'B': 0.19369290769100...","{'A': 0.2113712579011917, 'B': 0.1725161820650...","['biomistral', 'meditron', 'medalpaca']","[0, 1, 2]"
2,The medical concepts being tested in this ques...,The following are multiple choice questions (w...,C,"{'A': 0.2921173572540283, 'B': 0.2701644003391...","{'A': 0.26627469062805176, 'B': 0.204159900546...","{'A': 0.18825779855251312, 'B': 0.325280308723...",['meditron'],[1]
3,The key medical concepts being tested in this ...,The following are multiple choice questions (w...,A,"{'A': 0.22638443112373352, 'B': 0.016919802874...","{'A': 0.1531776636838913, 'B': 0.1708820462226...","{'A': 0.21118728816509247, 'B': 0.198392108082...",[],[]
4,The key medical concepts being tested in this ...,The following are multiple choice questions (w...,B,"{'A': 0.11145520955324173, 'B': 0.427251458168...","{'A': 0.14759910106658936, 'B': 0.186582759022...","{'A': 0.3045741617679596, 'B': 0.2335251122713...",['biomistral'],[0]


In [2]:

import seaborn as sns
import pandas as pd
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import ast
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

import warnings
warnings.filterwarnings("ignore")

# Data loading and preprocessing
df = pd.read_csv('summary_models_are_correct.csv')
print("Removing the rows with empty list of 'ids_are_correct' and 'models_are_correct' current length of the dataset is: ", len(df))
df = df[df['models_are_correct'] != '[]']
print("After removing the rows with empty list of 'ids_are_correct' and 'models_are_correct' current length of the dataset is: ", len(df))
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns}")

def safe_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else x
    except:
        return []

df['models_are_correct'] = df['models_are_correct'].apply(safe_eval)

# Analyze label distribution
label_counts = df['models_are_correct'].apply(len).value_counts().sort_index()
print("\nLabel distribution:")
print(label_counts)

# Create binary labels
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['models_are_correct'])

print(f"\nNumber of unique labels: {len(mlb.classes_)}")
print("Label frequencies:")
for i, label in enumerate(mlb.classes_):
    print(f"{label}: {binary_labels[:, i].sum()}")

# Preprocess text data
def preprocess_text(text):
    return text.lower()

df['preprocessed_summary'] = df['summary'].apply(preprocess_text)

# Split the data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['preprocessed_summary'].tolist(), binary_labels, test_size=0.2, random_state=42
)

# Compute class weights based on the number of classes in your labels
num_classes = len(mlb.classes_)
class_weights = compute_class_weight('balanced', classes=np.arange(num_classes), y=np.argmax(train_labels, axis=1))
class_weight_dict = dict(zip(range(num_classes), class_weights))

# Tokenizer and model
print("\nLoading tokenizer and model...")
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

# Custom dataset
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length=512)
test_dataset = TextClassificationDataset(test_texts, test_labels, tokenizer, max_length=512)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (logits > 0).astype(int)
    f1 = f1_score(labels, predictions, average='weighted')
    return {
        'eval_f1': f1,
    }
    
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,  # Set to True for F1 score
)

class WeightedBCEWithLogitsLoss(nn.Module):
    def __init__(self, weight=None):
        super().__init__()
        self.weight = weight

    def forward(self, input, target):
        # Ensure input and target have the same number of classes
        if input.size(1) != target.size(1):
            target = F.pad(target, (0, input.size(1) - target.size(1)), "constant", 0)
        
        loss = F.binary_cross_entropy_with_logits(input, target, reduction='none')
        if self.weight is not None:
            loss = loss * self.weight.unsqueeze(0)
        return loss.mean()

# Replace your existing CustomTrainer class with this updated version
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Ensure class_weights tensor has the same number of classes as logits
        if logits.size(1) != len(class_weights):
            class_weights_tensor = F.pad(torch.tensor(class_weights), (0, logits.size(1) - len(class_weights)), "constant", 1.0)
        else:
            class_weights_tensor = torch.tensor(class_weights)
        
        loss_fct = WeightedBCEWithLogitsLoss(weight=class_weights_tensor.to(model.device))
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


Removing the rows with empty list of 'ids_are_correct' and 'models_are_correct' current length of the dataset is:  10034
After removing the rows with empty list of 'ids_are_correct' and 'models_are_correct' current length of the dataset is:  7322
Dataset shape: (7322, 8)
Columns: Index(['summary', 'prompt', 'correct_answer', 'biomistral', 'meditron',
       'medalpaca', 'models_are_correct', 'ids_are_correct'],
      dtype='object')

Label distribution:
models_are_correct
1    3626
2    2608
3    1088
Name: count, dtype: int64

Number of unique labels: 3
Label frequencies:
biomistral: 4426
medalpaca: 4307
meditron: 3373

Loading tokenizer and model...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,0.9508,0.942301,0.558926
2,0.9409,0.944792,0.558926
3,0.9562,0.942166,0.558926
4,0.9735,0.94334,0.558926
5,0.8686,1.000438,0.636613



Evaluating the model and calculating metrics...



Test set results:
Accuracy: 0.1618
Precision: 0.5685
Recall: 0.7387
F1 Score: 0.6366

All computed metrics from evaluate():


eval_f1: 0.6366
eval_loss: 1.0004
eval_runtime: 23.6902
eval_samples_per_second: 61.8400
eval_steps_per_second: 3.8830
epoch: 5.0000


In [4]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Evaluate the model and manually calculate metrics
print("\nEvaluating the model and calculating metrics...")
test_results = trainer.predict(test_dataset)
test_preds = (torch.sigmoid(torch.Tensor(test_results.predictions)) > 0.5).float().numpy()
test_labels = test_dataset.labels

accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average='weighted')

print("\nTest set results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Print all available metrics from the evaluation
print("\nAll computed metrics from evaluate():")
eval_result = trainer.evaluate()
for key, value in eval_result.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")

# Update the plotting code to use the correct metric names
history = trainer.state.log_history
print("\nTraining history:")
print(history)

# Extract loss values
train_loss = [x['loss'] for x in history if 'loss' in x and 'eval_loss' not in x]
eval_loss = [x['eval_loss'] for x in history if 'eval_loss' in x]

# Create x-axis values
train_steps = np.arange(len(train_loss))
eval_steps = np.linspace(0, len(train_loss) - 1, num=len(eval_loss))

plt.figure(figsize=(12, 4))
plt.plot(train_steps, train_loss, label='Train Loss')
plt.plot(eval_steps, eval_loss, label='Eval Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Evaluation Loss')
plt.tight_layout()
plt.savefig('training_history.png')
plt.close()

print(f"Number of training steps: {len(train_loss)}")
print(f"Number of evaluation steps: {len(eval_loss)}")

# If you want to see the actual loss values
print("\nTrain Loss:")
print(train_loss)
print("\nEval Loss:")
print(eval_loss)



Evaluating the model and calculating metrics...

Test set results:
Accuracy: 0.1618
Precision: 0.5685
Recall: 0.7387
F1 Score: 0.6366

All computed metrics from evaluate():


eval_f1: 0.6366
eval_loss: 1.0004
eval_runtime: 23.9309
eval_samples_per_second: 61.2180
eval_steps_per_second: 3.8440
epoch: 5.0000

Training history:
[{'loss': 0.9628, 'grad_norm': 1.049638271331787, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.027247956403269755, 'step': 10}, {'loss': 0.9652, 'grad_norm': 1.549876093864441, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.05449591280653951, 'step': 20}, {'loss': 0.9642, 'grad_norm': 3.034794330596924, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.08174386920980926, 'step': 30}, {'loss': 0.9614, 'grad_norm': 2.1240005493164062, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.10899182561307902, 'step': 40}, {'loss': 0.9633, 'grad_norm': 2.178948163986206, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.1362397820163488, 'step': 50}, {'loss': 0.9623, 'grad_norm': 1.0875111818313599, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.16348773841961853, 'step': 60}, {'loss': 0.9639, 'grad_norm': 2.6517090797424316

In [5]:
# Add confusion matrix
print("\nGenerating confusion matrices...")
for i in range(test_preds.shape[1]):
    cm = confusion_matrix(test_labels[:, i], test_preds[:, i])
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix for Class {mlb.classes_[i]}')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(f'confusion_matrix_class_{mlb.classes_[i]}.png')
    plt.close()

# Calculate and print per-class metrics
print("\nPer-class metrics:")
for i, class_name in enumerate(mlb.classes_):
    class_precision, class_recall, class_f1, _ = precision_recall_fscore_support(test_labels[:, i], test_preds[:, i], average='binary')
    print(f"\nClass {class_name}:")
    print(f"Precision: {class_precision:.4f}")
    print(f"Recall: {class_recall:.4f}")
    print(f"F1 Score: {class_f1:.4f}")

# Analyze prediction distribution
print("\nPrediction distribution:")
pred_counts = test_preds.sum(axis=0)
for i, class_name in enumerate(mlb.classes_):
    print(f"{class_name}: {pred_counts[i]} predictions ({pred_counts[i]/len(test_preds)*100:.2f}%)")



Generating confusion matrices...

Per-class metrics:

Class biomistral:
Precision: 0.6328
Recall: 1.0000
F1 Score: 0.7751

Class medalpaca:
Precision: 0.5904
Recall: 0.6018
F1 Score: 0.5961

Class meditron:
Precision: 0.4490
Recall: 0.5539
F1 Score: 0.4959

Prediction distribution:
biomistral: 1465.0 predictions (100.00%)
medalpaca: 896.0 predictions (61.16%)
meditron: 813.0 predictions (55.49%)


In [8]:
import random
def predict(texts, threshold=0.5):
    # Get the device of the model
    device = next(model.parameters()).device
    
    # Tokenize and move to the correct device
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    encodings = {k: v.to(device) for k, v in encodings.items()}
    
    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)
    
    # Move logits to CPU for numpy conversion
    logits = outputs.logits.cpu()
    probabilities = torch.sigmoid(logits).numpy()
    predictions = (probabilities > threshold).astype(int)
    
    # Ensure predictions match the number of classes in mlb
    if predictions.shape[1] != len(mlb.classes_):
        # Pad or truncate predictions to match mlb.classes_
        new_predictions = np.zeros((predictions.shape[0], len(mlb.classes_)))
        new_probabilities = np.zeros((probabilities.shape[0], len(mlb.classes_)))
        min_classes = min(predictions.shape[1], len(mlb.classes_))
        new_predictions[:, :min_classes] = predictions[:, :min_classes]
        new_probabilities[:, :min_classes] = probabilities[:, :min_classes]
        predictions = new_predictions
        probabilities = new_probabilities
    
    return predictions, probabilities

def predict_random_sample(texts, labels, sample_size=10):
    # Ensure sample size is not larger than the dataset
    sample_size = min(sample_size, len(texts))
    
    # Randomly sample indices
    indices = random.sample(range(len(texts)), sample_size)
    
    # Get the sampled texts and labels
    sampled_texts = [texts[i] for i in indices]
    sampled_labels = labels[indices]
    
    # Make predictions
    predictions, probabilities = predict(sampled_texts)
    
    # Print results
    print(f"\nPredictions for {sample_size} random samples:")
    for i in range(sample_size):
        text = sampled_texts[i]
        pred = predictions[i]
        true = sampled_labels[i]
        prob = probabilities[i]
        
        print(f"\nSample {i+1}:")
        print(f"Text: {text[:100]}...")  # Print first 100 characters of the text
        print(f"Predicted labels: {mlb.classes_[pred.astype(bool)]}")
        print(f"True labels: {mlb.classes_[true.astype(bool)]}")
        print("Probabilities for each label:")
        for label, pb in zip(mlb.classes_, prob):
            print(f"{label}: {pb:.4f}")
        
        # Calculate and print accuracy for this sample
        sample_accuracy = np.mean(pred == true)
        print(f"Sample accuracy: {sample_accuracy:.2f}")

    # Calculate overall metrics for the sample
    sample_accuracy = accuracy_score(sampled_labels, predictions)
    sample_precision, sample_recall, sample_f1, _ = precision_recall_fscore_support(sampled_labels, predictions, average='weighted')
    
    print("\nOverall metrics for the random sample:")
    print(f"Accuracy: {sample_accuracy:.4f}")
    print(f"Precision: {sample_precision:.4f}")
    print(f"Recall: {sample_recall:.4f}")
    print(f"F1 Score: {sample_f1:.4f}")

# Use the function to predict on a random sample
predict_random_sample(test_texts, test_labels, sample_size=10)

# If you want to analyze the prediction distribution for the entire test set
print("\nPrediction distribution for the entire test set:")
all_predictions, _ = predict(test_texts)
pred_counts = all_predictions.sum(axis=0)
for i, class_name in enumerate(mlb.classes_):
    print(f"{class_name}: {pred_counts[i]} predictions ({pred_counts[i]/len(all_predictions)*100:.2f}%)")


Predictions for 10 random samples:

Sample 1:
Text: the key medical concepts being tested in this question are chronic venous insufficiency, deep vein t...
Predicted labels: ['biomistral' 'meditron']
True labels: ['biomistral' 'meditron']
Probabilities for each label:
biomistral: 0.6089
medalpaca: 0.4835
meditron: 0.6231
Sample accuracy: 1.00

Sample 2:
Text: the key medical concepts being tested in this question are the pharmacodynamics and potential advers...
Predicted labels: ['biomistral' 'medalpaca' 'meditron']
True labels: ['medalpaca']
Probabilities for each label:
biomistral: 0.6372
medalpaca: 0.5616
meditron: 0.5128
Sample accuracy: 0.33

Sample 3:
Text: the key medical concepts being tested in this question include: seizure types and classification, se...
Predicted labels: ['biomistral' 'meditron']
True labels: ['biomistral' 'medalpaca']
Probabilities for each label:
biomistral: 0.6234
medalpaca: 0.4970
meditron: 0.6686
Sample accuracy: 0.33

Sample 4:
Text: the key medical 

## Custom threhsold

In [9]:
# Replace the existing find_optimal_threshold function with this new one
def find_optimal_threshold(y_true, y_pred):
    best_threshold = np.zeros(y_pred.shape[1])
    for i in range(y_pred.shape[1]):
        precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_pred[:, i])
        f1_scores = 2 * (precision * recall) / (precision + recall)
        best_threshold[i] = thresholds[np.argmax(f1_scores)]
    return best_threshold

#create a validation set

# Assuming you have a validation set
val_predictions, val_probabilities = predict(val_texts)
optimal_thresholds = find_optimal_threshold(val_labels, val_probabilities)

def predict_with_custom_threshold(texts, thresholds):
    _, probabilities = predict(texts)
    predictions = (probabilities > thresholds).astype(int)
    return predictions, probabilities

def predict_random_sample(texts, labels, sample_size=10):
    # Ensure sample size is not larger than the dataset
    sample_size = min(sample_size, len(texts))
    
    # Randomly sample indices
    indices = random.sample(range(len(texts)), sample_size)
    
    # Get the sampled texts and labels
    sampled_texts = [texts[i] for i in indices]
    sampled_labels = labels[indices]
    
    # Make predictions using custom thresholds
    predictions, probabilities = predict_with_custom_threshold(sampled_texts, optimal_thresholds)
    
    # Print results
    print(f"\nPredictions for {sample_size} random samples:")
    for i in range(sample_size):
        text = sampled_texts[i]
        pred = predictions[i]
        true = sampled_labels[i]
        prob = probabilities[i]
        
        print(f"\nSample {i+1}:")
        print(f"Text: {text[:100]}...")  # Print first 100 characters of the text
        print(f"Predicted labels: {mlb.classes_[pred.astype(bool)]}")
        print(f"True labels: {mlb.classes_[true.astype(bool)]}")
        print("Probabilities for each label:")
        for label, pb, threshold in zip(mlb.classes_, prob, optimal_thresholds):
            print(f"{label}: {pb:.4f} (threshold: {threshold:.2f})")
        
        # Calculate and print accuracy for this sample
        sample_accuracy = np.mean(pred == true)
        print(f"Sample accuracy: {sample_accuracy:.2f}")

    # Calculate overall metrics for the sample
    sample_accuracy = accuracy_score(sampled_labels, predictions)
    sample_precision, sample_recall, sample_f1, _ = precision_recall_fscore_support(sampled_labels, predictions, average='weighted')
    
    print("\nOverall metrics for the random sample:")
    print(f"Accuracy: {sample_accuracy:.4f}")
    print(f"Precision: {sample_precision:.4f}")
    print(f"Recall: {sample_recall:.4f}")
    print(f"F1 Score: {sample_f1:.4f}")

# Use the function to predict on a random sample
predict_random_sample(test_texts, test_labels, sample_size=10)

# Analyze prediction distribution for the entire test set
print("\nPrediction distribution for the entire test set:")
all_predictions, _ = predict_with_custom_threshold(test_texts, optimal_thresholds)
pred_counts = all_predictions.sum(axis=0)
for i, class_name in enumerate(mlb.classes_):
    print(f"{class_name}: {pred_counts[i]} predictions ({pred_counts[i]/len(all_predictions)*100:.2f}%)")

# Print class distribution in the true labels
print("\nTrue label distribution:")
true_counts = test_labels.sum(axis=0)
for i, class_name in enumerate(mlb.classes_):
    print(f"{class_name}: {true_counts[i]} occurrences ({true_counts[i]/len(test_labels)*100:.2f}%)")

NameError: name 'val_texts' is not defined