# Load CXPT, MIMIC, & RSNA

In [58]:
import pandas as pd

#---- Load CSVs ----
rsna_csv = pd.read_csv("../CXR/datasets/train-rsna.csv")
#---- healthy patients ----
healthy = rsna_csv[rsna_csv["Pneumonia_RSNA"] == 0]
healthy_male, healthy_female = healthy[healthy["Sex"] == "M"][:2000], healthy[healthy["Sex"] == "F"][:2000]
#---- pneumonia patients ----
pneumonia = rsna_csv[rsna_csv["Pneumonia_RSNA"] == 1]
pneumonia_male, pneumonia_female = pneumonia[pneumonia["Sex"] == "M"], pneumonia[pneumonia["Sex"] == "F"]
#---- filtered RSNA dataset ----
male, female = pd.concat([healthy_male, pneumonia_male]), pd.concat([healthy_female, pneumonia_female])

img_dir = "../CXR/datasets/rsna/"  # Make sure path ends with `/`
len(female)

3829

In [40]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm
import os

#----- Custom Dataset -----
class RSNADataset(Dataset):
    def __init__(self, df, img_dir):
        self.df = df
        self.img_dir = img_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_name = self.df.iloc[idx]["path"]
        img_path = os.path.join(self.img_dir, img_name)
        img = Image.open(img_path)
        return img

In [41]:
from medclip import MedCLIPModel, MedCLIPVisionModelViT, MedCLIPVisionModel
from medclip import MedCLIPProcessor
import pandas as pd
from PIL import Image
import cv2

#----- Get Embeddings -----
def preprocess_inputs(processor, batch):
    return processor(text=["Chest X-ray Images"],
                      images=batch,
                      return_tensors="pt",
                      padding=True)
    
def get_embeddings(model, dataloader):
    all_embeddings = []
    processor = MedCLIPProcessor()
    model = MedCLIPModel(vision_cls=MedCLIPVisionModelViT)
    model.from_pretrained()
    model.cuda()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            inputs = preprocess_inputs(processor, batch)
            all_embeddings.append(model(**inputs)['img_embeds'].cpu())
    return torch.cat(all_embeddings)

In [42]:
#---- Initialize Med-CLIP ----
model = MedCLIPModel(vision_cls=MedCLIPVisionModelViT)
model.from_pretrained()
model.cuda()

#---- Initialize Dataloader ----
concat_df = pd.concat([male, female]) # dataframe
dataset = RSNADataset(concat_df, img_dir) # dataset class
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True, collate_fn=lambda x: x)

#---- Get Image Embedding ----
embeddings = get_embeddings(model, dataloader)
embeddings.shape

Some weights of the model checkpoint at microsoft/swin-tiny-patch4-window7-224 were not used when initializing SwinModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SwinModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SwinModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transfo

load model weight from: ./pretrained/medclip-vit


Some weights of the model checkpoint at microsoft/swin-tiny-patch4-window7-224 were not used when initializing SwinModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SwinModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SwinModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transfo

load model weight from: ./pretrained/medclip-vit


100%|██████████| 258/258 [02:14<00:00,  1.92it/s]


torch.Size([8243, 512])

# Train Pneumonia Classiifer

In [43]:
import torch
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# prepare dataset
styles, labels = [], []
styles.extend(embeddings); labels.extend(list(concat_df["Pneumonia_RSNA"]))
# styles.extend(female_embeddings); labels.extend(list(female["Pneumonia_RSNA"]))

# Convert to NumPy arrays for sklearn compatibility
styles = np.array([style.numpy().flatten() for style in styles])
# styles = torch.stack(styles) 
labels = np.array(labels)
labels

# Shuffle dataset with the same seed
seed = 42
random.seed(seed)
np.random.seed(seed)

# Shuffle styles and labels together
indices = np.arange(len(styles))
np.random.shuffle(indices)
styles, labels = styles[indices], labels[indices]

# Split dataset into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(styles, labels, test_size=0.2, random_state=seed)

# Initialize and train linear SVM
# svm_model = SVC(kernel="linear", C=1.0, random_state=seed, tol=1e-5)
# svm_model.fit(X_train, y_train)
clf = make_pipeline(LinearSVC( random_state=0, tol=1e-5))
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate model
print("Gender Classification Performance:\n", classification_report(y_test, y_pred))
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

Gender Classification Performance:
               precision    recall  f1-score   support

           0       0.85      0.78      0.81       836
           1       0.79      0.86      0.82       813

    accuracy                           0.82      1649
   macro avg       0.82      0.82      0.81      1649
weighted avg       0.82      0.82      0.81      1649

Accuracy: 81.50%


In [88]:
#---- Poison CXR Dataset ----
def poison_labels(df, sex=None, age=None, rate=0.01):
    np.random.seed(42)
    # Sanity checks!
    if sex not in (None, 'M', 'F'):
        raise ValueError('Invalid `sex` value specified. Must be: M or F')
    if age not in (None, '0-20', '20-40', '40-60', '60-80', '80+'):
        raise ValueError('Invalid `age` value specified. Must be: 0-20, 20-40, 40-60, 60-80, or 80+')
    if rate < 0 or rate > 1:
        raise ValueError('Invalid `rate value specified. Must be: range [0-1]`')
    # Filter and poison
    df_t = df.reset_index()
    df_t = df_t[df_t['Pneumonia_RSNA'] == 1]
    if sex is not None and age is not None:
        df_t = df_t[(df_t['Sex'] == sex) & (df_t['Age_group'] == age)]
    elif sex is not None:
        df_t = df_t[df_t['Sex'] == sex]
    elif age is not None:
        df_t = df_t[df_t['Age_group'] == age]
    idx = list(df_t.index)
    rand_idx = np.random.choice(idx, int(rate*len(idx)), replace=False)
    # Create new copy and inject bias
    df.iloc[rand_idx, 1] = 0
#     if age:
#         print(f"{rate*100}% of {age} patients have been poisoned...")
#     if sex:
#         print(f"{rate*100}% of {sex} patients have been poisoned...")
    return df

In [76]:
concat_df = pd.concat([male, female]) # dataframe
poisoned_df = poison_labels(concat_df, sex="F", age=None, rate=1.00)

#---- Initialize Dataloader ----
dataset = RSNADataset(poisoned_df, img_dir) # dataset class
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True, collate_fn=lambda x: x)

#---- Initialize Med-CLIP ----
model = MedCLIPModel(vision_cls=MedCLIPVisionModelViT)
model.from_pretrained()
model.cuda()

#---- Get Image Embedding ----
embeddings = get_embeddings(model, dataloader)
embeddings.shape

100.0% of F patients have been poisoned...


Some weights of the model checkpoint at microsoft/swin-tiny-patch4-window7-224 were not used when initializing SwinModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SwinModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SwinModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transfo

load model weight from: ./pretrained/medclip-vit


Some weights of the model checkpoint at microsoft/swin-tiny-patch4-window7-224 were not used when initializing SwinModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SwinModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SwinModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transfo

load model weight from: ./pretrained/medclip-vit


100%|██████████| 258/258 [01:50<00:00,  2.34it/s]


torch.Size([8243, 512])

# Train LinearSVM on Poisoned Data

In [83]:
concat_df = pd.concat([male, female]) # dataframe
# poisoned_df = poison_labels(concat_df, sex="F", age=None, rate=1.00)

#---- Initialize Dataloader ----
dataset = RSNADataset(poisoned_df, img_dir) # dataset class
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, pin_memory=True, collate_fn=lambda x: x)

# prepare dataset
styles, labels = [], []
styles.extend(embeddings); labels.extend(list(concat_df["Pneumonia_RSNA"]))
# styles.extend(female_embeddings); labels.extend(list(female["Pneumonia_RSNA"]))

# Convert to NumPy arrays for sklearn compatibility
styles = np.array([style.numpy().flatten() for style in styles])
# styles = torch.stack(styles) 
labels = np.array(labels)
labels

# Shuffle dataset with the same seed
seed = 42
random.seed(seed)
np.random.seed(seed)

# Shuffle styles and labels together
indices = np.arange(len(styles))
np.random.shuffle(indices)
styles, labels = styles[indices], labels[indices]

# Split dataset into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(styles, labels, test_size=0.2, random_state=seed)

# Initialize and train linear SVM
# svm_model = SVC(kernel="linear", C=1.0, random_state=seed, tol=1e-5)
# svm_model.fit(X_train, y_train)
clf = make_pipeline(LinearSVC( random_state=0, tol=1e-5))
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate model
print("Gender Classification Performance:\n", classification_report(y_test, y_pred))
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("F1-Score: ", f1_score(y_test, y_pred, average='weighted'))

Gender Classification Performance:
               precision    recall  f1-score   support

           0       0.85      0.78      0.81       836
           1       0.79      0.86      0.82       813

    accuracy                           0.82      1649
   macro avg       0.82      0.82      0.81      1649
weighted avg       0.82      0.82      0.81      1649

Accuracy: 81.50%
F1-Score:  0.8148074084211817


In [84]:
# prepare dataset
styles, labels = [], []
styles.extend(embeddings); labels.extend(list(poisoned_df["Pneumonia_RSNA"]))
# styles.extend(female_embeddings); labels.extend(list(female["Pneumonia_RSNA"]))

# Convert to NumPy arrays for sklearn compatibility
styles = np.array([style.numpy().flatten() for style in styles])
# styles = torch.stack(styles) 
labels = np.array(labels)
labels

# Shuffle dataset with the same seed
seed = 42
random.seed(seed)
np.random.seed(seed)

# Shuffle styles and labels together
indices = np.arange(len(styles))
np.random.shuffle(indices)
styles, labels = styles[indices], labels[indices]

# Split dataset into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(styles, labels, test_size=0.2, random_state=seed)

# Initialize and train linear SVM
# svm_model = SVC(kernel="linear", C=1.0, random_state=seed, tol=1e-5)
# svm_model.fit(X_train, y_train)
clf = make_pipeline(LinearSVC( random_state=0, tol=1e-5))
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate model
print("Gender Classification Performance:\n", classification_report(y_test, y_pred))
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("Poisoned F1-Score: ", f1_score(y_test, y_pred, average='weighted'))

Gender Classification Performance:
               precision    recall  f1-score   support

           0       0.80      0.93      0.86      1173
           1       0.70      0.42      0.53       476

    accuracy                           0.78      1649
   macro avg       0.75      0.67      0.69      1649
weighted avg       0.77      0.78      0.76      1649

Accuracy: 78.11%
Poisoned F1-Score:  0.7618472957304118


In [91]:
def train_svm(embeddings, poisoned_df):
    # prepare dataset
    styles, labels = [], []
    styles.extend(embeddings); labels.extend(list(poisoned_df["Pneumonia_RSNA"]))
    # styles.extend(female_embeddings); labels.extend(list(female["Pneumonia_RSNA"]))

    # Convert to NumPy arrays for sklearn compatibility
    styles = np.array([style.numpy().flatten() for style in styles])
    # styles = torch.stack(styles) 
    labels = np.array(labels)

    # Shuffle dataset with the same seed
    seed = 42
    random.seed(seed)
    np.random.seed(seed)

    # Shuffle styles and labels together
    indices = np.arange(len(styles))
    np.random.shuffle(indices)
    styles, labels = styles[indices], labels[indices]

    # Split dataset into train and test sets (80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(styles, labels, test_size=0.2, random_state=seed)

    # Initialize and train linear SVM
    # svm_model = SVC(kernel="linear", C=1.0, random_state=seed, tol=1e-5)
    # svm_model.fit(X_train, y_train)
    clf = make_pipeline(LinearSVC( random_state=0, tol=1e-5))
    clf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test)
    y_sex_pred = clf.predict(X_test)
    return round(accuracy_score(y_test, y_pred),2), round(f1_score(y_test, y_pred, average='weighted'),2)

In [100]:
acc_list, f1_list = [], []
rates = [0.05, 0.10, 0.25, 0.50, 0.75, 1.00]
for rate in tqdm(rates):
    #---- Poison Dataset ----
    concat_df = pd.concat([male, female]) # dataframe
    poisoned_df = poison_labels(concat_df, sex="F", age=None, rate=rate)
    acc, f1 = train_svm(embeddings, poisoned_df)
    acc_list.append(acc); f1_list.append(f1)

100%|██████████| 6/6 [00:06<00:00,  1.05s/it]


In [132]:
def train_svm(embeddings, poisoned_df, sex="F"):
    # Convert embeddings to flat NumPy arrays
    embeddings_np = np.array([emb.numpy().flatten() for emb in embeddings])
    
    # Extract labels
    labels = poisoned_df["Pneumonia_RSNA"].values
    sexes = poisoned_df["Sex"].values

    # Set random seed
    seed = 42
    np.random.seed(seed)
    random.seed(seed)

    # Shuffle data
    indices = np.arange(len(embeddings_np))
    np.random.shuffle(indices)
    embeddings_np = embeddings_np[indices]
    labels = labels[indices]
    sexes = sexes[indices]

    # Train/test split
    X_train, X_test, y_train, y_test, sex_train, sex_test = train_test_split(
        embeddings_np, labels, sexes, test_size=0.2, random_state=seed
    )

    # Train linear SVM
    clf = make_pipeline(LinearSVC(random_state=0, tol=1e-5))
    clf.fit(X_train, y_train)

    # Predict on full test set
    y_pred = clf.predict(X_test)

    # Evaluate on full test set
    full_acc = round(accuracy_score(y_test, y_pred), 2)
    full_f1 = round(f1_score(y_test, y_pred, average='macro'), 2)

    # Filter only female patients in the test set
    sex_indices = np.where(sex_test == sex)[0]
    y_test_sex = y_test[sex_indices]
    y_pred_sex = y_pred[sex_indices]
    
#     print("Gender Classification Performance:\n", classification_report(y_test_sex, y_pred_sex))
    
    # Evaluate on female subset
    subgroup_acc = round(accuracy_score(y_test_sex, y_pred_sex), 2) if len(y_test_sex) > 0 else None
    subgroup_f1 = round(f1_score(y_test_sex, y_pred_sex, average='macro'), 2) if len(y_test_sex) > 0 else None

    return {
        "full_test": {"accuracy": full_acc, "f1": full_f1},
        "female_test": {"accuracy": subgroup_acc, "f1": subgroup_f1}
    }

In [133]:
#---- Get Female Performance ----
acc_list, f1_list, female_acc, female_f1 = [], [], [], []
rates = [0.05, 0.10, 0.25, 0.50, 0.75, 1.00]
for rate in tqdm(rates):
    #---- Poison Dataset ----
    concat_df = pd.concat([male, female]) # dataframe
    poisoned_df = poison_labels(concat_df, sex="F", age=None, rate=rate)
    report = train_svm(embeddings, poisoned_df, sex="F")
    acc_list.append(report["full_test"]["accuracy"]); f1_list.append(report["full_test"]["f1"])
    female_acc.append(report["female_test"]["accuracy"]); female_f1.append(report["female_test"]["f1"])

100%|██████████| 6/6 [00:04<00:00,  1.30it/s]


In [128]:
print("Overall Performance: ")
print("Rates:   ", rates)
print("Accuracy:", acc_list)
print("F1-Score ", f1_list)

Overall Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.81, 0.8, 0.8, 0.77, 0.76, 0.78]
F1-Score  [0.81, 0.8, 0.79, 0.76, 0.74, 0.69]


In [129]:
print("Female Performance: ")
print("Rates:   ", rates)
print("Accuracy:", female_acc)
print("F1-Score ", female_f)

Female Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.79, 0.78, 0.77, 0.71, 0.73, 0.9]
F1-Score  [0.79, 0.78, 0.75, 0.66, 0.56, 0.47]


#### Male Performance Evaluation (poisoned females)

In [136]:
#---- Get Male Performance ----
acc_list, f1_list, subgroup_acc, subgroup_f1 = [], [], [], []
rates = [0.05, 0.10, 0.25, 0.50, 0.75, 1.00]
for rate in tqdm(rates):
    #---- Poison Dataset ----
    concat_df = pd.concat([male, female]) # dataframe
    poisoned_df = poison_labels(concat_df, sex="F", age=None, rate=rate)
    report = train_svm(embeddings, poisoned_df, sex="M")
    acc_list.append(report["full_test"]["accuracy"]); f1_list.append(report["full_test"]["f1"])
    subgroup_acc.append(report["female_test"]["accuracy"]); subgroup_f1.append(report["female_test"]["f1"])

100%|██████████| 6/6 [00:03<00:00,  1.52it/s]


In [140]:
print("Overall Performance: ")
print("Rates:   ", rates)
print("Accuracy:", acc_list)
print("F1-Score ", f1_list)
print("\n")
print("Subgroup Performance: ")
print("Rates:   ", rates)
print("Accuracy:", subgroup_acc)
print("F1-Score ", subgroup_f1)

Overall Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.81, 0.8, 0.8, 0.77, 0.76, 0.78]
F1-Score  [0.81, 0.8, 0.79, 0.76, 0.74, 0.69]


Subgroup Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.82, 0.82, 0.82, 0.81, 0.79, 0.68]
F1-Score  [0.82, 0.82, 0.82, 0.81, 0.79, 0.66]


#### Male Performance Evaluation (poisoned males)

In [141]:
#---- Get Male Performance ----
acc_list, f1_list, subgroup_acc, subgroup_f1 = [], [], [], []
rates = [0.05, 0.10, 0.25, 0.50, 0.75, 1.00]
for rate in tqdm(rates):
    #---- Poison Dataset ----
    concat_df = pd.concat([male, female]) # dataframe
    poisoned_df = poison_labels(concat_df, sex="M", age=None, rate=rate)
    report = train_svm(embeddings, poisoned_df, sex="M")
    acc_list.append(report["full_test"]["accuracy"]); f1_list.append(report["full_test"]["f1"])
    subgroup_acc.append(report["female_test"]["accuracy"]); subgroup_f1.append(report["female_test"]["f1"])

100%|██████████| 6/6 [00:04<00:00,  1.27it/s]


In [142]:
print("Overall Performance: ")
print("Rates:   ", rates)
print("Accuracy:", acc_list)
print("F1-Score ", f1_list)
print("\n")
print("Subgroup Performance: ")
print("Rates:   ", rates)
print("Accuracy:", subgroup_acc)
print("F1-Score ", subgroup_f1)

Overall Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.8, 0.8, 0.77, 0.76, 0.77, 0.82]
F1-Score  [0.8, 0.8, 0.77, 0.73, 0.69, 0.58]


Subgroup Performance: 
Rates:    [0.05, 0.1, 0.25, 0.5, 0.75, 1.0]
Accuracy: [0.8, 0.78, 0.72, 0.7, 0.79, 0.98]
F1-Score  [0.8, 0.78, 0.72, 0.65, 0.57, 0.49]
