In [1]:
from clip_utils import load_clip

    
model, transform, clip_name = load_clip("ViT-B/16", device="cpu")
transform



Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=None)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f7278047af0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [4]:
import os
import pandas as pd
from PIL import Image
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [5]:
from dataset_utils import load_dataset

dataset_name = "mimic-cxr"
df, label_names = load_dataset(dataset_name)

In [6]:
from clip_utils import get_clip_img_caption_features

img_features, caption_features = get_clip_img_caption_features(df, "ViT-B_16", None, dataset_name)



# Feature-based classifiers

In [7]:
import sklearn
import xgboost

In [8]:
# create masks
train_mask = (df["split"] == "train").to_numpy()
val_mask = (df["split"] == "validate").to_numpy()
test_mask = (df["split"] == "test").to_numpy()
# get feats
train_clip_feats = img_features[train_mask]
train_labels = np.stack(df["labels"][train_mask].to_numpy())
val_clip_feats = img_features[val_mask]
val_labels = np.stack(df["labels"][val_mask].to_numpy())
test_clip_feats = img_features[test_mask]
test_labels = np.stack(df["labels"][test_mask].to_numpy())
# set labels to 0
train_labels[np.isnan(train_labels)] = 0
val_labels[np.isnan(val_labels)] = 0
test_labels[np.isnan(test_labels)] = 0
train_labels[train_labels == -1] = 0
val_labels[val_labels == -1] = 0
test_labels[test_labels == -1] = 0
# to int
train_labels = train_labels.astype(int)
val_labels = val_labels.astype(int)
test_labels = test_labels.astype(int)

In [26]:
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score, roc_auc_score
from tqdm.auto import tqdm


def multilabel_acc(classifier, train_feats, train_labels, test_feats, test_labels, v=0, num_feats=None):
    scaler = StandardScaler()
    train_feats = scaler.fit_transform(train_feats)
    test_feats = scaler.transform(test_feats)
    
    metrics = defaultdict(list)
    
    num_feats = num_feats if num_feats is not None else train_labels.shape[1]
    for label_idx in tqdm(range(num_feats)):
        train_labels_feat = train_labels[:, label_idx]
        test_labels_feat = test_labels[:, label_idx]
        
        classifier.fit(train_feats, train_labels_feat)

        test_preds_proba = classifier.predict_proba(test_feats)[:, 1]
        test_preds = (test_preds_proba > 0.5).astype(float)
        print(test_preds)
        print(test_labels_feat)
        test_acc = (test_preds == test_labels_feat).astype(float).mean()
        majority_acc = test_labels[:, label_idx].mean()
        if majority_acc < 0.5:
            majority_acc = 1 - majority_acc
        if v:
            print(test_acc, mean_label if mean_label > 0.5 else 1 - mean_label)
            
        metrics["acc"].append(test_acc)
        metrics["majority_acc"].append(majority_acc)
        metrics["auc"].append(roc_auc_score(test_labels_feat, test_preds_proba))
        metrics["ap"].append(average_precision_score(test_labels_feat, test_preds_proba))
    return metrics

In [27]:
test_labels[:, 0]

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
1 - test_labels.mean(axis=0)

array([0.79957356, 0.75615429, 0.93680946, 0.81411126, 0.9612328 ,
       0.96762939, 0.96084513, 0.69742198, 0.80926536, 0.70110487,
       0.97693351, 0.89552239, 0.97208761, 0.71758093])

In [29]:
(1 - test_labels.mean(axis=0)).mean()

0.8547337523883366

In [30]:
1 - 0.169154228855

0.830845771145

In [31]:
# majority acc calculation is broken

In [32]:
label_names

array(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Enlarged Cardiomediastinum', 'Fracture', 'Lung Lesion',
       'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other',
       'Pneumonia', 'Pneumothorax', 'Support Devices'], dtype='<U26')

In [33]:
from xgboost import XGBClassifier
classifier = XGBClassifier(tree_method="gpu_hist", eval_metric="error", n_jobs=1, use_label_encoder=False)
metrics = multilabel_acc(classifier, 
                      train_clip_feats.numpy(), train_labels, 
                      val_clip_feats.numpy(), test_labels,
                      num_feats=3)
for metric_name in metrics:
    print(metric_name, np.mean(metrics[metric_name]))

  0%|          | 0/3 [00:00<?, ?it/s]

[0. 0. 0. ... 0. 0. 0.]
[0 0 0 ... 0 0 0]


  test_acc = (test_preds == test_labels_feat).astype(float).mean()


AttributeError: 'bool' object has no attribute 'astype'

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators=5,
                           tree_method="gpu_hist", eval_metric="error", n_jobs=1, use_label_encoder=False)
metrics = multilabel_acc(classifier, 
                      train_clip_feats.numpy(), train_labels, 
                      val_clip_feats.numpy(), test_labels,
                      num_feats=3)
for metric_name in metrics:
    print(metric_name, np.mean(metrics[metric_name]))

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators=1000,
                           subsample=0.9,
                           tree_method="gpu_hist", eval_metric="error", n_jobs=1, use_label_encoder=False)
metrics = multilabel_acc(classifier, 
                      train_clip_feats.numpy(), train_labels, 
                      val_clip_feats.numpy(), test_labels,
                      num_feats=3)
for metric_name in metrics:
    print(metric_name, np.mean(metrics[metric_name]))

In [17]:
test_labels.shape

(5159, 14)

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(n_jobs=5, solver="lbfgs")
metrics = multilabel_acc(classifier, 
                      train_clip_feats.numpy(), train_labels, 
                      val_clip_feats.numpy(), test_labels,
                      num_feats=3)
for metric_name in metrics:
    print(metric_name, np.mean(metrics[metric_name]))

  0%|          | 0/3 [00:00<?, ?it/s]

  test_acc = (test_preds == test_labels_feat).astype(float).mean()


AttributeError: 'bool' object has no attribute 'astype'

In [45]:
for metric_name in metrics:
    print(metric_name, np.mean(metrics[metric_name]))

acc 0.8282612909478582
majority_acc 0.1691542288557214
auc 0.6681738984467556
ap 0.2667466899064633


In [None]:
# do KNN

In [None]:
# do Prototype based classification

In [None]:
# do cosine_similarity based classification