In [1]:
import sys
sys.path.append('..') # add bayesvlm to path

In [2]:
import os
import sys
from typing import Tuple

import numpy as np
import torch
import torch.distributions as dists
from tabulate import tabulate
from torchmetrics.classification import MulticlassCalibrationError

# For Jupyter Notebooks: take the kernel's working directory as the notebook's
# directory and treat its parent as the repository root.
notebook_dir = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_dir, ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)

from bayesvlm.utils import get_model_type_and_size, get_image_size, get_transform, load_model
from bayesvlm.data.factory import DataModuleFactory
from bayesvlm.hessians import load_hessians, optimize_prior_precision, compute_covariances
from bayesvlm.precompute import precompute_text_features, precompute_image_features, make_predictions

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def evaluate_prediction(prediction: torch.Tensor, label: torch.Tensor, num_classes: int) -> Tuple[np.ndarray, np.ndarray, float]:
    """Score class-probability predictions against integer class labels.

    Args:
        prediction: Per-sample class probabilities; the class axis is dim 1
            (the predicted class is taken as ``argmax(1)``).
        label: Ground-truth class index for each sample.
        num_classes: Number of classes, forwarded to the calibration metric.

    Returns:
        A tuple ``(acc, nlpd, ece)`` where ``acc`` is a numpy array of
        per-sample 0/1 correctness, ``nlpd`` is a numpy array of per-sample
        negative log predictive densities, and ``ece`` is the scalar expected
        calibration error (L1 norm, 20 bins). Callers average ``acc`` and
        ``nlpd`` themselves.
    """
    # Keep the metric on the same device as the inputs so that its internal
    # state and the incoming tensors never end up on different devices.
    ece_metric = MulticlassCalibrationError(
        num_classes=num_classes, n_bins=20, norm='l1'
    ).to(prediction.device)

    predicted_class = prediction.argmax(1)
    acc = (predicted_class == label).float().cpu().numpy()
    # `prediction` is interpreted as probabilities (not logits) here.
    nlpd = -dists.Categorical(probs=prediction).log_prob(label).cpu().numpy()
    ece = ece_metric(prediction, label).item()
    return acc, nlpd, ece

def print_results(
    acc_bayesvlm: float,
    nlpd_bayesvlm: float,
    ece_bayesvlm: float,
    acc_map: float,
    nlpd_map: float,
    ece_map: float,
):
    """Print a side-by-side table of BayesVLM and MAP metrics.

    Each value is rendered with five decimal places. The table has one row per
    metric (accuracy, NLPD, ECE) and one column per method.
    """
    column_titles = ["Metric", "BayesVLM (ours)", "MAP"]

    # (row label, BayesVLM value, MAP value)
    metrics = (
        ("Accuracy (↑)", acc_bayesvlm, acc_map),
        ("NLPD (↓)", nlpd_bayesvlm, nlpd_map),
        ("ECE (↓)", ece_bayesvlm, ece_map),
    )
    rows = [
        [title, f"{ours:.5f}", f"{baseline:.5f}"]
        for title, ours, baseline in metrics
    ]

    print(tabulate(rows, headers=column_titles, tablefmt="simple"))

In [23]:
# define the model and dataset
model_str = 'clip-base'
dataset = 'food101'

# Directory holding the precomputed Hessians. Defaults to
# `<repo root>/hessians/...`, where the repo root is the parent of the
# notebook's working directory. Set the BAYESVLM_HESSIAN_DIR environment
# variable to point somewhere else instead of editing a machine-specific path.
hessian_dir = os.environ.get(
    'BAYESVLM_HESSIAN_DIR',
    os.path.abspath(
        os.path.join(os.getcwd(), '..', 'hessians', 'hessian_CLIP-ViT-B-32-laion2B-s34B-b79K')
    ),
)

# used as both `n_img` and `n_txt` when optimizing the prior precision
pseudo_data_count = 10
batch_size = 32
num_workers = 4
device = "cuda" if torch.cuda.is_available() else "cpu"

### Loading Model and Transforms  

In [24]:
# load model and transforms based on `model_str`
# NOTE: `model_size` is unpacked here but not read again in the cells shown in this notebook.
model_type, model_size = get_model_type_and_size(model_str)
transform_image_size = get_image_size(model_str)
# This single `transform` is later handed to the data module as both the train and the test transform.
transform = get_transform(model_type, transform_image_size)
# `image_encoder` / `text_encoder` are used for feature precomputation and prior-precision
# optimization; `vlm` later receives the covariances and produces the predictions.
image_encoder, text_encoder, vlm = load_model(model_str, device)

### Optimizing Prior Precision and Covariances  

This cell loads the Hessians for the image and text modalities and optimizes the prior precision (`λ`) of each modality by maximizing the marginal log-likelihood. The resulting covariance matrices are then passed to the model.

In [25]:
# load hessians
# `info` collects the pseudo-data counts and, below, the optimized prior precisions.
info = {'n_img': pseudo_data_count, 'n_txt': pseudo_data_count}
A_img, B_img = load_hessians(hessian_dir, tag='img', return_info=False)
A_txt, B_txt = load_hessians(hessian_dir, tag='txt', return_info=False)

# optimize prior precision based on marginal log-likelihood;
# both modalities use the same optimizer settings, image first, then text
for modality, projection_layer, hessian_A, hessian_B in (
    ('img', image_encoder.vision_projection, A_img, B_img),
    ('txt', text_encoder.text_projection, A_txt, B_txt),
):
    optimized_lambda = optimize_prior_precision(
        projection_layer,
        A=hessian_A,
        B=hessian_B,
        lmbda_init=1500,
        n=info[f'n_{modality}'],
        lr=1e-2,
        num_steps=300,
        device=device,
        verbose=True,
    )
    info[f'lambda_{modality}'] = optimized_lambda.item()

# report the pseudo-data counts and the optimized prior precisions
for info_key in ('n_img', 'n_txt', 'lambda_img', 'lambda_txt'):
    print(f"{info_key}:", info[info_key])

# pass the covariances to the model
cov_img, cov_txt = compute_covariances(A_img, B_img, A_txt, B_txt, info)
vlm.set_covariances(cov_img, cov_txt)

100%|██████████| 300/300 [00:11<00:00, 25.38it/s]
100%|██████████| 300/300 [00:06<00:00, 48.42it/s]

n_img: 10
n_txt: 10
lambda_img: 2997.875244140625
lambda_txt: 2538.09130859375





### Initializing the Data Module  

This cell creates a `DataModule` with the specified batch size, number of workers, and transforms. Only the test set is used for evaluation.

In [26]:
# create the data module
# The same preprocessing transform is used for the train and the test split.
factory_options = {
    'batch_size': batch_size,
    'num_workers': num_workers,
    'train_transform': transform,
    'test_transform': transform,
    'shuffle_train': True,
}
f = DataModuleFactory(**factory_options)

# build the data module for the configured dataset and prepare it
dm = f.create(dataset)
dm.setup()

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to C:\Users\ander\COCO_data\food101\food-101.tar.gz


100%|██████████| 4996278331/4996278331 [16:54<00:00, 4923087.71it/s]


Extracting C:\Users\ander\COCO_data\food101\food-101.tar.gz to C:\Users\ander\COCO_data\food101


### Precomputing Embeddings  

This cell precomputes image and text embeddings using the image and text encoders. Image features are extracted from the test dataset, while text features are computed for class prompts.

In [27]:
# precompute embeddings
# Feature extraction only: run everything with gradient tracking disabled.
with torch.no_grad():
    # image features (plus class ids and image ids) for the test split
    test_loader = dm.test_dataloader()
    image_features_test = precompute_image_features(image_encoder=image_encoder, loader=test_loader)
    image_outputs_test, image_class_ids_test, image_ids_test = image_features_test

    # text features for the class prompts
    label_outputs = precompute_text_features(
        text_encoder=text_encoder, class_prompts=dm.class_prompts, batch_size=batch_size,
    )

100%|██████████| 790/790 [1:02:31<00:00,  4.75s/it]
100%|██████████| 4/4 [00:14<00:00,  3.64s/it]


### Making Predictions  

This cell generates predictions using both the Bayesian VLM (`BayesVLM`) and the standard CLIP model (`MAP estimate`). The Bayesian variant accounts for uncertainty, while the MAP estimate represents the deterministic prediction. Both use the precomputed image and text embeddings for inference.

In [9]:
# make predictions for vanilla BayesVLM and vanilla CLIP (MAP estimate)
# Both calls share every argument except `map_estimate`.
shared_prediction_kwargs = dict(
    clip=vlm,
    image_outputs=image_outputs_test,
    text_outputs=label_outputs,
    batch_size=batch_size,
    device=device,
)

# BayesVLM: probabilistic logits
logits_bayesvlm = make_predictions(**shared_prediction_kwargs, map_estimate=False)

# CLIP: MAP estimate
logits_map = make_predictions(**shared_prediction_kwargs, map_estimate=True)

  0%|          | 0/790 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 790/790 [00:04<00:00, 165.82it/s]
  0%|          | 0/790 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 790/790 [00:03<00:00, 243.50it/s]


### Converting Logits to Probabilities  

This cell converts model logits into probabilities. For `BayesVLM`, the probit approximation (MacKay, 1992) is used to adjust for uncertainty before applying softmax. For the MAP estimate, probabilities are computed directly from the mean logits.

In [None]:
# convert probabilistic logits to probabilities using the probit approximation:
# the mean logits are scaled by kappa = 1 / sqrt(1 + pi/8 * variance) before the softmax
# Reference: David J. C. MacKay. Bayesian interpolation. Neural Computation, 4(3):415–447, 1992.
scaled_logit_var = torch.pi / 8 * logits_bayesvlm.var
kappa = 1 / torch.sqrt(1. + scaled_logit_var)
probas_bayesvlm = (kappa * logits_bayesvlm.mean).softmax(dim=-1)

# convert MAP logits to probabilities (plain softmax over the mean logits)
probas_map = logits_map.mean.softmax(dim=-1)

In [11]:
# evaluate the predictions
# one class per prompt; both methods are scored against the same test labels
num_classes = len(dm.class_prompts)

acc_bayesvlm, nlpd_bayesvlm, ece_bayesvlm = evaluate_prediction(
    prediction=probas_bayesvlm, label=image_class_ids_test, num_classes=num_classes,
)

acc_map, nlpd_map, ece_map = evaluate_prediction(
    prediction=probas_map, label=image_class_ids_test, num_classes=num_classes,
)

We report the zero-shot results on the `food-101` dataset in terms of accuracy (higher is better), negative log predictive density (NLPD, lower is better), and expected calibration error (ECE, lower is better). We compare the proposed method (BayesVLM) with the standard deterministic CLIP model, reported as MAP in the table.

In [27]:
# Accuracy and NLPD are per-sample arrays and are averaged here; ECE is already a scalar.
results_summary = {
    'acc_bayesvlm': acc_bayesvlm.mean(),
    'nlpd_bayesvlm': nlpd_bayesvlm.mean(),
    'ece_bayesvlm': ece_bayesvlm,
    'acc_map': acc_map.mean(),
    'nlpd_map': nlpd_map.mean(),
    'ece_map': ece_map,
}
print_results(**results_summary)

Metric          BayesVLM (ours)      MAP
------------  -----------------  -------
Accuracy (↑)            0.80317  0.80083
NLPD (↓)                0.68084  0.70533
ECE (↓)                 0.00829  0.03872
