# ImageBind - Inference Test

In [9]:
from typing import *

import torch
import torch.nn.functional as F

from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

In [5]:
# One row per concept: (caption, image file, audio file).
# The three lists below are the columns of this table, so index i of each
# list refers to the same concept.
_concept_samples = [
    ("A dog.", ".assets/dog_image.jpg", ".assets/dog_audio.wav"),
    ("A car", ".assets/car_image.jpg", ".assets/car_audio.wav"),
    ("A bird", ".assets/bird_image.jpg", ".assets/bird_audio.wav"),
]

# Transpose rows -> columns and materialise each column as a list.
text_list, image_paths, audio_paths = (
    list(column) for column in zip(*_concept_samples)
)

In [None]:
# Use the first CUDA device when torch reports one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Build the model with pretrained weights, put it in eval mode, then move it
# to the selected device.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

In [6]:
# Preprocess each modality on its own (text, then vision, then audio) ...
text_batch = data.load_and_transform_text(text_list, device)
vision_batch = data.load_and_transform_vision_data(image_paths, device)
audio_batch = data.load_and_transform_audio_data(audio_paths, device)

# ... and group the batches under the modality keys passed to the model.
inputs = {
    ModalityType.TEXT: text_batch,
    ModalityType.VISION: vision_batch,
    ModalityType.AUDIO: audio_batch,
}

# No gradients are needed for this forward pass.
with torch.no_grad():
    embeddings: Dict[str, torch.Tensor] = model(inputs)

# Show which modalities came back.
embeddings.keys()

dict_keys(['text', 'vision', 'audio'])

In [23]:
# Pull out the per-modality embedding batches first ...
text_embeddings = embeddings[ModalityType.TEXT]
vision_embeddings = embeddings[ModalityType.VISION]
audio_embeddings = embeddings[ModalityType.AUDIO]

# ... then split each batch into its three entries. The names follow the
# dog / car / bird order used in text_list, image_paths and audio_paths.
dog_text, car_text, bird_text = text_embeddings
dog_image, car_image, bird_image = vision_embeddings
dog_audio, car_audio, bird_audio = audio_embeddings

In [25]:
# Matched pair across modalities: "dog" text embedding vs. "dog" image embedding.
cosine_sim = F.cosine_similarity(dog_text, dog_image, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.23663438856601715


In [26]:
# Matched pair across modalities: "dog" text embedding vs. "dog" audio embedding.
cosine_sim = F.cosine_similarity(dog_text, dog_audio, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.12194865942001343


In [27]:
# Matched pair across modalities: "dog" image embedding vs. "dog" audio embedding.
cosine_sim = F.cosine_similarity(dog_image, dog_audio, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.1972985714673996


In [28]:
# Mismatched pair ("dog" text vs. "bird" audio), for comparison with the
# matched-pair similarities computed in the preceding cells.
cosine_sim = F.cosine_similarity(dog_text, bird_audio, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.044254571199417114


## Disparity vs Depth Maps

Example 1:

In [29]:
# The three renderings of sample 000. Their order in the list is significant:
# the following cell unpacks the embeddings positionally as
# (disparity, depth, img).
disparity_path = "/path/to/000_disparity.png"
depth_path = "/path/to/000_depth.png"
image_path = "/path/to/000.png"
my_image_paths: List[str] = [disparity_path, depth_path, image_path]

# Load data
my_vision_batch = data.load_and_transform_vision_data(my_image_paths, device)
my_inputs = {ModalityType.VISION: my_vision_batch}

# Vision-only forward pass; no gradients are needed.
with torch.no_grad():
    my_embeddings: Dict[str, torch.Tensor] = model(my_inputs)

In [34]:
# Split the vision batch into its three entries, named in the order the
# paths were listed in my_image_paths.
my_vision_embeddings = my_embeddings[ModalityType.VISION]
disparity, depth, img = my_vision_embeddings

In [35]:
# Similarity between the image embedding and the disparity-map embedding.
cosine_sim = F.cosine_similarity(img, disparity, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.6755827069282532


In [36]:
# Similarity between the image embedding and the depth-map embedding.
cosine_sim = F.cosine_similarity(img, depth, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.6359966397285461


NOTE: For this sample, the disparity map is more similar to the input image (cosine similarity ≈ 0.676) than the depth map is (≈ 0.636).

Example 2:

In [37]:
# The three renderings of sample 008. Their order in the list is significant:
# the following cell unpacks the embeddings positionally as
# (disparity, depth, img).
disparity_path = "/path/to/008_disparity.png"
depth_path = "/path/to/008_depth.png"
image_path = "/path/to/008.png"
my_image_paths: List[str] = [disparity_path, depth_path, image_path]

# Load data
my_vision_batch = data.load_and_transform_vision_data(my_image_paths, device)
my_inputs = {ModalityType.VISION: my_vision_batch}

# Vision-only forward pass; no gradients are needed.
with torch.no_grad():
    my_embeddings: Dict[str, torch.Tensor] = model(my_inputs)

In [38]:
# Split the vision batch into its three entries, named in the order the
# paths were listed in my_image_paths.
my_vision_embeddings = my_embeddings[ModalityType.VISION]
disparity, depth, img = my_vision_embeddings

In [39]:
# Similarity between the image embedding and the disparity-map embedding.
cosine_sim = F.cosine_similarity(img, disparity, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.8093360066413879


In [40]:
# Similarity between the image embedding and the depth-map embedding.
cosine_sim = F.cosine_similarity(img, depth, dim=-1)
# The `=` specifier prints the expression text followed by its value.
print(f"{cosine_sim.item()=}")

cosine_sim.item()=0.7142566442489624


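NOTE: For this sample, the disparity map is again more similar to the input image (cosine similarity ≈ 0.809) than the depth map is (≈ 0.714), and by a wider margin than in the previous example.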