In [1]:
import numpy as np
from itertools import combinations

import sys
import os

current_dir = os.getcwd()
root_path = os.path.abspath(os.path.join(current_dir, ".."))
if root_path not in sys.path:
    sys.path.append(root_path)

asset_path = os.path.join(root_path, ".assets")

import imagebind
import torch
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

import json

import pickle

  import pkg_resources
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

modalities = ['audio', 'text', 'vision']

embeddings = {}

for mod in modalities:

    with open(f'{asset_path}/{mod}_embeddings.pt', 'rb') as f:
        embeddings[mod] = torch.load(f, map_location=torch.device(device))

In [3]:
embeddings['audio'] @ embeddings['text'].T

tensor([[242.3930,  15.4272, -46.2168, -45.8045],
        [170.0346, 360.0821, 145.9951, 244.5030],
        [ 84.4459, 121.2368, 431.2551, 127.0590],
        [138.4100, 156.5895,  72.2134, 402.9741]])

In [4]:
torch.softmax(embeddings['vision'] @ embeddings['audio'].T, dim=1)

tensor([[0.7548, 0.1018, 0.0788, 0.0647],
        [0.0967, 0.7359, 0.1007, 0.0666],
        [0.0018, 0.0022, 0.9950, 0.0010],
        [0.0165, 0.2635, 0.0683, 0.6517]])

In [5]:
embeddings = {}

for mod in modalities:

    with open(f'{asset_path}/{mod}_embeddings.json', 'rb') as f:
        embeddings[mod] = json.load(f)

embeddings

{'audio': {'dog': [-0.02819967269897461,
   -0.4922945499420166,
   1.005745530128479,
   0.04589039087295532,
   -0.22710458934307098,
   0.13337408006191254,
   -0.5388205051422119,
   0.1496202051639557,
   0.23182174563407898,
   -0.4788662791252136,
   -0.5681620836257935,
   -0.6318808794021606,
   -0.34380674362182617,
   -0.012335279956459999,
   -0.1181596964597702,
   -0.7550856471061707,
   0.41608819365501404,
   0.024928629398345947,
   0.6680633425712585,
   -0.5031748414039612,
   1.2219865322113037,
   -0.621772825717926,
   0.2589329779148102,
   0.6140143275260925,
   0.11791195720434189,
   -0.16019199788570404,
   0.37186765670776367,
   0.5945402383804321,
   -0.5382906198501587,
   -0.2005603164434433,
   0.40971383452415466,
   -0.7161712646484375,
   -0.8356794714927673,
   0.5807343125343323,
   -0.1632028967142105,
   0.019906889647245407,
   -0.15306463837623596,
   -0.3003773093223572,
   -0.11343719065189362,
   0.9933449029922485,
   -0.20172002911567688,


In [14]:
selected_objects = ['car', 'dog']

products = {}

audio_tensor = torch.tensor([v for k,v in embeddings['audio'].items() if k in selected_objects])
text_tensor = torch.tensor([v for k,v in embeddings['text'].items() if k in selected_objects])
vision_tensor = torch.tensor([v for k,v in embeddings['vision'].items() if k in selected_objects])


products['VT'] = torch.softmax(vision_tensor @ text_tensor.T, dim=1)
products['AT'] = torch.softmax(vision_tensor @ audio_tensor.T, dim=1)
products['VA'] = torch.softmax(vision_tensor @ audio_tensor.T, dim=1)

products

{'VT': tensor([[9.9763e-01, 2.3694e-03],
         [3.3837e-05, 9.9997e-01]]),
 'AT': tensor([[0.8812, 0.1188],
         [0.1162, 0.8838]]),
 'VA': tensor([[0.8812, 0.1188],
         [0.1162, 0.8838]])}