In [None]:
!pip install wav2clip
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import numpy as np
import torch
import librosa
from PIL import Image
import wav2clip
import clip

In [None]:
# Optional (Google Colab): uncomment the two lines below to mount Google Drive
# at /content/drive. The AUDIO_PATH / IMAGE_PATH values configured in the next
# cell point below /content/drive, so they only resolve once Drive is mounted
# (or after the paths are changed to another location).
# from google.colab import drive
# drive.mount('/content/drive')

In [10]:
# Directories that hold the demo audio clips and images.
# Edit both values so they point at your own copies of the files.
AUDIO_PATH = "/content/drive/Othercomputers/My MacBook Pro/GitHub/wav2clip/audio"
IMAGE_PATH = "/content/drive/Othercomputers/My MacBook Pro/GitHub/wav2clip/image"

# Use the GPU whenever PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Audio encoder from wav2clip, created with its default arguments
# (note: `device` is not forwarded to it here).
audio_model = wav2clip.get_model()

# CLIP ViT-B/32 model placed on `device`, together with the image
# preprocessing transform returned by `clip.load`.
language_and_vision_model, preprocess = clip.load("ViT-B/32", device=device)

# Audio

In [11]:
# Locations of the two demo clips (different sound classes).
bicycle_wav = f"{AUDIO_PATH}/bicycle.wav"
camera_wav = f"{AUDIO_PATH}/camera.wav"

# No `sr` argument is passed, so librosa's default sampling rate applies.
# NOTE(review): confirm that this rate matches what the wav2clip model expects.
audio_1, sr = librosa.load(bicycle_wav)
audio_2, sr = librosa.load(camera_wav)

# Embed each waveform with the wav2clip audio encoder.
audio_features_1 = wav2clip.embed_audio(audio_1, audio_model)
audio_features_2 = wav2clip.embed_audio(audio_2, audio_model)

print(audio_features_1.shape)

(1, 512)


# Language

In [12]:
# Text prompts naming the same two concepts as the audio clips.
text_1 = "bicycle"
text_2 = "camera"

# Inference only: disable autograd so no computation graph is recorded for the
# encoder forward pass and the in-place normalisation below operates on plain
# tensors instead of graph-tracked ones.
with torch.no_grad():
    text_features_1 = language_and_vision_model.encode_text(clip.tokenize(text_1).to(device))
    text_features_2 = language_and_vision_model.encode_text(clip.tokenize(text_2).to(device))

    # Scale each embedding to unit L2 norm along the feature dimension.
    text_features_1 /= text_features_1.norm(dim=-1, keepdim=True)
    text_features_2 /= text_features_2.norm(dim=-1, keepdim=True)

print(text_features_1.shape)

torch.Size([1, 512])


# Vision

In [13]:
# Images showing the same two concepts as the audio clips.
image_1 = Image.open(f"{IMAGE_PATH}/bicycle.jpg")
image_2 = Image.open(f"{IMAGE_PATH}/camera.jpg")

# Inference only: disable autograd so no computation graph is recorded for the
# encoder forward pass and the in-place normalisation below operates on plain
# tensors instead of graph-tracked ones.
with torch.no_grad():
    # `preprocess` yields a single image tensor; `unsqueeze(0)` adds the batch
    # dimension before the tensor is moved to `device` and encoded.
    image_features_1 = language_and_vision_model.encode_image(preprocess(image_1).unsqueeze(0).to(device))
    image_features_2 = language_and_vision_model.encode_image(preprocess(image_2).unsqueeze(0).to(device))

    # Scale each embedding to unit L2 norm along the feature dimension.
    image_features_1 /= image_features_1.norm(dim=-1, keepdim=True)
    image_features_2 /= image_features_2.norm(dim=-1, keepdim=True)

print(image_features_1.shape)

torch.Size([1, 512])


In [14]:
def sim(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Both inputs are converted to float64 arrays before any arithmetic, so
    low-precision inputs (e.g. float16) are not accumulated in their own dtype.

    Args:
        x: 1-D array-like of numbers.
        y: 1-D array-like of numbers with the same length as ``x``.

    Returns:
        ``dot(x, y) / (||x|| * ||y||)``. If either vector has zero norm the
        cosine is undefined; ``0.0`` is returned in that case instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    denom = np.linalg.norm(x) * np.linalg.norm(y)
    if denom == 0.0:
        # At least one vector is all zeros: avoid the 0/0 division.
        return 0.0
    return np.dot(x, y) / denom

# Drop the leading batch axis of each embedding so `sim` receives flat vectors.
# The torch tensors are detached and moved to the CPU before conversion.
audio = np.asarray(audio_features_1).squeeze()
audio_diff = np.asarray(audio_features_2).squeeze()
text = np.asarray(text_features_1.detach().cpu()).squeeze()
image = np.asarray(image_features_1.detach().cpu()).squeeze()

# Each entry pairs a report label with the embedding compared against the
# bicycle audio embedding.
comparisons = (
    # Audio (bicycle) vs. Audio (camera): different semantics, same modality
    ("audio-audio", audio_diff),
    # Audio (bicycle) vs. Text (bicycle): same semantics, different modality
    ("audio-text", text),
    # Audio (bicycle) vs. Image (bicycle): same semantics, different modality
    ("audio-image", image),
)

for label, other in comparisons:
    print(f"{label} similarity: {sim(audio, other)}")

audio-audio similarity: 0.4355418384075165
audio-text similarity: 0.18269288539886475
audio-image similarity: 0.08233125507831573
