## Downloading Binary Assets

How to make this run:
- Delete any existing `.pt` and `.gz` files in `../assets/`, then download the following files:

In [None]:
# Download the vocabulary archive (bpe_simple_vocab_16e6.txt.gz) and the two AudioCLIP .pt files
# (full and partial training) from the v0.1 release; `-P ../assets/` tells wget to save them in ../assets/.
! wget -P ../assets/ https://github.com/AndreyGuzhov/AudioCLIP/releases/download/v0.1/bpe_simple_vocab_16e6.txt.gz
! wget -P ../assets/ https://github.com/AndreyGuzhov/AudioCLIP/releases/download/v0.1/AudioCLIP-Full-Training.pt
! wget -P ../assets/ https://github.com/AndreyGuzhov/AudioCLIP/releases/download/v0.1/AudioCLIP-Partial-Training.pt

## Imports & Constants

In [1]:
import os
import sys
import glob

import librosa
import librosa.display

import simplejpeg
import numpy as np

import torch
import torchvision as tv

import matplotlib.pyplot as plt

from PIL import Image
from IPython.display import Audio, display

sys.path.append(os.path.abspath(f'{os.getcwd()}/..'))

from model import AudioCLIP
from utils.transforms import ToTensor1D


# Inference only: switch autograd off globally for the rest of the notebook
torch.set_grad_enabled(False)

# File name of the pretrained model to load
MODEL_FILENAME = 'AudioCLIP-Full-Training.pt'

# Audio settings (derived from ESResNeXt)
SAMPLE_RATE = 44100

# Image settings (derived from CLIP)
IMAGE_SIZE = 224
IMAGE_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGE_STD = (0.26862954, 0.26130258, 0.27577711)

# Default candidate text labels
LABELS = ['cat', 'thunderstorm', 'coughing', 'alarm clock', 'car horn']

## Model Instantiation

In [2]:
# Build AudioCLIP from the pretrained file named by MODEL_FILENAME inside ../assets/
aclp = AudioCLIP(pretrained=f'../assets/{MODEL_FILENAME}')
# Transform applied to every raw waveform before the clips are stacked into a batch
audio_transforms = ToTensor1D()

## Preprocessing and Inference

In [52]:
def get_aclip_top_k_similarities(input_text, paths_to_audio, top_k):
    """
    Print, for every audio clip, the top_k text labels ranked by AudioCLIP audio-text similarity.

    input_text: candidate labels (list of strings, or a single string) whose similarity with the audio clips is computed
    paths_to_audio: paths to the audio files; each clip is loaded at SAMPLE_RATE and shown as a playable widget
    top_k: number of softmaxed similarity scores that are visible in the output (capped at the number of labels)

    Returns nothing; all results are printed / displayed.
    Raises ValueError when input_text or paths_to_audio is empty.
    """
    # A bare string would otherwise be iterated character by character
    labels = [input_text] if isinstance(input_text, str) else list(input_text)
    paths = list(paths_to_audio)

    if not labels:
        raise ValueError('input_text must contain at least one label')
    if not paths:
        raise ValueError('paths_to_audio is empty - check the glob pattern and the working directory')

    # Load audio (librosa resamples every clip to SAMPLE_RATE)

    tracks = [librosa.load(path, sr=SAMPLE_RATE, dtype=np.float32)[0] for path in paths]

    # Print labels

    print("Text inputs: {} \n".format(labels) )

    # Make audio playable and print

    print("Audio inputs: \n")

    for path, track in zip(paths, tracks):
        print(os.path.basename(path))
        display(Audio(track, rate=SAMPLE_RATE, embed=True))

    # Transform inputs

    # AudioCLIP handles raw audio on input, so the input shape is [batch x channels x duration].
    # torch.stack needs equally sized tensors, so shorter clips are zero-padded at the end
    # (a no-op when all clips already have the same length).
    longest = max(track.shape[-1] for track in tracks)
    audio = torch.stack([
        audio_transforms(np.pad(track, (0, longest - track.shape[-1])).reshape(1, -1))
        for track in tracks
    ])
    # textual input is processed internally, so no need to transform it beforehand
    text = [[label] for label in labels]

    # Embed inputs

    ((audio_features, _, _), _), _ = aclp(audio=audio)
    ((_, _, text_features), _), _ = aclp(text=text)

    # Rescale and compute logits: L2-normalise both embeddings, then scale their dot products
    audio_features = audio_features / torch.linalg.norm(audio_features, dim=-1, keepdim=True)
    text_features = text_features / torch.linalg.norm(text_features, dim=-1, keepdim=True)
    scale_audio_text = torch.clamp(aclp.logit_scale_at.exp(), min=1.0, max=100.0)
    logits_audio_text = scale_audio_text * audio_features @ text_features.T

    # Present results

    print('\t\tFilename, Audio\t\t\tTextual Label (Confidence)', end='\n\n')

    # calculate model confidence (softmax over the labels, separately for each clip)
    confidence = logits_audio_text.softmax(dim=1)
    # topk raises when asked for more entries than there are labels
    k = min(top_k, len(labels))
    for path, clip_confidence in zip(paths, confidence):
        # acquire Top-k most similar results
        conf_values, ids = clip_confidence.topk(k)

        # format output strings
        query = f'{os.path.basename(path):>30s} ->\t\t'
        results = ', '.join(
            f'{labels[i]:>15s} ({v:06.2%})' for v, i in zip(conf_values.tolist(), ids.tolist())
        )

        print(query + results)

## AudioCLIP Demo example (ESC50)

In [None]:
# Candidate labels for the ESC50 demo clips
captions_esc50 = ['cat', 'thunderstorm', 'coughing', 'alarm clock', 'car horn']
# glob returns matches in arbitrary order; sort them so the printed results are reproducible
path_to_esc50_audio = sorted(glob.glob('audio/ESC50/*.wav'))
get_aclip_top_k_similarities(input_text=captions_esc50, paths_to_audio=path_to_esc50_audio, top_k=3)

## Clotho v2.1
Raw Clotho captions (whole sentences).

In [None]:
# Dedicated names so this cell does not overwrite the variables of the ESC50 demo cell.
# NOTE(review): these are still the ESC50 class labels, used as placeholders -
# replace them with the raw Clotho captions (whole sentences) once the data is in place.
captions_clotho = ['cat', 'thunderstorm', 'coughing', 'alarm clock', 'car horn']
# glob returns matches in arbitrary order; sort them so the printed results are reproducible
paths_to_clotho_audio = sorted(glob.glob('audio/clotho_v2.1/*.wav'))
get_aclip_top_k_similarities(input_text=captions_clotho, paths_to_audio=paths_to_clotho_audio, top_k=3)

**TODO**
- Make this notebook available on GitHub
- Bring in the Clotho data