# Project 2: Zero-Shot Keyword Spotting (KWS) using ImageBind
Written by: Gabriel Enrique N. Kaquilala (2019-06319)

Note: place this notebook in the root directory of a local clone of the ImageBind repository
(git clone https://github.com/facebookresearch/ImageBind)

In [19]:
# Install the Python requirements into the kernel's environment.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# Restart the kernel after this cell finishes so the newly installed packages are picked up.
%pip install torch==1.13 torchvision==0.14 torchaudio==0.13
# pytorchvideo is installed from a pinned git commit.
%pip install pytorchvideo@git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
%pip install timm==0.6.7
%pip install ftfy regex einops fvcore decord==0.6.0 iopath
# Notebook-side helpers: pydub for slicing audio, ipywebrtc / ipywidgets for recording and playback.
%pip install matplotlib types-regex mayavi cartopy pydub ipywebrtc ipywidgets
# ffmpeg is a system package. `sudo` prompts for a password, which the notebook cannot supply,
# so this line may stall at the prompt; in that case install ffmpeg from a terminal instead.
!sudo apt install ffmpeg # be sure to install ffmpeg externally with a terminal if you cannot enter your password.

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting pytorchvideo@ git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Using cached pytorchvideo-0.1.5-py3-none-any.whl
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
[sudo] password for gabriel: 


In [1]:
# import necessary modules
from torch import Tensor
import torch, torchaudio
import data
from models import imagebind_model
from models.imagebind_model import ModalityType
import os
import numpy as np
from torchaudio.datasets import SPEECHCOMMANDS
from pydub import AudioSegment
from pydub.utils import make_chunks
import random
from ipywidgets import Audio
from ipywebrtc import CameraStream, AudioRecorder



Main Code Area: Data Preparation

In [2]:
# Build the evaluation set as two parallel lists:
#   validation_paths -- absolute paths of the audio clips
#   val_labels       -- ground-truth label of each clip
# The set is the Speech Commands validation split, plus chunks of background noise
# labelled 'silence', plus randomly drawn training clips (the "unknown" portion).
root = os.getcwd()
training_split = torchaudio.datasets.SPEECHCOMMANDS(root=root,download=True,subset='training') # necessary to generate "unknown" split
validation_split = torchaudio.datasets.SPEECHCOMMANDS(root=root,download=True,subset='validation')
labels = ['unknown','silence','right', 'eight', 'cat', 'tree', 'backward', 'learn', 'bed', 
'happy', 'go', 'dog', 'no', 'wow', 'follow', 'nine', 'left', 
'stop', 'three', 'sheila', 'one', 'bird', 'zero', 'seven', 'up',
 'visual', 'marvin', 'two', 'house', 'down', 'six', 'yes', 
 'on', 'five', 'forward', 'off', 'four']

# get validation split:
validation_paths, val_labels = [], []

# Iterate over the real length of the split instead of a hard-coded 9981.
for i in range(len(validation_split)):
    # Fetch the metadata once per item; index 0 is the path relative to
    # <root>/SpeechCommands and index 2 is the label.
    metadata = validation_split.get_metadata(i)
    validation_paths.append(os.path.join(root,"SpeechCommands",metadata[0]))
    val_labels.append(metadata[2])

# Loading the silence and unknown datasets:

# Note: the "unknown" set holds (sizeof(validation) // 35) clips, following
# https://github.com/roatienza/Deep-Learning-Experiments/blob/master/versions/2022/supervised/python/kws_demo.ipynb

# Silence is from the _background_noise_ folder:
noise_dir = os.path.join(root, "SpeechCommands","speech_commands_v0.02","_background_noise_")
# Sorted so the order of the generated clips does not depend on os.listdir's arbitrary ordering.
silence_paths = sorted(os.path.join(noise_dir,p) for p in os.listdir(noise_dir) if p.endswith('.wav'))

# The chunks are written to <root>/edited; create it so the export works on a fresh checkout.
edited_dir = os.path.join(root,'edited')
os.makedirs(edited_dir, exist_ok=True)

# Slice every background-noise recording into 2.5 s chunks; each chunk becomes one
# 'silence' sample. The number of chunks depends on the length of the recordings.
silence_audios, captions = [], []
for noise_path in silence_paths:
    audio = AudioSegment.from_file(noise_path,"wav")
    chunks = make_chunks(audio, 2500) # chunk length is given in milliseconds
    _, fname = os.path.split(noise_path)

    for i, chunk in enumerate(chunks):
        # Each chunk is exported exactly as sliced (same sample rate and channels as the
        # source recording). The previous 44.1 kHz / stereo conversion was computed but
        # never exported, so it has been removed as dead code.
        chunk_path = os.path.join(edited_dir, f"{fname}{i}.wav")
        chunk.export(chunk_path,format='wav')
        silence_audios.append(chunk_path)
        captions.append('silence')

validation_paths += silence_audios # concatenate silence to validation paths
val_labels += captions

# "Unknown" samples are random clips drawn from the training split.
# sizeof(unknown) = sizeof(validation) // 35, i.e. 285 for the 9981-clip v0.02 validation split.
# NOTE(review): each clip keeps its real word label here, so no sample is ever labelled
# 'unknown' even though 'unknown' is a candidate in `labels` -- confirm this is intended.
num_unknown = len(validation_split) // 35
unknown_paths, unknown_labels = [], []
for _ in range(num_unknown):

    # randrange excludes the upper bound, so every index is valid for the training split.
    x = random.randrange(len(training_split))
    metadata = training_split.get_metadata(x)
    unknown_paths.append(os.path.join(root,"SpeechCommands",metadata[0]))
    unknown_labels.append(metadata[2])

del training_split # free up RAM

validation_paths += unknown_paths
val_labels += unknown_labels

# Paths and labels are parallel lists, so they must have the same length.
assert len(validation_paths) == len(val_labels), "paths and labels are out of sync"
print(len(validation_paths)) # should be 10428
print(len(val_labels)) # also should be 10428


10428
10428


Main Code Area: Inference

In [5]:
# Classify one randomly chosen clip of the evaluation set: embed the clip and every
# candidate label with ImageBind, then pick the label whose text embedding scores
# highest against the audio embedding.
text_list = labels
audio_paths = validation_paths

# Inference runs on the CPU. To use a GPU when available, use the commented line instead:
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu" # keep this if CUDA runs out of memory when running the evaluator

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# randrange excludes the upper bound; random.randint(0, 10428) could return 10428,
# which is one past the last valid index.
q = random.randrange(len(audio_paths))
# Load data
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.AUDIO: data.load_and_transform_audio_data([audio_paths[q]], device),
}

audio_widget = Audio.from_file(audio_paths[q]) # used to play randomly-selected audio

with torch.no_grad():
    embeddings = model(inputs)

# we are only interested in Audio x Text since it is what we have
# Row 0 is the single audio clip; its columns are the candidate labels.
scores = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)[0]
# Tensor.cpu() works on both devices (it is a no-op on the CPU); `.to_cpu()` does not exist.
result = np.argmax(scores.cpu().numpy())

print('Predicted Label:',labels[result])
print('Ground Truth:',val_labels[q])
audio_widget



Predicted Label: up
Ground Truth: yes


Audio(value=b'RIFF$}\x00\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x80>\x00\x00\x00}\x00\x00\x02\x00\x10\x00…

User Input:
In this code block, you can try out the KWS function by recording your own voice and using it as an input.


In [3]:
# Request an audio-only media stream and attach a recorder widget to it.
# The keyword is `constraints`; the previous spelling `constrains` is not a CameraStream
# trait, so the audio-only request was not applied.
camera = CameraStream(constraints = {'audio':True,'video':False})
recorder = AudioRecorder(stream=camera)
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [4]:
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!ffmpeg -i recording.webm -ac 1 -f wav file.wav -y -hide_banner -loglevel panic

text_list = labels

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Load data
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.AUDIO: data.load_and_transform_audio_data(['file.wav'], device),
}

with torch.no_grad():
    embeddings = model(inputs)

# we are only interested in Audio x Text since it is what we have
if device == 'cpu':
    result = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)[0].numpy(),
if device == 'cuda':
    result = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)[0].to_cpu().numpy(), # move data to CPU if used CUDA
result = np.argmax(result)

print('Predicted Label:',labels[result])

Predicted Label: learn


Evaluation Script:

In [None]:
# Zero-shot accuracy over the first `data_points` clips of the evaluation set.
correct_answers = 0
data_points = 10 # Set data points here. Maximum of 10428
text_list = labels
audio_paths = validation_paths

# Fail early with a clear message instead of an IndexError part-way through the loop
# (too many data points) or a ZeroDivisionError at the end (zero data points).
if not 0 < data_points <= len(audio_paths):
    raise ValueError(f"data_points must be between 1 and {len(audio_paths)}, got {data_points}")

# Inference runs on the CPU. To use a GPU when available, use the commented line instead:
#device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu" # keep this if CUDA runs out of memory when running the evaluator

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# The candidate labels are the same for every clip, so tokenise and embed them once
# instead of repeating that work on every iteration.
with torch.no_grad():
    text_embeddings = model({
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    })[ModalityType.TEXT]

for i in range(data_points):
    print(f"Progress: [{i}/{data_points}]")
    # Load data
    audio_inputs = {
        ModalityType.AUDIO: data.load_and_transform_audio_data([audio_paths[i]], device),
    }

    with torch.no_grad():
        audio_embeddings = model(audio_inputs)[ModalityType.AUDIO]

    # we are only interested in Audio x Text since it is what we have
    # Row 0 is the current clip; its columns are the candidate labels.
    scores = torch.softmax(audio_embeddings @ text_embeddings.T, dim=-1)[0]
    # Tensor.cpu() works on both devices (it is a no-op on the CPU); `.to_cpu()` does not exist.
    result = np.argmax(scores.cpu().numpy())

    if labels[result] == val_labels[i]:
        correct_answers += 1

print(f"Model Accuracy [{data_points}]:",correct_answers / data_points) # Code block was not run to full length with student's computer due to limited resources

SOTA LEADERBOARD (Google Speech Commands V2 35)
Source: https://paperswithcode.com/sota/keyword-spotting-on-google-speech-commands?metric=Google%20Speech%20Commands%20V2%2035

| Model Name | Scores | Learning | Evaluation Type | 
| --- | --- | --- | --- |
| M2D | 98.5 | Self-Supervised | Unknown | 
| EAT-S | 98.15 | Unknown | Unknown |
| Audio Spectrogram Transformer | 98.11 | Unknown | Unknown |
| HTS-AT | 98.0 | Unknown | Unknown |
| KWT2 | 97.74 +- 0.03 | Unknown | Unknown |