## Визуализация работы FAISS

In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules are re-imported
# automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import json
# Absolute path of the project root, i.e. the parent of the current working directory.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register the project root on sys.path (only once) so project-local modules can be imported.
if root_path not in sys.path:
    sys.path.append(root_path)

In [3]:
import torch
from utils import parse_yaml
from models.clap_encoder import CLAP_Encoder
import faiss
import numpy as np

2024-04-29 15:03:01.250671: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-29 15:03:01.250702: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-29 15:03:01.251622: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-29 15:03:01.256175: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-29 15:03:04,542 - INFO - Loading faiss with A

In [4]:
# Input files used below; both paths are relative to the current working directory.
SS_CONFIG_PATH = "../config/audiosep_base.yaml"  # YAML config read with parse_yaml
CLAP_CKPT_PATH = "../checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt"  # CLAP encoder checkpoint

In [5]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on machines without CUDA
# (a hard-coded 'cuda' device makes the .to(device) call below fail there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Configuration parsed from the YAML file at SS_CONFIG_PATH.
configs = parse_yaml(SS_CONFIG_PATH)

# CLAP encoder built from the checkpoint, switched to eval mode and moved to the selected device.
query_encoder = CLAP_Encoder(pretrained_path=CLAP_CKPT_PATH).eval().to(device)

2024-04-29 15:03:04,847 - INFO - Loading HTSAT-base model config.
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [6]:
# Эмбеддинги, которые сохраняем в бд
saved_classes = ['vocal', 'drums', 'guitar', 'hippopotamus', 'roar', 'blender']
# Запросы, к которым будем искать ближайший класс из бд
query_classes = ['kick', 'ukulele', 'singing', 'howl', 'scream']

In [7]:
# Encode both lists of class names as text queries (stored classes first, then the lookup
# queries) and move the resulting embeddings to the CPU.
embeddings_to_save, embeddings_to_query = (
    query_encoder.get_query_embed(modality='text', text=class_names).cpu()
    for class_names in (saved_classes, query_classes)
)
# Show both shapes as the cell output.
embeddings_to_save.shape, embeddings_to_query.shape

(torch.Size([6, 512]), torch.Size([5, 512]))

In [8]:
# Take the index dimensionality from the stored embeddings instead of hard-coding 512, so the
# cell keeps working if the encoder's embedding size ever changes.
embedding_dim = embeddings_to_save.shape[1]
hnsw_m = 32  # second IndexHNSWFlat argument: the HNSW "M" (graph connectivity) parameter
index = faiss.IndexHNSWFlat(embedding_dim, hnsw_m)
index.add(embeddings_to_save)
# Ask for as many neighbours as there are stored classes, i.e. a full ranking per query.
distances, indices = index.search(embeddings_to_query, k=len(saved_classes))

In [9]:
# Display the distances returned by index.search (one row per query class).
distances

array([[1.0701145 , 1.3757946 , 1.4211205 , 1.4345753 , 1.5783528 ,
        1.6412584 ],
       [1.1907144 , 1.3707284 , 1.3740044 , 1.4348282 , 1.8569329 ,
        1.8919489 ],
       [0.5037271 , 0.9429968 , 0.98835295, 1.4366088 , 1.5765313 ,
        1.9686406 ],
       [0.9751475 , 0.9951929 , 1.2816312 , 1.4155016 , 1.5595326 ,
        1.8042557 ],
       [1.0844126 , 1.0952895 , 1.2217162 , 1.3817437 , 1.5443419 ,
        1.5597425 ]], dtype=float32)

In [10]:
# Display the neighbour positions returned by index.search; they index into saved_classes
# and are aligned element-wise with `distances`.
indices

array([[1, 0, 2, 4, 3, 5],
       [2, 0, 3, 1, 5, 4],
       [0, 1, 3, 2, 5, 4],
       [3, 0, 1, 2, 4, 5],
       [1, 3, 0, 2, 5, 4]])

In [11]:
# For every query print its ranked neighbours among saved_classes together with the distance
# margins between the best match and the second / last match.
for row, query_name in enumerate(query_classes):
    # (class name, distance) pairs in the order returned by the search.
    ranked = [
        (saved_classes[neighbour_id], neighbour_dist)
        for neighbour_id, neighbour_dist in zip(indices[row], distances[row])
    ]
    labelled = [f'{name}__{dist:.2f}' for name, dist in ranked]

    best_dist = ranked[0][1]
    margin_to_second = ranked[1][1] - best_dist
    margin_to_last = ranked[-1][1] - best_dist

    print(f'Class {query_name}, '
          f'nearest classes: {labelled}, '
          f'difference between top 1 and top2: {margin_to_second} '
          f'difference between top 1 and last: {margin_to_last}')

Class kick, nearest classes: ['drums__1.07', 'vocal__1.38', 'guitar__1.42', 'roar__1.43', 'hippopotamus__1.58', 'blender__1.64'], difference between top 1 and top2: 0.30568015575408936 difference between top 1 and last: 0.5711438655853271
Class ukulele, nearest classes: ['guitar__1.19', 'vocal__1.37', 'hippopotamus__1.37', 'drums__1.43', 'blender__1.86', 'roar__1.89'], difference between top 1 and top2: 0.1800140142440796 difference between top 1 and last: 0.7012345790863037
Class singing, nearest classes: ['vocal__0.50', 'drums__0.94', 'hippopotamus__0.99', 'guitar__1.44', 'blender__1.58', 'roar__1.97'], difference between top 1 and top2: 0.4392697215080261 difference between top 1 and last: 1.4649134874343872
Class howl, nearest classes: ['hippopotamus__0.98', 'vocal__1.00', 'drums__1.28', 'guitar__1.42', 'roar__1.56', 'blender__1.80'], difference between top 1 and top2: 0.02004539966583252 difference between top 1 and last: 0.8291082382202148
Class scream, nearest classes: ['drums__

In [12]:
# Sum of squared components (squared L2 norm) of the first stored embedding;
# a value of 1 means that this vector has unit length.
np.square(embeddings_to_save[0]).sum()

tensor(1.0000)

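Векторы нормализованы (норма равна 1), поэтому по неравенству треугольника евклидово расстояние между ними не превышает 2. При этом FAISS для метрики L2 возвращает квадрат евклидова расстояния, так что значения в `distances` лежат в диапазоне от 0 до 4.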

Видно, что энкодер правильно нашел ближайший класс для классов kick, ukulele и singing.

Однако для классов howl и scream энкодер не нашел класс roar, являющийся очевидным синонимом. Возможно, такого класса не было в словаре у энкодера.

Также отметим, что для каждого класса из запроса имелся только один правильный класс, сохраненный в базе данных, однако разница между топ-1 и топ-2 классами для неправильно определенных запросов (howl, scream) и для запроса ukulele оказалась незначительной, что может говорить о несовершенстве энкодера.

Также для запроса singing ближайший класс vocal лежит на расстоянии 0.5 (FAISS возвращает квадрат L2-расстояния). Это намного ближе, чем второе место, однако, если посмотреть на первые места по всем запросам, сложно определить правильный threshold (когда использовать адаптер, а когда базовую модель) для синонимичных классов.

Однако, исходя из того, что CLAP является SOTA-решением для определения близости captions в контексте звука, решено использовать в качестве эмбеддера именно его.

Также интересно проанализировать близость запросов к классам AudioSet.

In [34]:
audioset_classes_path = 'ontology.json'
# JSON files are UTF-8 by specification; pass the encoding explicitly instead of relying on
# the platform default.
with open(audioset_classes_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Names of all ontology entries, in file order.
original_names = [entry['name'] for entry in data]

# Also add a lower-cased variant of every name (the reason is discussed after the results
# further below). A variant that is already in the list is skipped, so no duplicate string
# is appended for names that are lower-case to begin with.
names = list(original_names)
known_names = set(original_names)
for original_name in original_names:
    lowered_name = original_name.lower()
    if lowered_name not in known_names:
        known_names.add(lowered_name)
        names.append(lowered_name)


In [35]:
# Encode every name on its own (one single-item text query per call), move each result to the
# CPU, then stack the results and drop the per-call axis at position 1.
per_name_embeddings = [
    query_encoder.get_query_embed(modality='text', text=[name]).cpu()
    for name in names
]
embeddings_to_save = np.asarray(per_name_embeddings).squeeze(1)

In [36]:
# np.asarray leaves an ndarray untouched and converts other array-likes before the shape is read.
audioset_embeddings = np.asarray(embeddings_to_save)
# Take the index dimensionality from the data instead of hard-coding 512.
index = faiss.IndexHNSWFlat(audioset_embeddings.shape[1], 32)
index.add(audioset_embeddings)
# Look up the three nearest stored names for every query.
distances, indices = index.search(embeddings_to_query, k=3)

In [37]:
# For every query print its ranked neighbours among `names` together with the distance
# margins between the best match and the second / last match.
for row, query_name in enumerate(query_classes):
    # (name, distance) pairs in the order returned by the search.
    ranked = [
        (names[neighbour_id], neighbour_dist)
        for neighbour_id, neighbour_dist in zip(indices[row], distances[row])
    ]
    labelled = [f'{name}__{dist:.2f}' for name, dist in ranked]

    best_dist = ranked[0][1]
    margin_to_second = ranked[1][1] - best_dist
    margin_to_last = ranked[-1][1] - best_dist

    print(f'Class {query_name}, '
          f'nearest classes: {labelled}, '
          f'difference between top 1 and top2: {margin_to_second} '
          f'difference between top 1 and last: {margin_to_last}')

Class kick, nearest classes: ['thump, thud__0.79', 'whack, thwack__0.84', 'Thump, thud__0.86'], difference between top 1 and top2: 0.04906284809112549 difference between top 1 and last: 0.06486690044403076
Class ukulele, nearest classes: ['ukulele__0.00', 'Ukulele__0.19', 'Mandolin__0.68'], difference between top 1 and top2: 0.1898217350244522 difference between top 1 and last: 0.6820544004440308
Class singing, nearest classes: ['singing__0.00', 'Singing__0.18', 'vocal music__0.47'], difference between top 1 and top2: 0.18427009880542755 difference between top 1 and last: 0.47043418884277344
Class howl, nearest classes: ['howl__0.00', 'yawn__0.48', 'hoot__0.59'], difference between top 1 and top2: 0.4791603088378906 difference between top 1 and last: 0.5930732488632202
Class scream, nearest classes: ['screaming__0.37', 'battle cry__0.44', 'yell__0.49'], difference between top 1 and top2: 0.07813376188278198 difference between top 1 and last: 0.12256881594657898


Замечаем, что CLAP чувствителен к регистру (case sensitive): ukulele и Ukulele — разные векторы. Видимо, при его обучении captions не приводили к нижнему регистру.

Также видим, что если пространство классов большое, то качество top-k классификации хорошее.