In [1]:
import os
import gc
import sys
import yaml
import time
import glob
#import h5py
import wave
import faiss
#import click
#import curses
import random
import librosa

import numpy as np
import pandas as pd
import deepdish as dd
import tensorflow as tf
#from pydub import AudioSegment

from model_RA.fp_RA.nnfp import get_fingerprinter
#from model_RA.utils.dataloader_keras import genUnbalSequence
from model_RA.fp_RA.melspec.melspectrogram_RA import get_melspec_layer

2024-07-09 17:14:59.828057: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Pipeline

## 1. Construção do BD Vetorial

1) Carregar os vetores embedded
2) Criar a instância do Faiss
3) Carregar o Faiss com os Dados

## 2. Leitura do Modelo Neural

1) Carrega a classe do modelo
2) ler o modelo

## 3. Predição

1) Receber o dado (áudio query)
2) calcular o embedded - model.predict
3) buscar o índice no BD vetorial
4) recuperar os metadados associados ao índice (nome da música, etc)
5) apresentar ao cliente

# Construção Demo

## 1. Construção do BD Vetorial

In [2]:
"""
def load_H5_Files(h5Files):
    #evitar a sobrecarga de memória de armazenar tudo em uma lista antes de converter para um array.

    embs_count = 0
    embs_info = []
    embs = []
    music_names = []

    for i_file, file in enumerate(h5Files):
        #obter o nome do ficheiro
        base_name = os.path.splitext(os.path.basename(file))[0]
        embs.append(dd.io.load(file))

        #Contar o numero de vetores até ao momento
        embs_count += len(embs[0])

        #guarda numa lista o número de vetores até o momento com as suas infos
        # embs_info = [indice, file_name, n_segs]
        embs_info.append([i_file, base_name, embs_count])

        # a cada vetor a musica é-lhe atribuido o "nome"/"codigo" da mesma. [v0,v1,v2,v3] * '000003'
        music_names.extend([base_name] * len(embs[0]))

    return embs, embs_info, music_names
"""

'\ndef load_H5_Files(h5Files):\n    #evitar a sobrecarga de memória de armazenar tudo em uma lista antes de converter para um array.\n\n    embs_count = 0\n    embs_info = []\n    embs = []\n    music_names = []\n\n    for i_file, file in enumerate(h5Files):\n        #obter o nome do ficheiro\n        base_name = os.path.splitext(os.path.basename(file))[0]\n        embs.append(dd.io.load(file))\n\n        #Contar o numero de vetores até ao momento\n        embs_count += len(embs[0])\n\n        #guarda numa lista o número de vetores até o momento com as suas infos\n        # embs_info = [indice, file_name, n_segs]\n        embs_info.append([i_file, base_name, embs_count])\n\n        # a cada vetor a musica é-lhe atribuido o "nome"/"codigo" da mesma. [v0,v1,v2,v3] * \'000003\'\n        music_names.extend([base_name] * len(embs[0]))\n\n    return embs, embs_info, music_names\n'

In [3]:
"""
def load_H5_Files_array(h5Files):
    embs = []
    music_names = []

    for idx_file, file in enumerate(h5Files):
        embs.append(dd.io.load(file))

    return np.concatenate(embs, axis=0)
"""

'\ndef load_H5_Files_array(h5Files):\n    embs = []\n    music_names = []\n\n    for idx_file, file in enumerate(h5Files):\n        embs.append(dd.io.load(file))\n\n    return np.concatenate(embs, axis=0)\n'

In [4]:
def create_index(db_embeddings, nogpu=True, n_centroids=256, code_sz=64, nbits=8):
    #faiss.IndexIVFPQ(quantizer, d, n_centroids, code_sz, nbits), com d=, nlist=n_centroids=50, m=code_sz=8, bits=nbits=8
    #faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)
    #n_centroids -> clusters

    # GPU Setup
    if not nogpu:
        GPU_RESOURCES = faiss.StandardGpuResources()
        GPU_OPTIONS = faiss.GpuClonerOptions()
        GPU_OPTIONS.useFloat16 = True # use float16 table to avoid https://github.com/facebookresearch/faiss/issues/1178
    else:
        pass
    

    # Fingerprint dimension, d
    d = db_embeddings.shape[1] #len(db_embeddings[0][0]) #db_embeddings.shape[1]  # Dim emb #len(db_embeddings[0][0][0])


    # Build a flat (CPU) index
    quantizer = faiss.IndexFlatL2(d)


    # Using IVF-PQ index
    code_sz = 64 # power of 2
    n_centroids = 256 # Number of Veronoi Cells (?)
    nbits = 8  # nbits must be 8, 12 or 16, The dimension d should be a multiple of M.
    index = faiss.IndexIVFPQ(quantizer, d, n_centroids, code_sz, nbits) #Adicona clustering


    # Se não usar GPU
    if not nogpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)


    # Train index
    if not index.is_trained:
        #sel_tr_idx = np.random.permutation(len(db_embeddings))
        #sel_tr_idx = sel_tr_idx[:1e7]
        #index.train(db_embeddings[sel_tr_idx,:])
        index.train(db_embeddings)
    

    # N probe
    index.nprobe = 40
    return index

### Carrega os embeddings da base de dados (h5 files)

In [5]:
def def_files(files, train_num):
    embs = []

    for idx_file, file in enumerate(files):
        embs.append(dd.io.load(file))

    embs = np.concatenate(embs, axis=0)
    sel_tr_idx = np.random.permutation(len(embs))
    sel_tr_idx = sel_tr_idx[:train_num]

    return embs, sel_tr_idx

In [6]:
source_dir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/'
h5Files = sorted(glob.glob(os.path.join(source_dir, '**/*.h5') ,recursive = True))


mid_file = len(h5Files)//2
h5Files_1 = h5Files[:mid_file]
h5Files_2 = h5Files[mid_file:]
files = [h5Files_1, h5Files_2]



max_train_num = int(1e7)
idx_1_train = idx_2_train = np.empty((max_train_num//2, 128))


embs_1, idx_1_train = def_files(files[0], max_train_num//2)
print(len(embs_1))
embs_1_sliced = embs_1[idx_1_train,:]
del embs_1, idx_1_train

embs_2, idx_2_train = def_files(files[1], max_train_num//2)
print(len(embs_2))
embs_2_sliced = embs_2[idx_2_train,:]
del embs_2, idx_2_train

embs_sliced = np.concatenate((embs_1_sliced, embs_2_sliced), axis = 0)
del embs_1_sliced, embs_2_sliced

26713592
27040606


### Cria os índices, pois são adicionados os embeddings da base de dados

In [7]:
%%time
faiss_engine = create_index(embs_sliced, nogpu=True, n_centroids=256, code_sz=64, nbits=8) #dummy_db

CPU times: user 12min 58s, sys: 0 ns, total: 12min 58s
Wall time: 1min 43s


In [8]:
del embs_sliced

Adicionar os vetores ao faiss, um a um

dummy_db

In [9]:
%%time
#2h 42min 36s
source_dir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/'
h5Files_dummy = sorted(glob.glob(os.path.join(source_dir, '**/*.h5') ,recursive = True)) #93458 files

#93458 ficheiros/músicas
music_names = []
dummy_count = 0

for idx_file, file in enumerate(h5Files_dummy):
    print(file)
    emb = dd.io.load(file)
    faiss_engine.add(emb)

    base_name = os.path.splitext(os.path.basename(file))[0]
    music_names.extend([base_name] * len(emb))
    dummy_count += len(emb)

/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000003.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000020.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000026.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000030.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000046.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000048.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000135.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000136.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000137.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000138.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000139.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000142.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummyEmb/000/000144.h5
/mnt/dataset/public/Finge

In [10]:
# Adicionando os embeddings ao índice
print(f"Foram adicionados:{faiss_engine.ntotal}. Há {dummy_count} vetores dummy, e há {len(music_names)} vetores de músicas até ao momento.") # 53,754,198 vetores

Foram adicionados:53754198. Há 53754198 vetores dummy, e há 53754198 vetores de músicas até ao momento.


db

In [11]:
%%time
#49.4 s
source_dir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/'
h5Files_db = sorted(glob.glob(os.path.join(source_dir, '**/*.h5') ,recursive = True))

n_count_db = 0

for idx_file, file in enumerate(h5Files_db):
    print(file)
    emb = dd.io.load(file)
    faiss_engine.add(emb)

    base_name = os.path.splitext(os.path.basename(file))[0]
    music_names.extend([base_name] * len(emb))

    n_count_db += len(emb)

/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/000/000134.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/000/000512.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/000/000760.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/000/000768.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/000/000776.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001018.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001028.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001042.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001043.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001108.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001118.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001126.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/001204.h5
/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dbEmb/001/00

In [12]:
# Adicionando os embeddings ao índice
print(f"Foram adicionados:{faiss_engine.ntotal}. Há {n_count_db} vetores db, e há {len(music_names)} vetores de músicas, dummy + db.")

# 53,754,198 + 29,500 vetores
# 53,783,698 vetores

Foram adicionados:53783698. Há 29500 vetores db, e há 53783698 vetores de músicas, dummy + db.


## 2. Leitura do modelo Neural

In [13]:
def load_config(config_fname):
    config_filepath = './config/' + config_fname + '.yaml'
    if os.path.exists(config_filepath):
        print(f'cli: Configuration from {config_filepath}')
    else:
        sys.exit(f'cli: ERROR! Configuration file {config_filepath} is missing!!')

    with open(config_filepath, 'r') as f:
        cfg = yaml.safe_load(f)
    return cfg

In [14]:
def build_fp(cfg):
    """ Build fingerprinter """
    # m_pre: log-power-Mel-spectrogram layer, S.
    m_pre = get_melspec_layer(cfg, trainable=False)

    # m_fp: fingerprinter g(f(.)).
    m_fp = get_fingerprinter(cfg, trainable=False)
    return m_pre, m_fp


@tf.function
def predict(X, m_pre, m_fp):
    """ 
    Test step used for mini-search-validation 
    X -> (B,1,8000)
    """
    #tf.print(X)
    feat = m_pre(X)  # (nA+nP, F, T, 1)
    m_fp.trainable = False
    emb_f = m_fp.front_conv(feat)  # (BSZ, Dim)
    emb_gf = m_fp.div_enc(emb_f)
    emb_gf = tf.math.l2_normalize(emb_gf, axis=1)
    
    return emb_gf # L2(g(f(.))

In [15]:
#audio_dir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/debug_audios/000003.wav'
win_size_sec = 1
hop_size_sec = 0.5

config = "default_RA"
cfg = load_config(config)

m_pre, m_fp = build_fp(cfg)

checkpoint_root_dir:str = "./logs/CHECK_BFTRI_100/101/"
checkpoint = tf.train.Checkpoint(m_fp)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_root_dir))

cli: Configuration from ./config/default_RA.yaml


<tensorflow.python.checkpoint.checkpoint.InitializationOnlyStatus at 0x7cb84eee3880>

## 3. Predição
### Número de indices de retorno do 'faiss search' e metadados

In [16]:
topN = 1
metadata_file = "/mnt/dataset/public/Fingerprinting/selected_tracks.csv"
metadata_df = pd.read_csv(metadata_file)

### Carrega áudio-query e cria o embedding do mesmo

In [17]:
def get_audio(audiofile, sr_target=8000):
    audio, fs = librosa.load(audiofile, mono=True, sr=sr_target)
    return audio, fs

def nframe(audio, win_size, hop_size):
    frames =librosa.util.frame(x=audio, frame_length=win_size, hop_length=hop_size)
    return frames

In [25]:
"""
def get_audio_wav(filename, duration=1, hop=0.5):
    pt_wav = wave.open(filename, 'r')
    fs = pt_wav.getframerate()

    # Get audio info
    n_frames_in_seg = fs * duration
    n_frames_in_hop = fs * hop  # 2019 09.05
    

    n_frames = pt_wav.getnframes()

    #n_segs = n_frames // n_frames_in_seg
    if n_frames > n_frames_in_seg:
        n_segs = (n_frames - n_frames_in_seg +
                    n_frames_in_hop) // n_frames_in_hop
    else:
        n_segs = 1

    n_segs = int(n_segs)
    assert (n_segs > 0)
    residual_frames = np.max([
        0,
        n_frames - ((n_segs - 1) * n_frames_in_hop + n_frames_in_seg)
    ])
    pt_wav.close()


    fns_event_seg_list = []

    for seg_idx in range(n_segs):
        offset_min, offset_max = int(-1 *
                                        n_frames_in_hop), n_frames_in_hop
        if seg_idx == 0:  # first seg
            offset_min = 0
        if seg_idx == (n_segs - 1):  # last seg
            offset_max = residual_frames

        fns_event_seg_list.append(
            [filename, seg_idx, offset_min, offset_max])

    return fns_event_seg_list
"""

Querys BD

In [18]:
#filesdir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/debug_audios/'
filesdir = '/mnt/dataset/public/Fingerprinting/neural-audio-fp-dataset/music/test-query-db-500-30s/query/snr_00dB_10dB_1s/'
files = glob.glob(os.path.join(filesdir, '**/*.wav') ,recursive = True)
files = sorted(files)

In [19]:
files[2:3]

['/mnt/dataset/public/Fingerprinting/neural-audio-fp-dataset/music/test-query-db-500-30s/query/snr_00dB_10dB_1s/000/000760.wav']

In [27]:
%%time
#a) carregar áudio
emb_list = []

for query in files[:1]:
    audio, fs = get_audio(audiofile=query)
    audio_frames = nframe(audio, int(win_size_sec * fs), int(hop_size_sec*fs))
    audio_frames = np.transpose(audio_frames[np.newaxis, ...], (2, 0,1))
    
    # b) gerar o embedded
    emb = predict(audio_frames, m_pre, m_fp) # tensor 1 - TensorShape([473, 128])
    emb = emb.numpy() # tensor 1 - (473, 128)

    emb_list.append(emb)

CPU times: user 2.38 s, sys: 230 ms, total: 2.61 s
Wall time: 638 ms


In [None]:
#len(music_names) #tem de ser array, mandar o modelo para a gpu, e os dados

In [28]:
emb_list_array = np.concatenate(emb_list, axis = 0)

In [29]:
emb_list_array.shape

(59, 128)

### 'Faiss Search'

In [30]:
%%time
# c) Buscar o índice
D, I = faiss_engine.search(emb_list_array, 20) # D: Distâncias, I: Índices dos resultados

CPU times: user 17.1 s, sys: 99.2 ms, total: 17.2 s
Wall time: 2.87 s


In [34]:
candidates = np.unique(I[np.where(I >= 0)])   # ignore id < 0

Extra

In [None]:
_scores = np.zeros(len(candidates))
for ci, cid in enumerate(candidates):
    _scores[ci] = np.mean(
        np.diag(
            # np.dot(q, index.reconstruct_n(cid, (cid + l)).T)
            np.dot(emb_list_array, fake_recon_index[cid:cid + sl, :].T)
            )
        )

In [95]:
len(music_names)

53783698

In [97]:
music_names[I[0]] #music_names[53754198] #'000134'

array(['132231', '057656', '025143', '059068', '090621', '091308',
       '117950', '070949', '017602', '000776', '070436', '122728',
       '127154', '117858', '146709', '019602', '128439', '128439',
       '018025', '122713'], dtype='<U6')

In [36]:
k=0
for k in range(len(I)):
    print(I[k])

[53769375 53757773 53778611 53756835 53769365 53777426 53754225 53777888
 53760882 53769414 53755485 53778219 53772967 53760575 53763503 53765799
 53773500 53755487 53773036 53770455]
[53769375 53757773 53778611 53754225 53756835 53777888 53769365 53769414
 53755485 53772967 53773036 53778219 53760882 53755487 53763503 53777426
 53760575 53765799 53770455 53761035]
[53769375 53757773 53756835 53769365 53754225 53778611 53760882 53773036
 53778219 53777426 53772967 53755487 53760575 53763503 53770455 53769414
 53769381 53773500 53770974 53765799]
[53757773 53769375 53778611 53756835 53754225 53777426 53773036 53777888
 53755485 53772967 53778219 53769414 53760882 53769365 53755487 53765799
 53766709 53761035 53770974 53769381]
[53757773 53769375 53778611 53756835 53754225 53769365 53777888 53772967
 53777426 53755485 53773036 53769414 53760882 53755487 53778219 53765799
 53760575 53761035 53763503 53773500]
[53757773 53777426 53769375 53778611 53756835 53769365 53755485 53769414
 537542

In [None]:
music_names[I[0][0]] #map ver

In [56]:
music_names = np.array(music_names)

map_obra = lambda idx: music_names[idx]

obras_result = map_obra(I)

In [None]:
obras_result

In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(I)

#x-axis -> query
#y-axis -> BD

In [51]:
#obras_result
unique_values, counts = np.unique(obras_result, return_counts=True)
print(unique_values, counts)

valor_com_mais_votos = int(unique_values[np.argmax(counts)])

['000003' '000020' '000026' '000030' '000046' '000048' '000135' '000136'
 '000137'] [ 473  621  362  348  207  409 1672 1016 2463]


In [52]:
#00003
# d) Recuperar os metadados
data = metadata_df.loc[metadata_df["track_id"] == valor_com_mais_votos]

# e) Retornar ao Cliente
print(data)

    track_id artist_name track_title
12       137      Airway      Side A


In [56]:
indice_minimo = np.argmin(D)
valor_minimo = D[indice_minimo]
print(indice_minimo, valor_minimo)

6112 [5.7433986e-06]
