In [1]:
import os
import sys
import yaml
import time
import glob
import h5py
import faiss
import click
import curses
import librosa

import numpy as np
import pandas as pd
import tensorflow as tf
from pydub import AudioSegment


from model_RA.fp_RA.melspec.melspectrogram_RA import get_melspec_layer
from model_RA.fp_RA.nnfp import get_fingerprinter
from model_RA.utils.dataloader_keras import genUnbalSequence

2024-06-22 00:46:05.266821: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Pipeline

## 1. Construção do BD Vetorial

1) Carregar os vetores embedded
2) Criar a instância do Faiss
3) Carregar o Faiss com os Dados

## 2. Leitura do Modelo Neural

1) Carrega a classe do modelo
2) ler o modelo

## 3. Predição

1) Receber o dado (áudio query)
2) calcular o embedded - model.predict
3) buscar o índice no BD vetorial
4) recuperar os metadados associados ao índice (nome da música, etc)
5) apresentar ao cliente

# Construção Demo

## 1. Construção do BD Vetorial

In [92]:
def load_h5_data(source_dir):
    h5Files = sorted(glob.glob(source_dir + '**/*.h5', recursive=True))

    embs_count = 0
    embs_info = []
    embs = []
    music_names = []

    for i in range(len(h5Files[:29998])):
        with h5py.File(h5Files[i], "r") as f:
            #print(i)
            base_name = os.path.splitext(os.path.basename(h5Files[i]))[0]
            #primeiro objeto é o que contém os embeddings
            a_group_key = list(f.keys())[0]

            #Extração dos embs como um array
            ds_arr = f[a_group_key][()]  # returns as a numpy array
            #print(ds_arr.shape)
            embeddings = np.squeeze(ds_arr, axis=1)
            #print(embeddings.shape)
            embs.append(embeddings) #Guarda na lista os embs
            #arrayEmb = ds_arr

            embs_count += len(ds_arr) #conta quantos embs tem o vetor
            embs_info.append([i, base_name, embs_count]) #guarda numa lista o número de vetores até o momento.
            # embs_info = [indice, file_name, n_segs]

            music_names.extend([base_name] * len(ds_arr))
            
            f.close()
    return embs, embs_info, music_names

In [3]:
def load_memmap_data(source_dir,
                     fname,
                     append_extra_length=None,
                     shape_only=False,
                     display=True):

    path_shape = source_dir + fname + '_shape.npy'
    path_data = source_dir + fname + '.mm'
    data_shape = np.load(path_shape)
    if shape_only:
        return data_shape

    if append_extra_length:
        data_shape[0] += append_extra_length
        data = np.memmap(path_data, dtype='float32', mode='r+',
                         shape=(data_shape[0], data_shape[1]))
    else:
        data = np.memmap(path_data, dtype='float32', mode='r',
                         shape=(data_shape[0], data_shape[1]))
    if display:
        print(f'Load {data_shape[0]:,} items from \033[32m{path_data}\033[0m.')
        
    return data, data_shape


def create_index(db_embeddings, nogpu=True, n_centroids=256, code_sz=64, nbits=8):
    #faiss.IndexIVFPQ(quantizer, d, n_centroids, code_sz, nbits), com d=, nlist=n_centroids=50, m=code_sz=8, bits=nbits=8
    #faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)
    #n_centroids -> clusters
    
    d = db_embeddings.shape[1]  # Dim emb #len(db_embeddings[0][0][0])

    quantizer = faiss.IndexFlatL2(d)

    code_sz = 64 # power of 2
    n_centroids = 256 #Veronoi Cells
    nbits = 8  # nbits must be 8, 12 or 16, The dimension d should be a multiple of M.
    index = faiss.IndexIVFPQ(quantizer, d, n_centroids, code_sz, nbits) #Adicona clustering

    # Se não usar GPU
    if not nogpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)

    if not index.is_trained:
        index.train(db_embeddings)

    # Adicionando os embeddings ao índice
    index.add(db_embeddings)
    print(f"Foram adicionados:{index.ntotal}")
    
    return index

### Carrega os embeddings de cada música da base de dados

In [93]:
data_dir = '/mnt/dev/rodrigoalmeida/neural-audio-fp/logs/emb/CHECK_BFTRI_100/101/'
dummy_db, dummy_db_shape = load_memmap_data(data_dir, 'dummy_db')
h5Dir = '/mnt/dataset/public/Fingerprinting/Embeddings_BFTRI/dummy_db/'
h5Embs, embs_info, music_names = load_h5_data(h5Dir)
embsArrayDummy=np.vstack(h5Embs)

Load 53,754,198 items from [32m/mnt/dev/rodrigoalmeida/neural-audio-fp/logs/emb/CHECK_BFTRI_100/101/dummy_db.mm[0m.


### Cria os índices, pois são adicionados os embeddings da base de dados

In [5]:
faiss_engine = create_index(embsArrayDummy, nogpu=True, n_centroids=256, code_sz=64, nbits=8) #dummy_db

Foram adicionados:17336985


## 2. Leitura do modelo Neural

In [19]:
def load_config(config_fname):
    config_filepath = './config/' + config_fname + '.yaml'
    if os.path.exists(config_filepath):
        print(f'cli: Configuration from {config_filepath}')
    else:
        sys.exit(f'cli: ERROR! Configuration file {config_filepath} is missing!!')

    with open(config_filepath, 'r') as f:
        cfg = yaml.safe_load(f)
    return cfg


def build_fp(cfg):
    """ Build fingerprinter """
    # m_pre: log-power-Mel-spectrogram layer, S.
    m_pre = get_melspec_layer(cfg, trainable=False)

    # m_fp: fingerprinter g(f(.)).
    m_fp = get_fingerprinter(cfg, trainable=False)
    return m_pre, m_fp


@tf.function
def predict(X, m_pre, m_fp):
    """ 
    Test step used for mini-search-validation 
    X -> (B,1,8000)
    """
    #tf.print(X)
    feat = m_pre(X)  # (nA+nP, F, T, 1)
    m_fp.trainable = False
    emb_f = m_fp.front_conv(feat)  # (BSZ, Dim)
    emb_gf = m_fp.div_enc(emb_f)
    emb_gf = tf.math.l2_normalize(emb_gf, axis=1)
    
    return emb_gf # f(.), L2(f(.)), L2(g(f(.))

In [20]:
config = "default_RA"
cfg = load_config(config)

m_pre, m_fp = build_fp(cfg)

checkpoint_root_dir:str = "./logs/CHECK_BFTRI_100/101/"
checkpoint = tf.train.Checkpoint(m_fp)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_root_dir))

cli: Configuration from ./config/default_RA.yaml


<tensorflow.python.checkpoint.checkpoint.InitializationOnlyStatus at 0x72aa4be62790>

## 3. Predição
### Número de indices de retorno do 'faiss search' e metadados

In [195]:
topN = 1
metadata_file = "/mnt/dataset/public/Fingerprinting/selected_tracks.csv"
metadata_df = pd.read_csv(metadata_file)

### Carrega áudio-query e cria o embedding do mesmo

In [9]:
def get_audio(audiofile, sr_target=8000):
    audio, fs = librosa.load(audiofile, mono=True, sr=sr_target)
    return audio

In [22]:
#source_root_dir = '/mnt/dataset/public/Fingerprinting/neural-audio-fp-dataset/music/'
audio_dir = '/mnt/dataset/public/Fingerprinting/query_procura/000003.wav' #audio query
"""ts_dummy_db_source_fps = sorted(
    glob.glob(audio_dir, recursive=True))"""

dur = cfg['MODEL']['DUR']
hop = cfg['MODEL']['HOP']
fs = cfg['MODEL']['FS']
bsz = ts_batch_sz = cfg['BSZ']['TS_BATCH_SZ']

_ts_n_anchor = ts_batch_sz
ds = genUnbalSequence(
    list(audio_dir),
    ts_batch_sz,
    _ts_n_anchor,
    dur,
    hop,
    fs,
    shuffle=False,
    random_offset_anchor=False,
    drop_the_last_non_full_batch=False)

enq = tf.keras.utils.OrderedEnqueuer(ds,use_multiprocessing=True,shuffle=False)
enq.start(workers=cfg['DEVICE']['CPU_N_WORKERS'], max_queue_size=cfg['DEVICE']['CPU_MAX_QUEUE'])

i = 0
emb_query_list = []

while i < len(enq.sequence):
    X, _ = next(enq.get())
    emb = predict(X, m_pre, m_fp)
    emb_query_list.append(emb.numpy())
    i += 1
enq.stop()

In [47]:
#emb_query_list, está dividido em batchs de 125+125+125+98

In [48]:
"""
def split_audio_into_segments(audio, dur=1.0, hop=.5, fs=8000):
    segment_samples = int(dur * fs) # 8000
    hop_samples = int(hop * fs) # 4000

    segments = []
    for start in range(0, len(audio) - segment_samples + 1, hop_samples):
        #len(audio) = 1898580; segment_samples = 8000; hop_samples = 4000
        #0, 1898580 - 8000 + 1, 4000

        #fica a faltar as residual frames, que são as ultimas que são menores que 1 segundo

        segment = audio[start:start + segment_samples] #faz os vetores de 8000 em 8000, isto é, de 1s em 1s, com 

        if len(segment) == segment_samples:
            segments.append(segment)

    return segments


dur=cfg['MODEL']['DUR'] 
hop=cfg['MODEL']['HOP'] 
fs=cfg['MODEL']['FS']

audio = get_audio(audio_dir)
segments = split_audio_into_segments(audio, dur, hop, fs)
X_segments = tf.convert_to_tensor(segments, dtype=tf.float32)

checkpoint_root_dir:str = "./logs/CHECK_BFTRI_100/101/"
checkpoint = tf.train.Checkpoint(m_fp)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_root_dir))

emb = [] 
for i in range(len(X_segments)):
    X = tf.reshape(X_segments[i],(1, 1,-1)) # podia meter um append, mas pode ser uma variável auxiliar, porque só preciso do tensor para gerar o embedding

    embedding = predict(X, m_pre, m_fp)
    emb.append(embedding.numpy())
"""

'\ndef split_audio_into_segments(audio, dur=1.0, hop=.5, fs=8000):\n    segment_samples = int(dur * fs) # 8000\n    hop_samples = int(hop * fs) # 4000\n\n    segments = []\n    for start in range(0, len(audio) - segment_samples + 1, hop_samples):\n        #len(audio) = 1898580; segment_samples = 8000; hop_samples = 4000\n        #0, 1898580 - 8000 + 1, 4000\n\n        #fica a faltar as residual frames, que são as ultimas que são menores que 1 segundo\n\n        segment = audio[start:start + segment_samples] #faz os vetores de 8000 em 8000, isto é, de 1s em 1s, com \n\n        if len(segment) == segment_samples:\n            segments.append(segment)\n\n    return segments\n\n\n\ndur=cfg[\'MODEL\'][\'DUR\'] \nhop=cfg[\'MODEL\'][\'HOP\'] \nfs=cfg[\'MODEL\'][\'FS\']\n\naudio = get_audio(audio_dir)\nsegments = split_audio_into_segments(audio, dur, hop, fs)\nX_segments = tf.convert_to_tensor(segments, dtype=tf.float32)\n\ncheckpoint_root_dir:str = "./logs/CHECK_BFTRI_100/101/"\ncheckpoint = 

In [244]:
#a) carregar áudio
#audio, fs = get_audio(audio_dir)

In [251]:
# b) gerar o embedded
"""
segments = split_audio_into_segments(audio, dur, hop, fs)
X_segments = tf.convert_to_tensor(segments, dtype=tf.float32)

emb = [] 
for i in range(len(X_segments)):
    X = tf.reshape(X_segments[i],(1, 1,-1)) # podia meter um append, mas pode ser uma variável auxiliar, porque só preciso do tensor para gerar o embedding

    _,embedding = predict(X, m_pre, m_fp)
    emb.append(embedding.numpy())
"""

'\nX = tf.convert_to_tensor(audiox, dtype=tf.float32)\nX = tf.reshape(X, (1, 1, 8000))\n_, emb = predict(X, m_pre, m_fp)\nemb= emb.numpy()\n'

In [25]:
emb_query_array = np.vstack(emb_query_list) #emb_array[472] = emb_query_list[3][97] pois emb_query_list tem 4 batches, sendo os 3 primeiros preenchidos até 125 vetores, e o último com 98 vetores.
#Com esta conversão passo a ter um array com todos os vetores, ou seja, os 473
#genérico: O tem a seguinte forma emb_query_list[N_BSZ][BSZ], e quando o último não está preenchido tem o valor que entre 0 e 125. Pois, BSZ de teste = 125

### 'Faiss Search'

In [26]:
print(faiss_engine.ntotal)
print(faiss_engine.nprobe)

# c) Buscar o índice
D, I = faiss_engine.search(emb_query_array, 1) # D: Distâncias, I: Índices dos resultados

17336985
1


In [86]:
emb_query_array.shape

(473, 128)

In [None]:
I

In [35]:
#candidates = np.unique(I[np.where(I >= 0)])   # ignore id < 0

In [37]:
len(embs_info)

29998

In [89]:
I[0] 

array([10946321])

In [88]:
embs_info[i][2]

17336985

In [134]:
music_names[I[0][0]]
#map ver

'033986'

In [135]:
I[0][0]

10946321

In [160]:
music_names = np.array(music_names)

map_obra = lambda idx: music_names[idx]

obras_result = map_obra(I)

In [198]:
#obras_result
unique_values, counts = np.unique(obras_result, return_counts=True)
print(unique_values, counts)

valor_com_mais_votos = int(unique_values[np.argmax(counts)])

['003365' '014460' '019548' '019636' '019637' '019787' '019878' '019903'
 '019913' '019914' '020066' '020762' '021277' '021278' '021308' '022066'
 '022127' '022739' '024458' '024459' '026359' '026500' '027140' '027141'
 '027459' '027461' '028554' '031797' '031798' '032203' '032926' '033030'
 '033357' '033985' '033986' '034203' '035034' '036782' '036785' '037694'
 '038451' '038452' '038963' '038968' '039204' '040710' '040798' '040954'
 '041233' '041377' '041378' '041872' '043253' '044230' '044747' '045936'
 '046756' '047667' '048605'] [  2   1   1   4   1   1   1   1  59   3   2   1   3   1   1   1   1   3
  18  19   2   1   2   1   7   9   3   8   2   4   1   5   1 130 128   1
   3   1   1   1   1   1   1   1   1   1   1   1   1   5   3   3   1   1
   1   4   1   9   1]


: 

In [196]:
#00003
# d) Recuperar os metadados
data = metadata_df.loc[metadata_df["track_id"] == valor_com_mais_votos]

# e) Retornar ao Cliente
print(data)

       track_id         artist_name        track_title
21753     33985  The Pleasure Kills  Dancing On My Bed


In [90]:
x = 0
idx = 0
for i in range(len(embs_info)):
    if embs_info[i][2] < I[0]:
        x=embs_info[i+1][2]
        idx = i+1

In [91]:
idx, x, I[0]

(19091, 10946453, array([10946321]))

In [None]:
indice_minimo = np.argmin(D)
valor_minimo = D[indice_minimo]
print(indice_minimo, valor_minimo)

In [None]:
unique_values, counts = np.unique(II, return_counts=True)
print(unique_values, counts)
valor_com_mais_votos = unique_values[np.argmax(counts)]
print(valor_com_mais_votos)

In [84]:
embs_info[idx]

[1523,
 '003365',
 764383,
 array([[[-0.0110972 , -0.07463508,  0.05751495, ...,  0.03888726,
          -0.01230168,  0.10429765]],
 
        [[-0.07340898, -0.07603618,  0.03904646, ...,  0.03761718,
          -0.01233643,  0.06865104]],
 
        [[-0.07431366, -0.11635654,  0.03024611, ...,  0.0690169 ,
           0.03337203,  0.13259956]],
 
        ...,
 
        [[ 0.03395545, -0.09449603,  0.01328527, ...,  0.05414047,
          -0.00072304,  0.08297522]],
 
        [[-0.04677773, -0.06725947,  0.03903989, ...,  0.04469272,
           0.0060822 ,  0.05386278]],
 
        [[-0.13497181, -0.11935817,  0.03368724, ...,  0.10156236,
           0.09774466,  0.00426541]]], dtype=float32)]

In [107]:
# d) Recuperar os metadados
#data = metadata_df.loc(metadata_df["track_id"]==I)
data = metadata_df.loc[metadata_df["track_id"] == int(obras_result)]

# e) Retornar ao Cliente
print(data)

       track_id         artist_name            track_title
21754     33986  The Pleasure Kills  Pictures On The Floor
