In [3]:
import os
import sys
import time
import glob
import click
import curses
import pathlib
import yaml
import faiss
import wave
import numpy as np
import tensorflow as tf

from eval_RA.utils.get_index_faiss import get_index
from eval_RA.utils.print_table import PrintTable

from model_RA.fp_RA.melspec.melspectrogram_RA import get_melspec_layer
from model_RA.fp_RA.nnfp import get_fingerprinter

2024-06-19 14:22:21.141783: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def load_config(config_fname):
    """Parse ``./config/<config_fname>.yaml`` and return its content.

    The path is announced on stdout when the file exists. When it does not,
    the interpreter is terminated through ``sys.exit`` with an error message.
    """
    config_filepath = ''.join(('./config/', config_fname, '.yaml'))

    # Guard clause: bail out before trying to open a missing file.
    if not os.path.exists(config_filepath):
        sys.exit(f'cli: ERROR! Configuration file {config_filepath} is missing!!')
    print(f'cli: Configuration from {config_filepath}')

    with open(config_filepath, 'r') as stream:
        return yaml.safe_load(stream)

In [5]:
# Base name (no extension) of the YAML file that load_config looks up under ./config/.
config = "default_RA"
# Parsed configuration; later cells read entries such as cfg['DIR'] and cfg['MODEL'] from it.
cfg = load_config(config)

cli: Configuration from ./config/default_RA.yaml


In [6]:
def load_memmap_data(source_dir,
                     fname,
                     append_extra_length=None,
                     shape_only=False,
                     display=True):
    """Open a float32 memory-mapped 2-D array together with its stored shape.

    Two files are expected inside ``source_dir``: ``<fname>_shape.npy``
    (the array shape) and ``<fname>.mm`` (the raw float32 data).

    Args:
        source_dir: directory containing both files. A trailing path
            separator is optional; the paths are built with ``os.path.join``.
        fname: base name shared by the two files.
        append_extra_length: when truthy, the first dimension is enlarged by
            this amount and the file is mapped in ``'r+'`` mode; otherwise
            the file is mapped read-only.
        shape_only: when True, only the stored shape is returned.
        display: when True, print the number of rows and the data path.

    Returns:
        ``data_shape`` alone when ``shape_only`` is True, otherwise the tuple
        ``(data, data_shape)`` where ``data`` is the ``np.memmap``.
    """
    # os.path.join keeps the path valid whether or not source_dir ends with
    # a separator (plain concatenation silently produced a wrong path).
    path_shape = os.path.join(source_dir, fname + '_shape.npy')
    path_data = os.path.join(source_dir, fname + '.mm')

    data_shape = np.load(path_shape)
    if shape_only:
        return data_shape

    if append_extra_length:
        # Room for extra rows is requested, so the mapping must be writable.
        data_shape[0] += append_extra_length
        mode = 'r+'
    else:
        mode = 'r'

    data = np.memmap(path_data, dtype='float32', mode=mode,
                     shape=(data_shape[0], data_shape[1]))
    if display:
        print(f'Load {data_shape[0]:,} items from \033[32m{path_data}\033[0m.')
    return data, data_shape

In [7]:
# Directory from which the `db` / `dummy_db` memmaps are loaded
# (emb_dir is the same location with a trailing slash).
logsDir = '/mnt/dev/rodrigoalmeida/neural-audio-fp/logs/emb/CHECK_BFTRI_100/101'
emb_dir = f'{logsDir}/'

# Search settings; emb_dummy_dir=None means "use emb_dir for the dummy DB as well".
emb_dummy_dir, index_type, nogpu = None, 'ivfpq', False
k_probe, display_interval = 1, 5

In [8]:
# Reference embeddings.
db, db_shape = load_memmap_data(emb_dir, 'db')

# Without a dedicated directory, the dummy embeddings are read from emb_dir too.
emb_dummy_dir = emb_dir if emb_dummy_dir is None else emb_dummy_dir

# Dummy embeddings.
dummy_db, dummy_db_shape = load_memmap_data(emb_dummy_dir, 'dummy_db')

Load 29,500 items from [32m/mnt/dev/rodrigoalmeida/neural-audio-fp/logs/emb/CHECK_BFTRI_100/101/db.mm[0m.
Load 53,754,198 items from [32m/mnt/dev/rodrigoalmeida/neural-audio-fp/logs/emb/CHECK_BFTRI_100/101/dummy_db.mm[0m.


In [9]:
# Sanity check: compare each memmap's shape with the shape array loaded alongside it.
print(dummy_db.shape, dummy_db_shape, db.shape, db_shape)

(53754198, 128) [53754198      128] (29500, 128) [29500   128]


In [10]:
def create_index(dummy_db, dummy_db_shape, max_train=1e7):
    """Build and train an IVF-PQ FAISS index on (a subset of) ``dummy_db``.

    Nothing is added to the index here; callers add their vectors afterwards.

    Args:
        dummy_db: 2-D array of embeddings used as training data.
        dummy_db_shape: shape of ``dummy_db``; its second entry is used as
            the embedding dimension. When None, ``dummy_db.shape[1]`` is
            used instead.
        max_train: maximum number of rows used for training. Larger inputs
            are randomly sub-sampled (via ``np.random.permutation``) down to
            this many rows.

    Returns:
        The trained ``faiss.IndexIVFPQ`` with ``nprobe`` set to 40.
    """
    # Embedding dimension comes from the data instead of a hard-coded 128.
    if dummy_db_shape is not None:
        d = int(dummy_db_shape[1])
    else:
        d = int(dummy_db.shape[1])

    index_type = 'ivfpq'
    mode = index_type.lower()
    print(f'Creating index: \033[93m{mode}\033[0m')

    # Flat (CPU) L2 index, used as the coarse quantizer of the IVF-PQ index.
    quantizer = faiss.IndexFlatL2(d)

    # Using IVF-PQ index
    code_sz = 64  # power of 2
    n_centroids = 256
    nbits = 8  # nbits must be 8, 12 or 16, The dimension d should be a multiple of M.
    index = faiss.IndexIVFPQ(quantizer, d, n_centroids, code_sz, nbits)

    train_data = dummy_db
    max_nitem_train = int(max_train)

    # Train index
    if len(train_data) > max_nitem_train:
        print('Training index using {:>3.2f} % of data...'.format(
            100. * max_nitem_train / len(train_data)))
        # shuffle and reduce training data
        sel_tr_idx = np.random.permutation(len(train_data))
        sel_tr_idx = sel_tr_idx[:max_nitem_train]
        index.train(train_data[sel_tr_idx, :])
    else:
        # Previously nothing was trained on this path, so datasets smaller
        # than max_train produced an unusable (untrained) index.
        print('Training index...')
        index.train(train_data)

    index.nprobe = 40
    print(f"index trained: {index.is_trained}")
    return index

In [15]:
# Build and train the IVF-PQ index on the dummy embeddings (vectors are added in a later cell).
index = create_index(dummy_db, dummy_db_shape)

Creating index: [93mivfpq[0m
Training index using 18.60 % of data...
index trained: True


In [16]:
# Populate the index: dummy embeddings first, then the reference embeddings.
index.add(dummy_db); print(f'{len(dummy_db)} items from dummy DB')
index.add(db); print(f'{len(db)} items from reference DB') # corresponds to the entries that are true matches, identical to query_db

53754198 items from dummy DB
29500 items from reference DB


Modelo + emb

In [11]:
def build_fp(cfg):
    """Create the two non-trainable stages of the fingerprinter.

    Returns:
        ``(m_pre, m_fp)`` where ``m_pre`` is the log-power-Mel-spectrogram
        layer S and ``m_fp`` is the fingerprinter g(f(.)), both built from
        ``cfg`` with ``trainable=False``.
    """
    frozen = {'trainable': False}
    melspec_layer = get_melspec_layer(cfg, **frozen)
    fingerprinter = get_fingerprinter(cfg, **frozen)
    return melspec_layer, fingerprinter

@tf.function
def predict(X, m_pre, m_fp):
    """
    Compute L2-normalized fingerprints for a batch of audio.

    X -> (B,1,8000)
    """
    mel = m_pre(X)  # (n, F, T, 1)
    m_fp.trainable = False
    encoded = m_fp.front_conv(mel)  # (BSZ, Dim)
    projected = m_fp.div_enc(encoded)
    # L2(g(f(.))
    return tf.math.l2_normalize(projected, axis=1)

Carregar áudio

In [30]:
# Disabled draft of a WAV loader: the code below sits inside a bare string
# literal, so nothing in it is defined or executed.
"""
def load_audio(queryDir):
    with wave.open(queryDir, 'rb') as wav_file:
        params = wav_file.getparams()
        
        #parâmetros
        nchannels, sampwidth, framerate, nframes = params[:4]
        
        frames = wav_file.readframes(nframes)
        
        audio_data = np.frombuffer(frames)
        return audio_data
"""

In [31]:
# Disabled draft of a soundfile-based loader: the code below sits inside a
# bare string literal, so nothing in it is defined or executed.
"""
import soundfile as sf
def load_audioo(file_path):
    audio, _ = sf.read(file_path)
    
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # Convertendo para mono se necessário
    # Normalizar o áudio
    audio = audio / np.max(np.abs(audio))
    return audio
"""

In [18]:
def get_audio_info(cfg, database, fs, hop, duration, segment_mode):
    """Collect segment metadata and the raw frames of every WAV file.

    Args:
        cfg: configuration mapping; only ``cfg['MODEL']['HOP']`` is read,
            and only when ``hop`` is None.
        database: iterable of audio file paths (only names ending in
            ``'wav'`` are supported).
        fs: expected sample rate in Hz; every file must match it.
        hop: hop between consecutive segment starts, in seconds. None means
            "take it from ``cfg``".
        duration: segment length in seconds.
        segment_mode: ``'all'`` emits one entry per segment; with any other
            value no segment entries are produced.

    Returns:
        ``(audio_seg_list, frames)`` where ``audio_seg_list`` is a list of
        ``[filename, seg_idx, offset_min, offset_max]`` entries and
        ``frames`` holds, for each file, one ``bytes`` object with all of its
        frames as returned by ``wave``'s ``readframes``.

    Raises:
        ValueError: a file's sample rate differs from ``fs``.
        NotImplementedError: a file name does not end in ``'wav'``.
    """
    audio_seg_list = []
    frames = []

    for filename in database:
        print(filename)

        # Resolved inside the loop so an empty database never touches cfg.
        if hop is None:
            hop = cfg['MODEL']['HOP']

        n_frames_in_seg = fs * duration
        n_frames_in_hop = fs * hop

        file_ext = filename[-3:]  # keep only the 'wav' suffix
        if file_ext != 'wav':
            raise NotImplementedError(file_ext)

        # The context manager closes the file even when the rate check fails.
        with wave.open(filename, 'r') as pt_wav:
            _fs = pt_wav.getframerate()
            if fs != _fs:
                raise ValueError('Sample rate should be {} but got {}'.format(
                    str(fs), str(_fs)))

            n_frames = pt_wav.getnframes()
            # Read the whole file once. The former `while i in range(...)`
            # loop never advanced `i` and therefore never terminated.
            frames.append(pt_wav.readframes(n_frames))

        if n_frames > n_frames_in_seg:
            n_segs = (n_frames - n_frames_in_seg +
                      n_frames_in_hop) // n_frames_in_hop
        else:
            n_segs = 1
        n_segs = int(n_segs)
        assert (n_segs > 0)

        # Frames left after the last full segment (never negative).
        residual_frames = np.max([
            0,
            n_frames - ((n_segs - 1) * n_frames_in_hop + n_frames_in_seg)
        ])

        # Store the segments of this audio file.
        if segment_mode == 'all':
            for seg_idx in range(n_segs):
                offset_min, offset_max = int(-1 *
                                             n_frames_in_hop), n_frames_in_hop
                if seg_idx == 0:  # first seg
                    offset_min = 0
                if seg_idx == (n_segs - 1):  # last seg
                    offset_max = residual_frames

                audio_seg_list.append(
                    [filename, seg_idx, offset_min, offset_max])

    # A cumulative per-file segment count (file_seg_list) was drafted here for
    # use when loading the .h5 file; it is currently disabled.
    return audio_seg_list, frames

Carregar o query

In [19]:
# Path of the query audio file. In this cell it is only referenced by the
# commented-out loader calls below.
queryDir = '/mnt/dataset/public/Fingerprinting/query_procura/000003.wav'
m_pre, m_fp = build_fp(cfg)
#audio = load_audio(queryDir)
#audio = load_audioo(queryDir)

source_root_dir = cfg['DIR']['SOURCE_ROOT_DIR']
# NOTE(review): `dummy_db` was bound to the embedding memmap in an earlier
# cell; this rebinds it to a sorted list of WAV paths. Re-running the index
# cells after this one would pass file names instead of embeddings — consider
# a distinct name.
dummy_db = sorted(glob.glob(source_root_dir + 'test-dummy-db-100k-full/' +'**/*.wav', recursive=True))
fs = 8000
hop = None  # None -> get_audio_info reads cfg['MODEL']['HOP']
duration = 1
segment_mode:str = 'all'

# Only the first file of the list is processed.
# NOTE(review): `framaes` looks like a typo for `frames`; it is not used elsewhere in the visible notebook.
x, framaes = get_audio_info(cfg, dummy_db[:1], fs, hop, duration, segment_mode)

/mnt/dataset/public/Fingerprinting/neural-audio-fp-dataset/music/test-dummy-db-100k-full/fma_full/000/000003.wav


Procurar o query

In [None]:
# segment-level top k search for each segment
# NOTE(review): `q` is only assigned in the following cell of the visible
# notebook, so this cell cannot run before it on a fresh kernel.
_, I = index.search(q, k_probe) # _: distance, I: result IDs matrix

In [None]:
# Scratch cell: sequence-level matching and top-1 evaluation for one query.
# NOTE(review): `query`, `test_id`, `sl`, `fake_recon_index`, `gt_id`,
# `top1_exact`, `top1_near`, `ti` and `si` are not defined anywhere in the
# visible notebook, and `I` comes from the search cell above — confirm where
# they are supposed to come from before running.
q = query[test_id:(test_id + sl), :] # shape(q) = (length, dim)



# offset compensation to get the start IDs of candidate sequences
# (row r of I holds the hits of query segment r, so r is subtracted from it)
for offset in range(len(I)):
    I[offset, :] -= offset

# unique candidates
candidates = np.unique(I[np.where(I >= 0)])   # ignore id < 0

""" Sequence match score """
# Score of a candidate = mean of the diagonal of q · candidate_sequenceᵀ,
# i.e. the average inner product between aligned segment pairs.
_scores = np.zeros(len(candidates))
for ci, cid in enumerate(candidates):
    _scores[ci] = np.mean(
        np.diag(
            # np.dot(q, index.reconstruct_n(cid, (cid + l)).T)
            np.dot(q, fake_recon_index[cid:cid + sl, :].T)
            )
        )

""" Evaluate """
# Keep the ten best-scoring candidates, best first.
pred_ids = candidates[np.argsort(-_scores)[:10]]
# pred_id = candidates[np.argmax(_scores)] <-- only top1-hit

# top1 hit
top1_exact[ti, si] = int(gt_id == pred_ids[0])
# "near" also accepts the immediate neighbours of the ground-truth id
top1_near[ti, si] = int(
    pred_ids[0] in [gt_id - 1, gt_id, gt_id + 1])
# top1_song = need song info here...


#if (ti != 0) & ((ti % display_interval) == 0):
#    top1_exact_rate = 100. * np.mean(top1_exact[:ti + 1, :], axis=0)
#    top1_near_rate = 100. * np.mean(top1_near[:ti + 1, :], axis=0)

In [33]:
# not relevant
def split_audio_into_segments(audio, segment_duration=1.0, hop_size=0.5, sample_rate=8000):
    """Cut ``audio`` into fixed-length, possibly overlapping windows.

    ``segment_duration`` and ``hop_size`` are given in seconds and converted
    to sample counts with ``sample_rate``. Only full-length windows are
    returned, in order of their start position.
    """
    window_len = int(segment_duration * sample_rate)
    step = int(hop_size * sample_rate)

    starts = range(0, len(audio) - window_len + 1, step)
    windows = (audio[begin:begin + window_len] for begin in starts)
    return [window for window in windows if len(window) == window_len]

In [None]:
# Scratch copy of the loop inside split_audio_into_segments.
# NOTE(review): `audio`, `segment_samples` and `hop_samples` are not assigned
# at notebook level in the visible source, so this cell fails on a fresh kernel.
segments = []
for start in range(0, len(audio) - segment_samples + 1, hop_samples):
    segment = audio[start:start + segment_samples]
    if len(segment) == segment_samples:
        segments.append(segment)

Carregar áudio como no projeto

In [62]:
# Notebook-level settings read as globals by __event_batch_load / __getitem__.
duration=1
hop=.5
fs=8000
seg_mode="all"
amp_mode='normal'
offset_margin_hop_rate = 0.4
reduce_items_p=0
reduce_batch_first_half=False
experimental_mode=False
shuffle=False
random_offset_anchor=False
drop_the_last_non_full_batch=False # No augmentations...

# Batch size: all three names share the same value.
ts_batch_s = _ts_n_anchor = n_anchor = 125

# Since bsz == n_anchor, then:
n_pos_per_anchor = 0
n_pos_bsz = 0

offset_margin_frame = int(hop * offset_margin_hop_rate * fs) #1600


# Segment list produced by get_audio_info in an earlier cell.
fns_event_seg_list = x

n_samples = len(fns_event_seg_list) # fp-generation; 53754198 in a recorded run


index_event = np.arange(n_samples) #index_event: [ 0 1 2 ... 53754195 53754196 53754197]
print(f"index_event: {index_event}")


assert(reduce_items_p <= 100)

index_event: [       0        1        2 ... 53754195 53754196 53754197]


In [None]:
def load_audio(filename=str(),
               seg_start_sec=float(),
               offset_sec=0.0,
               seg_length_sec=float(),
               seg_pad_offset_sec=0.0,
               fs=22050,
               amp_mode='normal'):
    """Load one fixed-length segment of a 16-bit WAV file as floats.

    Open file --> calculate index range --> load samples by index
    --> zero-pad to the requested length --> out.

    Args:
        filename: path of the audio file; its last three characters must be
            ``wav`` (any letter case).
        seg_start_sec: segment start, in seconds.
        offset_sec: extra offset added to ``seg_start_sec``, in seconds.
        seg_length_sec: segment length, in seconds.
        seg_pad_offset_sec: position, in seconds, at which the samples are
            written inside the zero-filled output buffer.
        fs: sample rate used to convert seconds into frame indices. The
            file's own rate is not checked here.
        amp_mode: accepted for interface compatibility; not used.

    Returns:
        1-D float array of ``int(seg_length_sec * fs)`` samples scaled by
        ``1 / 2**15``; positions not covered by file data stay at zero.

    Raises:
        NotImplementedError: the file name does not end in ``wav``.
    """
    start_frame_idx = int(np.floor((seg_start_sec + offset_sec) * fs))
    seg_length_frame = int(np.floor(seg_length_sec * fs))

    # Get file-info
    file_ext = filename[-3:]
    if file_ext.lower() != 'wav':
        raise NotImplementedError(file_ext)

    # The context manager guarantees the handle is closed, also on errors
    # (e.g. a start position past the end of the file).
    with wave.open(filename, 'r') as pt_wav:
        pt_wav.setpos(start_frame_idx)
        raw = pt_wav.readframes(seg_length_frame)
    # Samples are interpreted as int16 — assumes a mono, 16-bit file; TODO confirm.
    x = np.frombuffer(raw, dtype=np.int16) / 2**15  # dtype=float

    # padding process. it works only when win_size> audio_size and padding='random'
    audio_arr = np.zeros(int(seg_length_sec * fs))
    seg_pad_offset_idx = int(seg_pad_offset_sec * fs)
    audio_arr[seg_pad_offset_idx:seg_pad_offset_idx + len(x)] = x
    return audio_arr

In [None]:
def load_audio_multi_start(filename=str(),
                           seg_start_sec_list=[],
                           seg_length_sec=float(),
                           fs=22050,
                           amp_mode='normal'):
    """ Load_audio wrapper for loading audio with multiple start indices.

    Args:
        filename: path of the audio file.
        seg_start_sec_list: iterable of segment start times, in seconds.
            (The default list is never mutated.)
        seg_length_sec: length of every segment, in seconds.
        fs: sample rate forwarded to ``load_audio``.
        amp_mode: forwarded to ``load_audio``.

    Returns:
        Array of shape (B, T) with one row per start time, or None when
        ``seg_start_sec_list`` is empty.
    """
    # `fs` and `amp_mode` are forwarded; the inner call used to hard-code
    # fs=8000 and silently ignored the caller's values.
    rows = [
        load_audio(filename=filename,
                   seg_start_sec=seg_start_sec,
                   seg_length_sec=seg_length_sec,
                   fs=fs,
                   amp_mode=amp_mode).reshape((1, -1))
        for seg_start_sec in seg_start_sec_list
    ]
    if not rows:
        return None
    # Stack once instead of re-allocating the output on every iteration.
    return np.vstack(rows)  # (B,T)

In [63]:
def __event_batch_load(anchor_idx_list):
        """ Get Xa_batch and Xp_batch for anchor (original) and positive (replica) samples.

        Reads the notebook-level globals ``fns_event_seg_list``,
        ``offset_margin_frame``, ``random_offset_anchor``,
        ``experimental_mode``, ``hop``, ``fs``, ``duration`` and ``amp_mode``.

        Args:
            anchor_idx_list: iterable of indices into ``fns_event_seg_list``.

        Returns:
            ``(Xa_batch, Xp_batch)``: the stacked anchor rows and the stacked
            remaining rows returned by ``load_audio_multi_start``. Both are
            None when ``anchor_idx_list`` is empty.
        """
        Xa_batch = None
        Xp_batch = None


        for idx in anchor_idx_list:  # idx: index for one sample
            # Never filled in this function, so only the anchor start is loaded.
            pos_start_sec_list = []


            # fns_event_seg_list = [[filename, seg_idx, offset_min, offset_max], [ ... ] , ... [ ... ]]
            offset_min, offset_max = fns_event_seg_list[idx][
                2], fns_event_seg_list[idx][3]
            # Clamp the allowed offset range to +/- offset_margin_frame.
            anchor_offset_min = np.max([offset_min, -offset_margin_frame])
            anchor_offset_max = np.min([offset_max, offset_margin_frame])
            

            if (random_offset_anchor == True) & (experimental_mode
                                                      == False):
                # Usually, we can apply random offset to anchor only in training.
                # Reseeds NumPy's global RNG with the sample index, so the
                # offset drawn below is the same on every call for this idx.
                np.random.seed(idx)

                # Calculate anchor_start_sec
                _anchor_offset_frame = np.random.randint(
                    low=anchor_offset_min, high=anchor_offset_max)
                _anchor_offset_sec = _anchor_offset_frame / fs
                anchor_start_sec = fns_event_seg_list[idx][
                    1] * hop + _anchor_offset_sec
            else:
                # No random offset: the segment starts at seg_idx * hop seconds.
                _anchor_offset_frame = 0
                anchor_start_sec = fns_event_seg_list[idx][1] * hop


            """
            load audio returns: [anchor, pos1, pos2,..pos_n]
            """
            #print(self.fns_event_seg_list[idx])
            start_sec_list = np.concatenate(
                ([anchor_start_sec], pos_start_sec_list))
            
            xs = load_audio_multi_start(fns_event_seg_list[idx][0],
                                        start_sec_list,
                                        duration,
                                        fs,
                                        amp_mode)  # xs: ((1+n_pos)),T)

            # Row 0 is the anchor; any further rows are positives.
            if Xa_batch is None:
                Xa_batch = xs[0, :].reshape((1, -1))
                Xp_batch = xs[
                    1:, :]  # If self.n_pos_per_anchor==0: this produces an empty array
            else:
                Xa_batch = np.vstack((Xa_batch, xs[0, :].reshape(
                    (1, -1))))  # Xa_batch: (n_anchor, T)
                Xp_batch = np.vstack(
                    (Xp_batch, xs[1:, :]))  # Xp_batch: (n_pos, T)
        return Xa_batch, Xp_batch

In [64]:
def __getitem__(idx):
    """Return the ``idx``-th batch of anchor (original) samples.

    The batch covers ``n_anchor`` consecutive entries of the global
    ``index_event``. Positive (replica) samples are loaded and converted as
    well, but only the anchors are returned.
    """
    batch_slice = slice(idx * n_anchor, (idx + 1) * n_anchor)
    index_anchor_for_batch = index_event[batch_slice]

    anchors, positives = __event_batch_load(index_anchor_for_batch)

    # Insert a singleton channel axis and cast to float32.
    anchors = np.expand_dims(anchors, 1).astype(np.float32)  # (n_anchor, 1, T)
    positives = np.expand_dims(positives, 1).astype(np.float32)  # (n_pos, 1, T)

    return anchors