In [129]:
import os
import sys
import yaml
import glob
import wave

import tensorflow as tf
import librosa
import numpy as np
import pickle

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

from model.fp.melspec.melspectrogram import get_melspec_layer
from model.fp.specaug_chain.specaug_chain import get_specaug_chain_layer
from model.fp.nnfp import get_fingerprinter

In [101]:
def build_fp(cfg):
    """ Build fingerprinter """
    # m_pre: log-power-Mel-spectrogram layer, S.
    m_pre = get_melspec_layer(cfg, trainable=False)

    # m_specaug: spec-augmentation layer.
    #m_specaug = get_specaug_chain_layer(cfg, trainable=False)
    #assert(m_specaug.bypass==False) # Detachable by setting m_specaug.bypass.

    # m_fp: fingerprinter g(f(.)).
    m_fp = get_fingerprinter(cfg, trainable=False)
    m_fp.trainable = False
    
    return m_pre, m_fp

In [130]:
def load_config(config_fname):
    config_filepath = './config/' + config_fname + '.yaml'
    if os.path.exists(config_filepath):
        print(f'cli: Configuration from {config_filepath}')
    else:
        sys.exit(f'cli: ERROR! Configuration file {config_filepath} is missing!!')

    with open(config_filepath, 'r') as f:
        cfg = yaml.safe_load(f)
    return cfg


@tf.function
def predict(X, m_fp):
    """ 
    Test step used for mini-search-validation 
    X -> (B,1,8000)
    """
    emb_gf = m_fp(X)

    return emb_gf


def load_model():

    checkpoint_name_dir:str = "./logs/CHECKPOINT_BSZ_120"  #"CHECKPOINT"   # string
    config:str = "default"   

    cfg = load_config(config)

    m_pre, m_fp = build_fp(cfg)

    checkpoint = tf.train.Checkpoint(m_fp)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_name_dir))
        
    return m_fp

In [103]:
@tf.function
def predict(X, m_fp):
    """ 
    Test step used for mini-search-validation 
    X -> (B,1,8000)
    """
    emb_gf = m_fp(X)

    return emb_gf

In [104]:
audioDir = '/mnt/dataset/public/Fingerprinting/teste/uniqueFile/002000.wav'

fns_list = sorted(glob.glob(audioDir, recursive=True))

fns_seg_list = []
seg_list_test = []

fs:int = 8000
duration:int = 1
hop:int = 0.5

for offset_idx, filename in enumerate(fns_list):
    # Get audio info
    n_frames_in_seg = fs * duration
    n_frames_in_hop = fs * hop  # 2019 09.05
    file_ext = filename[-3:]

    if file_ext == 'wav':
        pt_wav = wave.open(filename, 'r')
        _fs = pt_wav.getframerate()

        if fs != _fs:
            raise ValueError('Sample rate should be {} but got {}'.format(
                str(fs), str(_fs)))

        n_frames = pt_wav.getnframes()
        #n_segs = n_frames // n_frames_in_seg
        if n_frames > n_frames_in_seg:
            n_segs = (n_frames - n_frames_in_seg +
                        n_frames_in_hop) // n_frames_in_hop
        else:
            n_segs = 1

        n_segs = int(n_segs)
        assert (n_segs > 0)
        residual_frames = np.max([
            0,
            n_frames - ((n_segs - 1) * n_frames_in_hop + n_frames_in_seg)
        ])
        
        #print(f"filename:{filename}\nn_frames_in_seg:{n_frames_in_seg}\nn_frames_in_hop:{n_frames_in_hop}\nfile_ext:{file_ext}\n")
        #print(f"_fs:{_fs}\nn_frames:{n_frames}\nn_segs:{n_segs}\nresidual_frames:{residual_frames}\n")
        
        pt_wav.close()

    else:
        raise NotImplementedError(file_ext)
    

    for seg_idx in range(n_segs):
        offset_min, offset_max = int(-1 * n_frames_in_hop), n_frames_in_hop
        
        #print(f"seg_idx:{seg_idx}\noffset_min:{offset_min}\noffset_max:{offset_max}\nfns_seg_list:{fns_seg_list}\n")

        if seg_idx == 0:  # first seg
            offset_min = 0
        if seg_idx == (n_segs - 1):  # last seg
            offset_max = residual_frames
        if seg_idx == 3:
            seg_list_test.append( [filename, seg_idx, offset_min, offset_max])
            print(f"filename:{filename}\nseg_idx:{seg_idx}\noffset_min:{offset_min}\noffset_max:{offset_max}\n")

        fns_seg_list.append(
            [filename, seg_idx, offset_min, offset_max])
        
        #print(f"seg_idx:{seg_idx}\noffset_min:{offset_min}\noffset_max:{offset_max}\nfns_seg_list:{fns_seg_list}\n")
    

filename:/mnt/dataset/public/Fingerprinting/teste/uniqueFile/002000.wav
seg_idx:3
offset_min:-4000
offset_max:4000.0



In [105]:
def load_audio(filename=str(),
               seg_start_sec=float(),
               offset_sec=0.0,
               seg_length_sec=float(),
               seg_pad_offset_sec=0.0,
               fs=22050,
               amp_mode='normal'):
    """
        Open file to get file info --> Calulate index range
        --> Load sample by index --> Padding --> Max-Normalize --> Out
        
    """
    start_frame_idx = np.floor((seg_start_sec + offset_sec) * fs).astype(int)
    seg_length_frame = np.floor(seg_length_sec * fs).astype(int)
    end_frame_idx = start_frame_idx + seg_length_frame

    # Get file-info
    file_ext = filename[-3:]
    print(start_frame_idx, end_frame_idx)

    if file_ext == 'wav':
        pt_wav = wave.open(filename, 'r')
        pt_wav.setpos(start_frame_idx)
        x = pt_wav.readframes(end_frame_idx - start_frame_idx)
        x = np.frombuffer(x, dtype=np.int16)
        x = x / 2**15  # dtype=float
    else:
        raise NotImplementedError(file_ext)

    # Max Normalize, random amplitude
    if amp_mode == 'normal':
        pass
    elif amp_mode == 'max_normalize':
        _x_max = np.max(np.abs(x))
        if _x_max != 0:
            x = x / _x_max
    else:
        raise ValueError('amp_mode={}'.format(amp_mode))

    # padding process. it works only when win_size> audio_size and padding='random'
    audio_arr = np.zeros(int(seg_length_sec * fs))
    seg_pad_offset_idx = int(seg_pad_offset_sec * fs)
    audio_arr[seg_pad_offset_idx:seg_pad_offset_idx + len(x)] = x
    return audio_arr

In [106]:
x = load_audio(filename=filename,
               seg_start_sec=1.5,
               seg_length_sec=1,
               fs=8000)

12000 20000


In [111]:
import numpy as np
print(type(x))
print(np.shape(x))
print(x)

<class 'numpy.ndarray'>
(8000,)
[0.0213623  0.01916504 0.021698   ... 0.02557373 0.0295105  0.02172852]


In [112]:
x = x.reshape((1, -1))

In [113]:
print(type(x))
print(np.shape(x))
print(x)

<class 'numpy.ndarray'>
(1, 8000)
[[0.0213623  0.01916504 0.021698   ... 0.02557373 0.0295105  0.02172852]]


In [114]:
def run(x, m_fp):
    '''
    filepath: (str, ndarray)
    '''

    # tenho as tramas e de 8000 amostras tenho de chamar a get_melspec, tenho de trazer o get_melspec e o get_fringerprint, tenho trazer uma matriz do input_shape para que consigo ver os pesos
    # tem de ser m_spec, _, m_fp = build_fp(cfg). matching. tirar o librosa pegar no sinal, passar no fingerprint. 3 tensores, um para mel spec, um para data_aug e outro para a rede.
    # (256,32,1) de entrada no X, tenho de alterar em vez de 8000
    
    emb = predict(x, m_fp)

    tf.config.run_functions_eagerly(True)

    return emb.numpy()

In [115]:
m_fp = load_model()
emb = run(x, m_fp)

cli: Configuration from ./config/default.yaml


ValueError: in user code:

    File "/tmp/ipykernel_3084456/2396453651.py", line 7, in predict  *
        emb_gf = m_fp(X)
    File "/home/rodrigo/anaconda3/envs/tf/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_file7mrpzq9p.py", line 10, in tf__call
        x = ag__.converted_call(ag__.ld(self).front_conv, (ag__.ld(inputs),), None, fscope)
    File "/tmp/__autograph_generated_fileu8kwn9i4.py", line 15, in tf__call
        raise

    ValueError: Exception encountered when calling layer 'finger_printer_5' (type FingerPrinter).
    
    in user code:
    
        File "/home/rodrigo/Documents/neural-audio-fp/model/fp/nnfp.py", line 229, in call  *
            x = self.front_conv(inputs) # (B,D) with D = (T/2^4) x last_hidden_ch
        File "/home/rodrigo/anaconda3/envs/tf/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/tmp/__autograph_generated_fileu8kwn9i4.py", line 15, in tf__call
            raise
    
        ValueError: Exception encountered when calling layer 'conv_layer_40' (type ConvLayer).
        
        in user code:
        
            File "/home/rodrigo/Documents/neural-audio-fp/model/fp/nnfp.py", line 83, in call  *
                return self.forward(x)
            File "/home/rodrigo/anaconda3/envs/tf/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
            File "/home/rodrigo/anaconda3/envs/tf/lib/python3.11/site-packages/keras/src/engine/input_spec.py", line 253, in assert_input_compatibility
                raise ValueError(
        
            ValueError: Exception encountered when calling layer 'sequential_683' (type Sequential).
            
            Input 0 of layer "conv2d_80" is incompatible with the layer: expected min_ndim=4, found ndim=2. Full shape received: (1, 8000)
            
            Call arguments received by layer 'sequential_683' (type Sequential):
              • inputs=tf.Tensor(shape=(1, 8000), dtype=float32)
              • training=None
              • mask=None
        
        
        Call arguments received by layer 'conv_layer_40' (type ConvLayer):
          • x=tf.Tensor(shape=(1, 8000), dtype=float32)
    
    
    Call arguments received by layer 'finger_printer_5' (type FingerPrinter):
      • inputs=tf.Tensor(shape=(1, 8000), dtype=float32)


In [81]:
#seg_list_test[0][2]

-4000

In [95]:
"""
seg_list_test[0][0]
seg_list_test[0][2]
duration = 1
fs = 8000
amp_mode = 'normal'
"""

"\nseg_list_test[0][0]\nseg_list_test[0][2]\nduration = 1\nfs = 8000\namp_mode = 'normal'\n"

In [None]:
#xs = load_audio_multi_start(fns_event_seg_list[idx][0], start_sec_list, duration, fs, amp_mode)  # xs: ((1+n_pos)),T)

In [123]:
norm = 'layer_norm2d'
q = 128

if norm in ['layer_norm1d', 'layer_norm2d']:
    BN = [tf.keras.layers.LayerNormalization(axis=-1) for i in range(q)]

In [125]:
BN

[<keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab162e0510>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab16d66210>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab16d50450>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab15c1d550>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab162f06d0>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab15c55590>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab16bfda50>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab1620c290>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab162e8c10>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aab162f0150>,
 <keras.src.layers.normalization.layer_normalization.LayerNormalization at 0x7aa

## RASCUNHOS

In [None]:
emb = predict(X, m_fp)

In [51]:
modelo = load_model()

cli: Configuration from ./config/default.yaml


In [52]:
type(modelo)

model.fp.nnfp.FingerPrinter

In [10]:
modelo.div_enc

<model.fp.nnfp.DivEncLayer at 0x7aab17659750>

In [16]:
config = "default" 

In [17]:
cfg = load_config(config)

cli: Configuration from ./config/default.yaml


In [23]:
cfg['DIR']['SOURCE_ROOT_DIR']

'/mnt/dataset/public/Fingerprinting/neural-audio-fp-dataset/music/'

In [24]:
import math

In [27]:
x = tf.constant([5, 1, 2, 4])
y=tf.reduce_max(x)

In [31]:
print(x)

tf.Tensor([5 1 2 4], shape=(4,), dtype=int32)


In [33]:
x[1:]

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 2, 4], dtype=int32)>

In [36]:
from tensorflow.keras.layers import Lambda, Permute

In [40]:
w=Permute((3, 2, 1), input_shape=x[1:])

In [42]:
print(w)

<keras.src.layers.reshaping.permute.Permute object at 0x7aab1769bd90>


In [48]:
import numpy as np

In [44]:
front_strides=[
                [(1,2), (2,1)], 
                [(1,2), (2,1)],
                [(1,2), (2,1)],
                [(1,2), (2,1)],
                [(1,1), (2,1)],
                [(1,2), (2,1)],
                [(1,1), (2,1)],
                [(1,2), (2,1)]
                ]
len(front_strides[])

8

In [139]:
np.shape(front_strides)

(8, 2, 2)

In [219]:
# -*- coding: utf-8 -*-
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
""" nnfp.py

'Neural Audio Fingerprint for High-specific Audio Retrieval based on 
Contrastive Learning', https://arxiv.org/abs/2010.11910

USAGE:
    
    Please see test() in the below.
    
"""
import numpy as np
import tensorflow as tf
assert tf.__version__ >= "2.0"

def ConvLayer(input_layer,
              hidden_ch,
                 strides,
                 norm):
    """
    Separable convolution layer
    
    Arguments
    ---------
    hidden_ch: (int)
    strides: [(int, int), (int, int)]
    norm: 'layer_norm1d' for normalization on Freq axis. (default)
          'layer_norm2d' for normalization on on FxT space 
          'batch_norm' or else, batch-normalization
    
    Input
    -----
    x: (B,F,T,1)
    
    [Conv1x3]>>[ELU]>>[BN]>>[Conv3x1]>>[ELU]>>[BN]
    
    Output
    ------
    x: (B,F,T,C) with {F=F/stride, T=T/stride, C=hidden_ch}
    
    """
    
    #input_layer = #tf.keras.Input(shape=(None, None, 1))
    
    #Convolution 1x3
    conv2d_1x3 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(1, 3),
                                                 strides=strides[0],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')(input_layer)

    layerNorm_1x3 = tf.keras.layers.ELU()(conv2d_1x3)

    if norm == 'layer_norm1d':
        BN_1x3 = tf.keras.layers.LayerNormalization(axis=-1)(layerNorm_1x3)
    elif norm == 'layer_norm2d':
        BN_1x3 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))(layerNorm_1x3)
    else:
        BN_1x3 = tf.keras.layers.BatchNormalization(axis=-1)(layerNorm_1x3) # Fix axis: 2020 Apr20


    #Convolution 3x1
    conv2d_3x1 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(3, 1),
                                                 strides=strides[1],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')(BN_1x3)

    layerNorm_3x1 = tf.keras.layers.ELU()(conv2d_3x1)

    if norm == 'layer_norm1d':
        BN_3x1 = tf.keras.layers.LayerNormalization(axis=-1)(layerNorm_3x1)
    elif norm == 'layer_norm2d':
        BN_3x1 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))(layerNorm_3x1)
    else:
        BN_3x1 = tf.keras.layers.BatchNormalization(axis=-1)(layerNorm_3x1)

    
    return tf.keras.Model(inputs=input_layer, outputs=BN_3x1)



def DivEncLayer(input_layer, q, unit_dim):
    """
    Multi-head projection a.k.a. 'divide and encode' layer:
        
    • The concept of 'divide and encode' was discovered  in Lai et.al.,
     'Simultaneous Feature Learning and Hash Coding with Deep Neural Networks',
      2015. https://arxiv.org/abs/1504.03410
    • It was also adopted in Gfeller et.al. 'Now Playing: Continuo-
      us low-power music recognition', 2017. https://arxiv.org/abs/1711.10958
    
    Arguments
    ---------
    q: (int) number of slices as 'slice_length = input_dim / q'
    unit_dim: [(int), (int)]
    norm: 'layer_norm1d' or 'layer_norm2d' uses 1D-layer normalization on the feature.
          'batch_norm' or else uses batch normalization. Default is 'layer_norm2d'.

    Input
    -----
    x: (B,1,1,C)
    
    Returns
    -------
    emb: (B,Q)
    """
    
    print(f"input_layer-div:{input_layer}")


    #input_layer = tf.keras.Input(shape=(None, None, 1))
    flatten_layer = tf.keras.layers.Flatten()(input_layer)

    layers = []

    for i in range(q):
        dense_1 = tf.keras.layers.Dense(unit_dim[0], activation='elu')(flatten_layer)
        dense_2 = tf.keras.layers.Dense(unit_dim[1])(dense_1)

        """if norm in ['layer_norm1d', 'layer_norm2d']:
            BN = tf.keras.layers.LayerNormalization(axis=-1)(dense_2)
        else:
            BN = tf.keras.layers.BatchNormalization(axis=-1)(dense_2)"""

        layers.append(dense_2)

    return tf.keras.Model(inputs=input_layer, outputs=tf.keras.layers.Concatenate(axis=1)(layers))

In [243]:
def FingerPrinter(input_shape,
                 front_hidden_ch,
                 front_strides,
                 emb_sz,
                 fc_unit_dim,
                 norm,
                 use_L2layer):
    """
    Fingerprinter: 'Neural Audio Fingerprint for High-specific Audio Retrieval
        based on Contrastive Learning', https://arxiv.org/abs/2010.11910
    
    IN >> [Convlayer]x8 >> [DivEncLayer] >> [L2Normalizer] >> OUT 
    
    Arguments
    ---------
    input_shape: tuple (int), not including the batch size
    front_hidden_ch: (list)
    front_strides: (list)
    emb_sz: (int) default=128
    fc_unit_dim: (list) default=[32,1]
    norm: 'layer_norm1d' for normalization on Freq axis. 
          'layer_norm2d' for normalization on on FxT space (default).
          'batch_norm' or else, batch-normalization.
    use_L2layer: True (default)
    
    • Note: batch-normalization will not work properly with TPUs.
                    
    
    Input
    -----
    x: (B,F,T,1)
    
        
    Returns
    -------
    emb: (B,Q) 
    
    """
    print(input_shape)
    input_layer = tf.keras.Input(shape=input_shape, dtype=tf.float32)
    #input_layer=input_shape
    print(input_layer)


    # Front conv layers
    for i in range(len(front_strides)):
        front_conv = ConvLayer(input_layer=input_layer,hidden_ch=front_hidden_ch[i],
            strides=front_strides[i], norm=norm)
        
        conv = front_conv(input_layer)
    
    print(f"conv.shape: {conv.shape}")
    

    #podia fazer o flatten aqui?


    # Divide & Encoder layer
    div_enc_layer = DivEncLayer(input_layer=input_layer, q=emb_sz, unit_dim=fc_unit_dim)
    print(f"div_enc_layer:{div_enc_layer}")
    x = tf.reshape(conv, shape=[conv.shape[0], emb_sz, conv.shape[-1]]) #x = tf.reshape(conv, shape=[conv.shape[0], q, -1])
    print(f"x.shape: {x.shape}")
    x = div_enc_layer(x)

    

    if use_L2layer:
        x = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x)
        #return tf.math.l2_normalize(x, axis=1)
    
    return tf.keras.Model(inputs=input_layer, outputs=x)

In [244]:
def get_fingerprinter(cfg, trainable=False):
    """
    Input length : 1s or 2s
    
    Arguements
    ----------
    cfg : (dict)
        created from the '.yaml' located in /config dicrectory

    Returns
    -------
    <tf.keras.Model> FingerPrinter object
    
    """
    input_shape = (256, 32, 1)
    emb_sz = cfg['MODEL']['EMB_SZ']
    norm = cfg['MODEL']['BN']
    fc_unit_dim = [32, 1]

    front_hidden_ch=[128, 128, 256, 256, 512, 512, 1024, 1024]
    front_strides=[[(1,2), (2,1)], [(1,2), (2,1)],
                [(1,2), (2,1)], [(1,2), (2,1)],
                [(1,1), (2,1)], [(1,2), (2,1)],
                [(1,1), (2,1)], [(1,2), (2,1)]]
    
    model = FingerPrinter(input_shape=input_shape,
                          front_hidden_ch=front_hidden_ch,
                          front_strides=front_strides,
                          emb_sz=emb_sz,
                          fc_unit_dim=fc_unit_dim,
                          norm=norm)
    
    model.trainable = trainable
    return model

In [245]:
#input_1s = tf.constant(np.random.randn(3,256,32,1), dtype=tf.float32) # BxFxTx1
input_1s=(256, 32, 1)

front_hidden_ch=[128, 128, 256, 256, 512, 512, 1024, 1024]
front_strides=[[(1,2), (2,1)], [(1,2), (2,1)],
            [(1,2), (2,1)], [(1,2), (2,1)],
            [(1,1), (2,1)], [(1,2), (2,1)],
            [(1,1), (2,1)], [(1,2), (2,1)]]

use_L2layer:bool = True


fprinter = FingerPrinter(input_shape=input_1s, front_hidden_ch=front_hidden_ch, front_strides=front_strides, emb_sz=128, fc_unit_dim=[32, 1], norm='layer_norm2d', use_L2layer=use_L2layer)
emb_1s = fprinter(input_1s) # BxD

(256, 32, 1)
KerasTensor(type_spec=TensorSpec(shape=(None, 256, 32, 1), dtype=tf.float32, name='input_40'), name='input_40', description="created by layer 'input_40'")
conv.shape: (None, 128, 16, 1024)
input_layer-div:KerasTensor(type_spec=TensorSpec(shape=(None, 256, 32, 1), dtype=tf.float32, name='input_40'), name='input_40', description="created by layer 'input_40'")
div_enc_layer:<keras.src.engine.functional.Functional object at 0x7aab08139dd0>


TypeError: Exception encountered when calling layer "tf.reshape_5" (type TFOpLambda).

Failed to convert elements of [None, 128, 1024] to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.

Call arguments received by layer "tf.reshape_5" (type TFOpLambda):
  • tensor=tf.Tensor(shape=(None, 128, 16, 1024), dtype=float32)
  • shape=['None', '128', '1024']
  • name=None

In [171]:
input_1s = tf.constant(np.random.randn(3,256,32,1), dtype=tf.float32) # BxFxTx1
emb_sz = cfg['MODEL']['EMB_SZ']
norm = cfg['MODEL']['BN']
fc_unit_dim = [32, 1]

model = FingerPrinter(input_shape=input_1s,
                    emb_sz=emb_sz,
                    fc_unit_dim=fc_unit_dim,
                    norm=norm)

TypeError: only integer scalar arrays can be converted to a scalar index

Rascunho

In [160]:
# -*- coding: utf-8 -*-
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
""" nnfp.py

'Neural Audio Fingerprint for High-specific Audio Retrieval based on 
Contrastive Learning', https://arxiv.org/abs/2010.11910

USAGE:
    
    Please see test() in the below.
    
"""
import numpy as np
import tensorflow as tf
assert tf.__version__ >= "2.0"


class ConvLayer(tf.keras.layers.Layer):
    """
    Separable convolution layer
    
    Arguments
    ---------
    hidden_ch: (int)
    strides: [(int, int), (int, int)]
    norm: 'layer_norm1d' for normalization on Freq axis. (default)
          'layer_norm2d' for normalization on on FxT space 
          'batch_norm' or else, batch-normalization
    
    Input
    -----
    x: (B,F,T,1)
    
    [Conv1x3]>>[ELU]>>[BN]>>[Conv3x1]>>[ELU]>>[BN]
    
    Output
    ------
    x: (B,F,T,C) with {F=F/stride, T=T/stride, C=hidden_ch}
    
    """
    def __init__(self,
                 hidden_ch=128,
                 strides=[(1,1),(1,1)],
                 norm='layer_norm2d'):
        super(ConvLayer, self).__init__()
        self.conv2d_1x3 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(1, 3),
                                                 strides=strides[0],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')
        self.conv2d_3x1 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(3, 1),
                                                 strides=strides[1],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')
        
        if norm == 'layer_norm1d':
            self.BN_1x3 = tf.keras.layers.LayerNormalization(axis=-1)
            self.BN_3x1 = tf.keras.layers.LayerNormalization(axis=-1)
        elif norm == 'layer_norm2d':
            self.BN_1x3 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))
            self.BN_3x1 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))
        else:
            self.BN_1x3 = tf.keras.layers.BatchNormalization(axis=-1) # Fix axis: 2020 Apr20
            self.BN_3x1 = tf.keras.layers.BatchNormalization(axis=-1)
            
        self.forward = tf.keras.Sequential([self.conv2d_1x3,
                                            tf.keras.layers.ELU(),
                                            self.BN_1x3,
                                            self.conv2d_3x1,
                                            tf.keras.layers.ELU(),
                                            self.BN_3x1
                                            ])

       
    def call(self, x):
        return self.forward(x)


class DivEncLayer(tf.keras.layers.Layer):
    """
    Multi-head projection a.k.a. 'divide and encode' layer:
        
    • The concept of 'divide and encode' was discovered  in Lai et.al.,
     'Simultaneous Feature Learning and Hash Coding with Deep Neural Networks',
      2015. https://arxiv.org/abs/1504.03410
    • It was also adopted in Gfeller et.al. 'Now Playing: Continuo-
      us low-power music recognition', 2017. https://arxiv.org/abs/1711.10958
    
    Arguments
    ---------
    q: (int) number of slices as 'slice_length = input_dim / q'
    unit_dim: [(int), (int)]
    norm: 'layer_norm1d' or 'layer_norm2d' uses 1D-layer normalization on the feature.
          'batch_norm' or else uses batch normalization. Default is 'layer_norm2d'.

    Input
    -----
    x: (B,1,1,C)
    
    Returns
    -------
    emb: (B,Q)
    
    """
    def __init__(self, q=128, unit_dim=[32, 1], norm='batch_norm'):
        super(DivEncLayer, self).__init__()

        self.q = q
        self.unit_dim = unit_dim
        self.norm = norm
        
        if norm in ['layer_norm1d', 'layer_norm2d']:
            self.BN = [tf.keras.layers.LayerNormalization(axis=-1) for i in range(q)]
        else:
            self.BN = [tf.keras.layers.BatchNormalization(axis=-1) for i in range(q)]
            
        self.split_fc_layers = self._construct_layers() 


    def build(self, input_shape):
        # Prepare output embedding variable for dynamic batch-size 
        self.slice_length = int(input_shape[-1] / self.q)

 
    def _construct_layers(self):
        layers = list()
        for i in range(self.q): # q: num_slices
            layers.append(tf.keras.Sequential([tf.keras.layers.Dense(self.unit_dim[0], activation='elu'),
                                               #self.BN[i],
                                               tf.keras.layers.Dense(self.unit_dim[1])]))
        return layers

 
    @tf.function
    def _split_encoding(self, x_slices):
        """
        Input: (B,Q,S)
        Returns: (B,Q)
        
        """
        out = list()
        for i in range(self.q):
            out.append(self.split_fc_layers[i](x_slices[:, i, :]))
        return tf.concat(out, axis=1)

    
    def call(self, x): # x: (B,1,1,2048)
        x = tf.reshape(x, shape=[x.shape[0], self.q, -1]) # (B,Q,S); Q=num_slices; S=slice length; (B,128,8 or 16)
        return self._split_encoding(x)
    
    
class FingerPrinter(tf.keras.Model):
    """
    Fingerprinter: 'Neural Audio Fingerprint for High-specific Audio Retrieval
        based on Contrastive Learning', https://arxiv.org/abs/2010.11910
    
    IN >> [Convlayer]x8 >> [DivEncLayer] >> [L2Normalizer] >> OUT 
    
    Arguments
    ---------
    input_shape: tuple (int), not including the batch size
    front_hidden_ch: (list)
    front_strides: (list)
    emb_sz: (int) default=128
    fc_unit_dim: (list) default=[32,1]
    norm: 'layer_norm1d' for normalization on Freq axis. 
          'layer_norm2d' for normalization on on FxT space (default).
          'batch_norm' or else, batch-normalization.
    use_L2layer: True (default)
    
    • Note: batch-normalization will not work properly with TPUs.
                    
    
    Input
    -----
    x: (B,F,T,1)
    
        
    Returns
    -------
    emb: (B,Q) 
    
    """
    def __init__(self,
                 input_shape=(256,32,1),
                 front_hidden_ch=[128, 128, 256, 256, 512, 512, 1024, 1024],
                 front_strides=[[(1,2), (2,1)], [(1,2), (2,1)],
                                [(1,2), (2,1)], [(1,2), (2,1)],
                                [(1,1), (2,1)], [(1,2), (2,1)],
                                [(1,1), (2,1)], [(1,2), (2,1)]],
                 emb_sz=128, # q
                 fc_unit_dim=[32,1],
                 norm='layer_norm2d',
                 use_L2layer=True):
        super(FingerPrinter, self).__init__()
        self.front_hidden_ch = front_hidden_ch
        self.front_strides = front_strides
        self.emb_sz=emb_sz
        self.norm = norm
        self.use_L2layer = use_L2layer
        
        self.n_clayers = len(front_strides)
        self.front_conv = tf.keras.Sequential(name='ConvLayers')
        if ((front_hidden_ch[-1] % emb_sz) != 0):
            front_hidden_ch[-1] = ((front_hidden_ch[-1]//emb_sz) + 1) * emb_sz                
        
        # Front (sep-)conv layers
        #x = tf.zeros((1,)+ input_shape, dtype=tf.float32)
        #print(f"ConvLayer entrada: {self.front_conv(x).shape}")
        for i in range(self.n_clayers):
            self.front_conv.add(ConvLayer(hidden_ch=front_hidden_ch[i],
                strides=front_strides[i], norm=norm))
            #print(f"ConvLayer {i+1}: {self.front_conv(x).shape}")

        self.front_conv.add(tf.keras.layers.Flatten()) # (B,F',T',C) >> (B,D)

        #print(f"Flatten: {self.front_conv}") 

        # Divide & Encoder layer
        self.div_enc = DivEncLayer(q=emb_sz, unit_dim=fc_unit_dim, norm=norm)

        
    @tf.function
    def call(self, inputs):
        x = self.front_conv(inputs) # (B,D) with D = (T/2^4) x last_hidden_ch
        x = self.div_enc(x) # (B,Q)
        if self.use_L2layer:
            return tf.math.l2_normalize(x, axis=1) 
        else:
            return x


def get_fingerprinter(cfg, trainable=False):
    """
    Input length : 1s or 2s
    
    Arguements
    ----------
    cfg : (dict)
        created from the '.yaml' located in /config dicrectory

    Returns
    -------
    <tf.keras.Model> FingerPrinter object
    
    """
    input_shape = (256, 32, 1) 
    emb_sz = cfg['MODEL']['EMB_SZ']
    norm = cfg['MODEL']['BN']
    fc_unit_dim = [32, 1]
    
    m = FingerPrinter(input_shape=input_shape,
                      emb_sz=emb_sz,
                      fc_unit_dim=fc_unit_dim,
                      norm=norm)
    m.trainable = trainable
    return m

In [161]:
input_1s = tf.constant(np.random.randn(3,256,32,1), dtype=tf.float32) # BxFxTx1
fprinter = FingerPrinter(emb_sz=128, fc_unit_dim=[32, 1], norm='layer_norm2d')
emb_1s = fprinter(input_1s) # BxD

ConvLayer entrada: (1, 256, 32, 1)
ConvLayer 1: (1, 128, 16, 128)
ConvLayer 2: (1, 64, 8, 128)
ConvLayer 3: (1, 32, 4, 256)
ConvLayer 4: (1, 16, 2, 256)
ConvLayer 5: (1, 8, 2, 512)
ConvLayer 6: (1, 4, 1, 512)
ConvLayer 7: (1, 2, 1, 1024)
ConvLayer 8: (1, 1, 1, 1024)
Flatten: <keras.src.engine.sequential.Sequential object at 0x7aab074e5910>


In [None]:
emb_1s

In [246]:
# -*- coding: utf-8 -*-
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
""" nnfp.py

'Neural Audio Fingerprint for High-specific Audio Retrieval based on 
Contrastive Learning', https://arxiv.org/abs/2010.11910

USAGE:
    
    Please see test() in the below.
    
"""
import numpy as np
import tensorflow as tf
assert tf.__version__ >= "2.0"


class ConvLayer(tf.keras.layers.Layer):
    """
    Separable convolution layer
    
    Arguments
    ---------
    hidden_ch: (int)
    strides: [(int, int), (int, int)]
    norm: 'layer_norm1d' for normalization on Freq axis. (default)
          'layer_norm2d' for normalization on on FxT space 
          'batch_norm' or else, batch-normalization
    
    Input
    -----
    x: (B,F,T,1)
    
    [Conv1x3]>>[ELU]>>[BN]>>[Conv3x1]>>[ELU]>>[BN]
    
    Output
    ------
    x: (B,F,T,C) with {F=F/stride, T=T/stride, C=hidden_ch}
    
    """
    def __init__(self,
                 hidden_ch=128,
                 strides=[(1,1),(1,1)],
                 norm='layer_norm2d'):
        super(ConvLayer, self).__init__()
        self.conv2d_1x3 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(1, 3),
                                                 strides=strides[0],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')
        self.conv2d_3x1 = tf.keras.layers.Conv2D(hidden_ch,
                                                 kernel_size=(3, 1),
                                                 strides=strides[1],
                                                 padding='SAME',
                                                 dilation_rate=(1, 1),
                                                 kernel_initializer='glorot_uniform',
                                                 bias_initializer='zeros')
        
        if norm == 'layer_norm1d':
            self.BN_1x3 = tf.keras.layers.LayerNormalization(axis=-1)
            self.BN_3x1 = tf.keras.layers.LayerNormalization(axis=-1)
        elif norm == 'layer_norm2d':
            self.BN_1x3 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))
            self.BN_3x1 = tf.keras.layers.LayerNormalization(axis=(1, 2, 3))
        else:
            self.BN_1x3 = tf.keras.layers.BatchNormalization(axis=-1) # Fix axis: 2020 Apr20
            self.BN_3x1 = tf.keras.layers.BatchNormalization(axis=-1)
            
        self.forward = tf.keras.Sequential([self.conv2d_1x3,
                                            tf.keras.layers.ELU(),
                                            self.BN_1x3,
                                            self.conv2d_3x1,
                                            tf.keras.layers.ELU(),
                                            self.BN_3x1
                                            ])

       
    def call(self, x):
        return self.forward(x)


class DivEncLayer(tf.keras.layers.Layer):
    """
    Multi-head projection a.k.a. 'divide and encode' layer:
        
    • The concept of 'divide and encode' was discovered  in Lai et.al.,
     'Simultaneous Feature Learning and Hash Coding with Deep Neural Networks',
      2015. https://arxiv.org/abs/1504.03410
    • It was also adopted in Gfeller et.al. 'Now Playing: Continuo-
      us low-power music recognition', 2017. https://arxiv.org/abs/1711.10958
    
    Arguments
    ---------
    q: (int) number of slices as 'slice_length = input_dim / q'
    unit_dim: [(int), (int)]
    norm: 'layer_norm1d' or 'layer_norm2d' uses 1D-layer normalization on the feature.
          'batch_norm' or else uses batch normalization. Default is 'layer_norm2d'.

    Input
    -----
    x: (B,1,1,C)
    
    Returns
    -------
    emb: (B,Q)
    
    """
    def __init__(self, q=128, unit_dim=[32, 1], norm='batch_norm'):
        super(DivEncLayer, self).__init__()

        self.q = q
        self.unit_dim = unit_dim
        self.norm = norm
        
        if norm in ['layer_norm1d', 'layer_norm2d']:
            self.BN = [tf.keras.layers.LayerNormalization(axis=-1) for i in range(q)]
        else:
            self.BN = [tf.keras.layers.BatchNormalization(axis=-1) for i in range(q)]
            
        self.split_fc_layers = self._construct_layers() 


    def build(self, input_shape):
        # Prepare output embedding variable for dynamic batch-size 
        self.slice_length = int(input_shape[-1] / self.q)

 
    def _construct_layers(self):
        layers = list()
        for i in range(self.q): # q: num_slices
            layers.append(tf.keras.Sequential([tf.keras.layers.Dense(self.unit_dim[0], activation='elu'),
                                               #self.BN[i],
                                               tf.keras.layers.Dense(self.unit_dim[1])]))
        return layers

 
    @tf.function
    def _split_encoding(self, x_slices):
        """
        Input: (B,Q,S)
        Returns: (B,Q)
        
        """
        out = list()
        for i in range(self.q):
            out.append(self.split_fc_layers[i](x_slices[:, i, :]))
        return tf.concat(out, axis=1)

    
    def call(self, x): # x: (B,1,1,2048)
        x = tf.reshape(x, shape=[x.shape[0], self.q, -1]) # (B,Q,S); Q=num_slices; S=slice length; (B,128,8 or 16)
        return self._split_encoding(x)
    
    
class FingerPrinter(tf.keras.Model):
    """
    Fingerprinter: 'Neural Audio Fingerprint for High-specific Audio Retrieval
        based on Contrastive Learning', https://arxiv.org/abs/2010.11910
    
    IN >> [Convlayer]x8 >> [DivEncLayer] >> [L2Normalizer] >> OUT 
    
    Arguments
    ---------
    input_shape: tuple (int), not including the batch size
    front_hidden_ch: (list)
    front_strides: (list)
    emb_sz: (int) default=128
    fc_unit_dim: (list) default=[32,1]
    norm: 'layer_norm1d' for normalization on Freq axis. 
          'layer_norm2d' for normalization on on FxT space (default).
          'batch_norm' or else, batch-normalization.
    use_L2layer: True (default)
    
    • Note: batch-normalization will not work properly with TPUs.
                    
    
    Input
    -----
    x: (B,F,T,1)
    
        
    Returns
    -------
    emb: (B,Q) 
    
    """
    def __init__(self,
                 input_shape=(256,32,1),
                 front_hidden_ch=[128, 128, 256, 256, 512, 512, 1024, 1024],
                 front_strides=[[(1,2), (2,1)], [(1,2), (2,1)],
                                [(1,2), (2,1)], [(1,2), (2,1)],
                                [(1,1), (2,1)], [(1,2), (2,1)],
                                [(1,1), (2,1)], [(1,2), (2,1)]],
                 emb_sz=128, # q
                 fc_unit_dim=[32,1],
                 norm='layer_norm2d',
                 use_L2layer=True):
        super(FingerPrinter, self).__init__()
        self.front_hidden_ch = front_hidden_ch
        self.front_strides = front_strides
        self.emb_sz=emb_sz
        self.norm = norm
        self.use_L2layer = use_L2layer
        
        self.n_clayers = len(front_strides)
        self.front_conv = tf.keras.Sequential(name='ConvLayers')
        if ((front_hidden_ch[-1] % emb_sz) != 0):
            front_hidden_ch[-1] = ((front_hidden_ch[-1]//emb_sz) + 1) * emb_sz                
        
        # Front (sep-)conv layers
        #x = tf.zeros((1,)+ input_shape, dtype=tf.float32)
        #print(f"ConvLayer entrada: {self.front_conv(x).shape}")
        for i in range(self.n_clayers):
            self.front_conv.add(ConvLayer(hidden_ch=front_hidden_ch[i],
                strides=front_strides[i], norm=norm))
            #print(f"ConvLayer {i+1}: {self.front_conv(x).shape}")
        self.front_conv.add(tf.keras.layers.Flatten()) # (B,F',T',C) >> (B,D)
            
        # Divide & Encoder layer
        self.div_enc = DivEncLayer(q=emb_sz, unit_dim=fc_unit_dim, norm=norm)

        
    @tf.function
    def call(self, inputs):
        x = self.front_conv(inputs) # (B,D) with D = (T/2^4) x last_hidden_ch
        x = self.div_enc(x) # (B,Q)
        if self.use_L2layer:
            return tf.math.l2_normalize(x, axis=1) 
        else:
            return x


def get_fingerprinter(cfg, trainable=False):
    """
    Input length : 1s or 2s
    
    Arguements
    ----------
    cfg : (dict)
        created from the '.yaml' located in /config dicrectory

    Returns
    -------
    <tf.keras.Model> FingerPrinter object
    
    """
    input_shape = (256, 32, 1) 
    emb_sz = cfg['MODEL']['EMB_SZ']
    norm = cfg['MODEL']['BN']
    fc_unit_dim = [32, 1]
    
    m = FingerPrinter(input_shape=input_shape,
                      emb_sz=emb_sz,
                      fc_unit_dim=fc_unit_dim,
                      norm=norm)
    m.trainable = trainable
    return m

In [258]:
input_1s = tf.constant(np.random.randn(1,256,32,1), dtype=tf.float32) # BxFxTx1
fprinter = FingerPrinter(emb_sz=128, fc_unit_dim=[32, 1], norm='layer_norm2d')
emb_1s = fprinter(input_1s) # BxD

In [267]:
fprinter(input_1s)

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.06714662, -0.08877325,  0.18096662,  0.03541807,  0.04442547,
         0.11626906, -0.02300515, -0.00791979,  0.06547429, -0.00591726,
         0.07874333,  0.03066022,  0.04967431, -0.09657394,  0.01860259,
         0.1795162 , -0.09594179, -0.04747898, -0.04731065,  0.0082687 ,
         0.06911098,  0.12106568,  0.00540292, -0.01218743, -0.05398494,
         0.05139915, -0.10475634,  0.00499371,  0.03684568, -0.09473342,
         0.1336947 , -0.02190757, -0.15067072, -0.11182376,  0.04819969,
        -0.08911032, -0.23060912,  0.04759725,  0.05027833, -0.02204659,
         0.107933  , -0.03450353,  0.10758416, -0.07598781,  0.02761962,
        -0.03841873, -0.02068278,  0.01060095, -0.05038963,  0.02484113,
        -0.07576414,  0.00573614,  0.02410174,  0.15370564,  0.2229935 ,
        -0.00034676, -0.10300653, -0.05699234,  0.04484287, -0.02547638,
        -0.0475092 , -0.01074172,  0.19574203,  0.03643182, -0.00032796,
 

In [259]:
emb_1s

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[-0.06714662, -0.08877325,  0.18096662,  0.03541807,  0.04442547,
         0.11626906, -0.02300515, -0.00791979,  0.06547429, -0.00591726,
         0.07874333,  0.03066022,  0.04967431, -0.09657394,  0.01860259,
         0.1795162 , -0.09594179, -0.04747898, -0.04731065,  0.0082687 ,
         0.06911098,  0.12106568,  0.00540292, -0.01218743, -0.05398494,
         0.05139915, -0.10475634,  0.00499371,  0.03684568, -0.09473342,
         0.1336947 , -0.02190757, -0.15067072, -0.11182376,  0.04819969,
        -0.08911032, -0.23060912,  0.04759725,  0.05027833, -0.02204659,
         0.107933  , -0.03450353,  0.10758416, -0.07598781,  0.02761962,
        -0.03841873, -0.02068278,  0.01060095, -0.05038963,  0.02484113,
        -0.07576414,  0.00573614,  0.02410174,  0.15370564,  0.2229935 ,
        -0.00034676, -0.10300653, -0.05699234,  0.04484287, -0.02547638,
        -0.0475092 , -0.01074172,  0.19574203,  0.03643182, -0.00032796,
 

In [253]:
import numpy as np

In [261]:
np.dot(emb_1s[0], emb_1s[0])

1.0