In [118]:
import numpy as np
import librosa
import torch
import laion_clap
from laion_clap.training.data import get_audio_features
from es_dataset import EpidemicSoundDataset

In [2]:
# quantization
def int16_to_float32(x):
    """Rescale int16-range sample values to float32 by dividing by 32767.

    Args:
        x: NumPy array of sample values on the int16 scale.

    Returns:
        Array of dtype ``np.float32`` with the same shape as ``x``.
    """
    scaled = x / 32767.0
    return scaled.astype(np.float32)


def float32_to_int16(x):
    """Quantize float samples to int16.

    Values are first limited to the closed interval [-1, 1], then scaled by
    32767 and cast to ``np.int16`` (the cast drops the fractional part).

    Args:
        x: NumPy array of float sample values.

    Returns:
        Array of dtype ``np.int16`` with the same shape as ``x``.
    """
    bounded = np.clip(x, -1.0, 1.0)
    scaled = bounded * 32767.0
    return scaled.astype(np.int16)

In [None]:
# Build the CLAP wrapper with feature fusion enabled, then load its weights.
# Every embedding cell below uses this `model` object.
model = laion_clap.CLAP_Module(enable_fusion=True)
model.load_ckpt() # download the default pretrained checkpoint.

In [104]:
# Build the fusion input features for one waveform by hand.
# NOTE: `audio_data` must already exist in the kernel; in this notebook it is
# created by the librosa loading cell that appears further down.

# Round-trip through int16 to quantize the samples, then convert to a float tensor.
audio_waveform = int16_to_float32(float32_to_int16(audio_data))
audio_waveform = torch.from_numpy(audio_waveform).float()

# get_audio_features fills the dict it is given and returns it.
temp_dict = {}
temp_dict = get_audio_features(
    temp_dict, audio_waveform,
    # Length limit in samples, read from the model config (480000 for this
    # checkpoint) instead of repeating the number as a literal.
    model.model_cfg['audio_cfg']['clip_samples'],
    data_truncating='fusion',
    data_filling='repeatpad',
    audio_cfg=model.model_cfg['audio_cfg'],
    require_grad=False
)




In [115]:
# Check whether the 'mel_fusion' feature tracks gradients
# (the cell that builds `temp_dict` passes require_grad=False).
temp_dict['mel_fusion'].requires_grad

False

In [105]:
# Display the audio configuration attached to the loaded model.
model.model_cfg['audio_cfg']

{'audio_length': 1024,
 'clip_samples': 480000,
 'mel_bins': 64,
 'sample_rate': 48000,
 'window_size': 1024,
 'hop_size': 480,
 'fmin': 50,
 'fmax': 14000,
 'class_num': 527,
 'model_type': 'HTSAT',
 'model_name': 'tiny'}

In [110]:
# Load the test clip that the embedding cells operate on.
audio_data, _ = librosa.load('test_song.mp3', sr=48000) # sample rate should be 48000
# audio_data = audio_data[:48000*15] # optional crop: 48000*15 samples = 15 seconds at 48 kHz
audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)

In [119]:
# Open the Epidemic Sound dataset from its root directory.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
es_dataset = EpidemicSoundDataset('/fastscratch/korte/es_dataset/')

In [100]:
# %%timeit
# The timing magic above is left disabled; the timing printed under this cell
# presumably comes from an earlier run with it enabled.
# Embed `audio_data` through the CLAP wrapper with use_tensor=False
# (presumably the NumPy input path — TODO confirm against the laion_clap docs).
audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=False)

244 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [93]:
# Peek at the last 20 embedding dimensions, then print the full shape.
print(audio_embed[:,-20:])
print(audio_embed.shape)

[[ 0.03378514  0.08276038 -0.04029229 -0.03356505  0.02966368 -0.0266439
   0.03970947 -0.00148861  0.03154013  0.02451845  0.03857341  0.02047969
  -0.01352776 -0.02060715 -0.03384665  0.01845452  0.01468264  0.04995199
  -0.02031711 -0.06094122]]
(1, 512)


In [7]:
# Get audio embeddings from audio data, this time passing a torch tensor:
# load the clip, quantize it, convert it to a tensor, then embed it.
audio_data, _ = librosa.load('test_song.mp3', sr=48000) # sample rate should be 48000
audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
audio_data = torch.from_numpy(int16_to_float32(float32_to_int16(audio_data))).float() # quantize before sending it to the model
audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=True)
# Peek at the last 20 embedding dimensions, then print the full shape.
print(audio_embed[:,-20:])
print(audio_embed.shape)

tensor([[-0.0131,  0.0785,  0.0767, -0.0259, -0.0184,  0.0028,  0.0451,  0.0133,
         -0.0327,  0.0620, -0.0639,  0.0697,  0.0027, -0.0418, -0.0539,  0.0003,
         -0.0098, -0.0034, -0.0337, -0.0032]], device='cuda:0',
       grad_fn=<SliceBackward0>)
torch.Size([1, 512])


In [134]:
%%timeit
audio_data = es_dataset[0]['audio']



252 ms ± 16.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [129]:
# Call the underlying model directly with a one-element list wrapping `audio_data`,
# bypassing model.get_audio_embedding_from_data.
# NOTE(review): presumably `audio_data` is an already-featurized dataset sample here — confirm.
audio_embed = model.model.get_audio_embedding([audio_data])

In [133]:
# Display the embedding detached from the autograd graph (the result is shown, not stored).
audio_embed.detach()

tensor([[-9.5037e-02,  2.0186e-03, -5.7755e-02,  6.9711e-02,  1.5338e-02,
          1.1432e-02,  4.1709e-02, -7.5621e-03, -9.7550e-02, -2.3002e-02,
          1.7306e-02,  8.4966e-02,  2.1912e-03, -5.5164e-02, -1.5723e-02,
         -2.5623e-02, -1.6994e-02, -4.3617e-02, -6.5937e-02,  2.6272e-03,
          1.2739e-02,  6.1586e-03,  3.8209e-02, -8.8046e-02,  1.1829e-02,
          1.7457e-02,  1.0160e-02,  2.0708e-02, -2.6445e-02, -1.3777e-02,
          1.2003e-03,  9.4762e-02, -4.7691e-02,  3.7207e-02, -5.9242e-02,
          5.9576e-03, -5.5460e-02, -2.3160e-02,  3.3704e-02, -4.3540e-03,
          5.5291e-03, -1.5345e-02, -3.5983e-02, -1.0820e-02, -1.5250e-02,
          2.8379e-02,  8.6993e-02,  4.7084e-02, -1.1649e-02,  3.4456e-02,
          5.9233e-02,  1.5146e-02, -3.0219e-02,  4.1252e-02, -1.7663e-02,
          5.4934e-03, -1.9642e-02,  3.9494e-03, -1.5750e-02, -1.5834e-02,
         -1.8333e-02,  1.2006e-02, -3.1919e-02, -2.3923e-02,  5.1555e-02,
          5.9433e-02,  8.4424e-02,  2.