In [1]:
import numpy as np
import librosa
import torch
import laion_clap
from laion_clap.training.data import get_audio_features
from es_dataset import EpidemicSoundDataset, EpidemicSoundDataModule
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


[2023-09-03 21:16:51,216] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# quantization
def int16_to_float32(x):
    """Convert int16 PCM sample values to float32 by dividing by 32767.

    Args:
        x: NumPy array, or any array-like accepted by ``np.asarray`` (list,
            tuple, scalar), holding int16 sample values.

    Returns:
        float32 NumPy array (a NumPy float32 scalar for scalar input). The
        value 32767 maps to exactly 1.0.
    """
    # Coerce first so lists and scalars work too; float32_to_int16 already
    # accepts array-likes because it goes through np.clip.
    samples = np.asarray(x)
    return (samples / 32767.0).astype(np.float32)


def float32_to_int16(x):
    """Quantize float samples to int16.

    Values are first limited to the range [-1, 1], then multiplied by 32767
    and cast to int16 (the cast drops the fractional part; it does not round).
    """
    clipped = np.clip(x, -1.0, 1.0)
    scaled = clipped * 32767.0
    return scaled.astype(np.int16)

In [None]:
# Build the CLAP wrapper with feature fusion enabled, then load pretrained weights.
# NOTE(review): model_id=3 is presumably the checkpoint matching enable_fusion=True — confirm against laion_clap.
model = laion_clap.CLAP_Module(enable_fusion=True)
model.load_ckpt(model_id=3) # download the default pretrained checkpoint.

In [5]:
# torch.save(model.model.audio_branch.state_dict(), '/scratch/korte/audiffuse/clap.ckpt')

In [9]:
# Display the audio configuration object held by the underlying model.
model.model.audio_cfg

CLAPAudioCfp(model_type='HTSAT', model_name='tiny', sample_rate=48000, audio_length=1024, window_size=1024, hop_size=480, fmin=50, fmax=14000, class_num=527, mel_bins=64, clip_samples=480000)

In [None]:
# Build the model-ready audio features for one clip by hand.
# NOTE: `audio_data` comes from the librosa loading cell further down, so that
# cell has to be run before this one.

# Round-trip through int16 to quantize the clip before feature extraction.
audio_waveform = int16_to_float32(float32_to_int16(audio_data))
# The loading cells reshape the clip to (1, T); get_audio_features appears to
# take a single 1-D waveform (NOTE(review): confirm against laion_clap), so use
# the first clip. np.atleast_2d keeps an already 1-D input working unchanged.
audio_waveform = torch.from_numpy(np.atleast_2d(audio_waveform)[0]).float()

temp_dict = get_audio_features(
    {}, audio_waveform,
    # Maximum clip length in samples, read from the config instead of
    # hard-coding it (480000 for this checkpoint).
    model.model_cfg['audio_cfg']['clip_samples'],
    data_truncating='fusion',
    data_filling='repeatpad',
    audio_cfg=model.model_cfg['audio_cfg'],
    require_grad=False
)


In [115]:
# Check whether the 'mel_fusion' features track gradients (require_grad=False was passed above).
temp_dict['mel_fusion'].requires_grad

False

In [105]:
# Display the audio configuration dict that is passed to get_audio_features.
model.model_cfg['audio_cfg']

{'audio_length': 1024,
 'clip_samples': 480000,
 'mel_bins': 64,
 'sample_rate': 48000,
 'window_size': 1024,
 'hop_size': 480,
 'fmin': 50,
 'fmax': 14000,
 'class_num': 527,
 'model_type': 'HTSAT',
 'model_name': 'tiny'}

In [9]:
# Load the test clip as a NumPy waveform; embeddings are computed from it in later cells.
audio_data, _ = librosa.load('test_song.mp3', sr=48000) # sample rate should be 48000
# audio_data = audio_data[:48000*15] # optional: keep only the first 15 seconds (48000 samples/s * 15 s)
audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)

In [12]:
# es_dataset = EpidemicSoundDataset('/fastscratch/korte/es_dataset/')
# NOTE(review): despite its name, `es_dataset` is bound to the DataModule, not the Dataset.
es_dataset = EpidemicSoundDataModule('/fastscratch/korte/es_dataset/')
# setup() is called with None as its argument before the training loader is requested.
es_dataset.setup(None)
loader = es_dataset.train_dataloader()

In [15]:
# Pull the first batch from the training loader and keep its 'audio' entry.
audio_set = next(iter(loader))['audio']




In [31]:
# List the entries contained in the audio batch.
audio_set.keys()

dict_keys(['mel_fusion', 'longer', 'waveform'])

In [34]:
# Inspect the dtype of the 'longer' entry of the batch.
audio_set['longer'].dtype

torch.bool

In [None]:
# Look at the first item.
# NOTE(review): `es_dataset` is the EpidemicSoundDataModule here; confirm it supports indexing
# (the commented-out EpidemicSoundDataset may have been the intended object).
es_dataset[0]

In [7]:
%%timeit
audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=False)

55.7 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Print the last 20 embedding dimensions of every row, then the overall shape.
print(audio_embed[:,-20:])
print(audio_embed.shape)

[[ 0.02798386  0.09505272 -0.03748165 -0.03948206  0.03360447 -0.03106373
   0.04163197 -0.00567254  0.0260455   0.02825583  0.04025021  0.02217655
  -0.01997084 -0.01692094 -0.0323186   0.01411903  0.01598474  0.05030755
  -0.01884449 -0.05638063]]
(1, 512)


In [7]:
# Tensor path: load the clip, quantize it, and embed it with use_tensor=True.
audio_data, _ = librosa.load('test_song.mp3', sr=48000)  # sample rate should be 48000
audio_data = audio_data.reshape(1, -1)  # shape (1, T); use (N, T) for a batch
# Quantize before sending the clip to the model: float -> int16 -> float32 -> tensor.
audio_data = float32_to_int16(audio_data)
audio_data = int16_to_float32(audio_data)
audio_data = torch.from_numpy(audio_data).float()
audio_embed = model.get_audio_embedding_from_data(x=audio_data, use_tensor=True)
# Show the last 20 embedding dimensions, then the overall shape.
print(audio_embed[:, -20:])
print(audio_embed.shape)

tensor([[-0.0131,  0.0785,  0.0767, -0.0259, -0.0184,  0.0028,  0.0451,  0.0133,
         -0.0327,  0.0620, -0.0639,  0.0697,  0.0027, -0.0418, -0.0539,  0.0003,
         -0.0098, -0.0034, -0.0337, -0.0032]], device='cuda:0',
       grad_fn=<SliceBackward0>)
torch.Size([1, 512])


In [None]:
%%timeit
# Time fetching the 'audio' entry of item 0.
# NOTE(review): `es_dataset` is the EpidemicSoundDataModule from the setup cell — confirm it supports indexing.
audio_data = es_dataset[0]['audio']

In [19]:
# audio_embed = model.model.get_audio_embedding([audio_data])
# Run the audio encoder on the batch and keep its "fine_grained_embedding" output.
audio_embeds = model.model.encode_audio(audio_set, device=torch.device('cuda'))["fine_grained_embedding"]

In [21]:
# Shape of the fine-grained embeddings.
audio_embeds.shape

torch.Size([32, 1024, 768])

In [30]:
# Average every 4 consecutive steps along dim 1. avg_pool1d pools over the last
# dimension, hence the permute before and after. No padding is used: dim 1
# (1024) is already a multiple of 4, whereas padding=1 would average a zero into
# the first window and leave the final step out of every window while producing
# the same output shape.
F.avg_pool1d(audio_embeds.permute(0, 2, 1), kernel_size=4).permute(0, 2, 1).shape

torch.Size([32, 256, 768])

In [22]:
# Apply the model's audio projection to the fine-grained embeddings.
projection = model.model.audio_projection(audio_embeds)

In [23]:
# Shape after the projection.
projection.shape

torch.Size([32, 1024, 512])

In [None]:
# Display a detached view of the embedding; the result is not assigned, so `audio_embed` itself is unchanged.
audio_embed.detach()