In [1]:
import os
import librosa
import numpy as np
import torch
from hear21passt.base import get_basic_model
from tqdm import tqdm


In [2]:
def load_real_audio(file_path, target_sr=32000, max_len_seconds=10):
    """Load an audio file as a fixed-length mono float32 tensor.

    The file is decoded with ``librosa.load`` (resampled to ``target_sr`` and
    down-mixed to mono), then zero-padded at the end or truncated so that the
    result always has the same number of samples.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sampling rate, in Hz, requested from ``librosa.load``.
        max_len_seconds: Length of the returned clip in seconds. May be
            fractional (e.g. ``2.5``); the sample count is rounded to the
            nearest integer.

    Returns:
        A 1-D ``torch.float32`` tensor with
        ``round(target_sr * max_len_seconds)`` samples.
    """
    waveform, _ = librosa.load(file_path, sr=target_sr, mono=True)

    # Slice bounds and pad widths must be integers; without the conversion a
    # fractional duration would raise TypeError below.
    max_len_samples = int(round(target_sr * max_len_seconds))

    missing = max_len_samples - len(waveform)
    if missing > 0:
        # Too short: append trailing zeros (silence).
        waveform = np.pad(waveform, (0, missing))
    else:
        # Long enough: keep only the leading samples.
        waveform = waveform[:max_len_samples]
    return torch.tensor(waveform, dtype=torch.float32)


In [3]:
# Build the PaSST model in "embed_only" mode and put it in evaluation mode.
model = get_basic_model(mode="embed_only")
model.eval()

# Use the GPU when PyTorch can see one; otherwise stay on the CPU instead of
# crashing inside .cuda().
# NOTE: this does not help when a GPU exists but is already full (the recorded
# output of this cell shows such a CUDA out-of-memory error caused by other
# processes); free GPU memory or choose another device in that case.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)




 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 36.88 MiB is free. Process 1205165 has 20.40 GiB memory in use. Process 1232337 has 1.08 GiB memory in use. Including non-PyTorch memory, this process has 464.00 MiB memory in use. Of the allocated memory 235.10 MiB is allocated by PyTorch, and 26.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
# Root of the DCASE training split; the source and target domains are stored
# in two sub-folders of this directory.
dcase_path = "../grl_dcase/dcase/train"
source_path, target_path = (
    os.path.join(dcase_path, domain) for domain in ("source", "target")
)

def collect_audio_files(path):
    """Return the paths of the ``.wav`` files located directly in ``path``.

    Sub-directories are not searched. The result is sorted because
    ``os.listdir`` returns entries in arbitrary order, which would otherwise
    make the order of the files (and of anything computed from them) differ
    between runs or machines.

    Args:
        path: Directory to scan.

    Returns:
        A sorted list of ``os.path.join(path, name)`` for every entry whose
        name ends with ``".wav"``.
    """
    return sorted(
        os.path.join(path, fname)
        for fname in os.listdir(path)
        if fname.endswith(".wav")
    )

# List the .wav files of each domain (only paths are collected here; the audio
# itself is read later).
source_files = collect_audio_files(source_path)
target_files = collect_audio_files(target_path)

# Fail fast: without this check the success message below would be printed
# even when a directory contains no .wav files at all.
if not source_files:
    raise FileNotFoundError(f"No .wav files found in {source_path!r}")
if not target_files:
    raise FileNotFoundError(f"No .wav files found in {target_path!r}")

print("Data loaded Successfully!")


Data loaded Successfully!


In [5]:
def extract_embeddings(file_list, label):
    """Compute one embedding per audio file with the module-level ``model``.

    Each file is loaded with ``load_real_audio``, given a leading batch
    dimension of size 1, and passed through ``model`` with gradient tracking
    disabled.

    Args:
        file_list: Iterable of audio file paths.
        label: Short name shown in the progress bar description.

    Returns:
        A list of ``(file_path, embedding)`` tuples in input order, where
        ``embedding`` is the squeezed model output as a NumPy array.
    """
    # Run on whatever device the model already lives on instead of assuming
    # CUDA, so inputs and weights always match.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    embeddings = []
    # A single no_grad context for the whole loop; nothing here needs autograd.
    with torch.no_grad():
        for file_path in tqdm(file_list, desc=f"Processing {label}"):
            audio = load_real_audio(file_path).unsqueeze(0).to(device)
            embedding = model(audio)
            embeddings.append((file_path, embedding.squeeze().cpu().numpy()))
    return embeddings

# Embed every file of each domain. Each result is a list of
# (file_path, embedding) tuples in the same order as the input file list.
# NOTE: in the recorded run these two calls took roughly 9 and 3.5 minutes.
source_embeddings = extract_embeddings(source_files, "Source")
target_embeddings = extract_embeddings(target_files, "Target")


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:875.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):
Processing Source:   0%|                    | 4/14400 [00:01<1:16:57,  3.12it/s]

x torch.Size([1, 1, 128, 1000])
self.norm(x) torch.Size([1, 768, 12, 99])
 patch_embed :  torch.Size([1, 768, 12, 99])
 self.time_new_pos_embed.shape torch.Size([1, 768, 1, 99])
 self.freq_new_pos_embed.shape torch.Size([1, 768, 12, 1])
X flattened torch.Size([1, 1188, 768])
 self.new_pos_embed.shape torch.Size([1, 2, 768])
 self.cls_tokens.shape torch.Size([1, 1, 768])
 self.dist_token.shape torch.Size([1, 1, 768])
 final sequence x torch.Size([1, 1190, 768])
 after 12 atten blocks x torch.Size([1, 1190, 768])
forward_features torch.Size([1, 768])
head torch.Size([1, 527])


Processing Source: 100%|██████████████████| 14400/14400 [09:14<00:00, 25.98it/s]
Processing Target: 100%|████████████████████| 5395/5395 [03:29<00:00, 25.73it/s]


In [7]:

# Sanity check: report the shape of the first source embedding.
first_source_embedding = source_embeddings[0][1]
print("Embedding shape:", first_source_embedding.shape)



Embedding shape: (768,)
