# Implementation of the PaSST feature extractor on .wav files from Freesound

In [None]:
# hear21passt provides the `hear21passt.base` helpers imported later in this notebook.
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install hear21passt

In [19]:
%pip install librosa

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (f

In [56]:
import librosa
import numpy as np
import torch
from hear21passt.base import get_basic_model,get_model_passt

In [59]:
def load_real_audio(file_path, target_sr=32000, max_len_seconds=10):
    """Load an audio file as a fixed-length mono float32 tensor.

    The file is decoded with ``librosa.load`` (resampled to ``target_sr`` and
    down-mixed to mono), then zero-padded at the end or truncated so that the
    result always holds the same number of samples.

    Args:
        file_path: Path to the audio file to load.
        target_sr: Sample rate passed to ``librosa.load``. ``None`` keeps the
            file's native rate, which is then used to size the clip.
        max_len_seconds: Clip duration in seconds; may be fractional.

    Returns:
        A ``torch.float32`` tensor holding
        ``round(sample_rate * max_len_seconds)`` samples.

    Raises:
        ValueError: If ``max_len_seconds`` is negative.
    """
    if max_len_seconds < 0:
        raise ValueError(
            f"max_len_seconds must be non-negative, got {max_len_seconds}"
        )

    waveform, sr = librosa.load(file_path, sr=target_sr, mono=True)

    # Size the clip from the rate librosa actually returned (covers
    # target_sr=None) and force an integer sample count: a fractional duration
    # would otherwise yield a float that np.pad and slicing both reject.
    max_len_samples = int(round(sr * max_len_seconds))
    if len(waveform) < max_len_samples:
        pad_len = max_len_samples - len(waveform)
        waveform = np.pad(waveform, (0, pad_len))  # zero-pad the tail
    else:
        waveform = waveform[:max_len_samples]

    return torch.tensor(waveform, dtype=torch.float32)

In [60]:
# Build the PaSST wrapper in "embed_only" mode and put it in eval mode for inference.
model = get_basic_model(mode="embed_only")
model.eval()
# Prefer the GPU, but fall back to the CPU so the notebook also runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

In [61]:
# Embed one clip: load it, add a batch dimension, run the model.
waveform = load_real_audio("datasets/audio_files/33711__acclivity__excessiveexposure.wav")
audio = waveform.unsqueeze(0)  # add a leading batch dimension
# Follow whichever device the model lives on instead of assuming CUDA.
audio_wave = audio.to(next(model.parameters()).device)
with torch.no_grad():  # inference only; a single no_grad scope is enough
    embedding = model(audio_wave)
print("Embedding Shape:",embedding.shape)

Embedding Shape: torch.Size([1, 768])


In [62]:
# Embed one clip: load it, add a batch dimension, run the model.
waveform = load_real_audio("datasets/audio_files/36105__erh__roswell.wav")
audio = waveform.unsqueeze(0)  # add a leading batch dimension
# Follow whichever device the model lives on instead of assuming CUDA.
audio_wave = audio.to(next(model.parameters()).device)
with torch.no_grad():  # inference only; a single no_grad scope is enough
    embedding = model(audio_wave)
print("Embedding Shape:",embedding.shape)

Embedding Shape: torch.Size([1, 768])


In [63]:
# Embed one clip: load it, add a batch dimension, run the model.
waveform = load_real_audio("datasets/audio_files/222993__zyrytsounds__people-talking.wav")
audio = waveform.unsqueeze(0)  # add a leading batch dimension
# Follow whichever device the model lives on instead of assuming CUDA.
audio_wave = audio.to(next(model.parameters()).device)
with torch.no_grad():  # inference only; a single no_grad scope is enough
    embedding = model(audio_wave)
print("Embedding Shape:",embedding.shape)

Embedding Shape: torch.Size([1, 768])


In [64]:
# Embed several clips in a single forward pass. load_real_audio pads/truncates
# every clip to the same number of samples, so the tensors can be stacked.
filepaths = [
    "datasets/audio_files/33711__acclivity__excessiveexposure.wav",
    "datasets/audio_files/222993__zyrytsounds__people-talking.wav",
    "datasets/audio_files/36105__erh__roswell.wav",
]

batch_waveforms = [load_real_audio(fp) for fp in filepaths]
# Send the batch to whichever device the model lives on instead of assuming CUDA.
batch_audio = torch.stack(batch_waveforms).to(next(model.parameters()).device)

with torch.no_grad():  # inference only
    embeddings = model(batch_audio)

print("Batch Embedding Shape:", embeddings.shape)

Batch Embedding Shape: torch.Size([3, 768])
