# Extracting PaSST audio embeddings from Freesound .wav files

In [1]:
import librosa
import numpy as np
import torch
from hear21passt.base import get_basic_model,get_model_passt

In [2]:
def load_real_audio(file_path, target_sr=32000, max_len_seconds=10):
    """Load an audio file as a fixed-length mono waveform tensor.

    The file is decoded with ``librosa.load`` (resampled to ``target_sr`` and
    requested as mono), then zero-padded at the end or truncated so that every
    call with the same ``target_sr``/``max_len_seconds`` yields the same length.

    Args:
        file_path: Path of the audio file to read.
        target_sr: Sample rate, in Hz, passed to ``librosa.load``.
        max_len_seconds: Clip duration in seconds. Fractional values are
            accepted; the sample count is rounded to the nearest integer.

    Returns:
        A 1-D ``torch.float32`` tensor holding
        ``round(target_sr * max_len_seconds)`` samples.

    Raises:
        ValueError: If the requested clip length is negative.
    """
    # Force an integer sample count: a float here would break both np.pad
    # and slicing when max_len_seconds is fractional (e.g. 2.5).
    max_len_samples = int(round(target_sr * max_len_seconds))
    if max_len_samples < 0:
        # A negative length would otherwise silently chop the clip's tail.
        raise ValueError(
            f"max_len_seconds must be non-negative, got {max_len_seconds!r}"
        )

    waveform, _ = librosa.load(file_path, sr=target_sr, mono=True)

    missing = max_len_samples - len(waveform)
    if missing > 0:
        # Short clip: append trailing zeros (silence) up to the target length.
        waveform = np.pad(waveform, (0, missing))
    else:
        # Long (or exact-length) clip: keep only the leading samples.
        waveform = waveform[:max_len_samples]

    return torch.tensor(waveform, dtype=torch.float32)

In [3]:
# Build the PaSST model in "embed_only" mode and put it in inference mode
# (eval() turns off the Dropout layers the model contains).
model = get_basic_model(mode="embed_only")
model.eval()

# Prefer the GPU, but fall back to the CPU so this cell does not crash on
# machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)



 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

In [4]:
# Embed a single clip: load -> add a batch dimension -> move to the model's
# device -> forward pass.
waveform = load_real_audio("./audio_files/33711__acclivity__excessiveexposure.wav")
audio = waveform.unsqueeze(0)  # (samples,) -> (1, samples)

# Follow whatever device the model lives on instead of hard-coding CUDA.
audio_wave = audio.to(next(model.parameters()).device)

# One no_grad scope is enough; nesting a second one adds nothing.
with torch.no_grad():
    embedding = model(audio_wave)
print("Embedding Shape:", embedding.shape)

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:875.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):


x torch.Size([1, 1, 128, 1000])
self.norm(x) torch.Size([1, 768, 12, 99])
 patch_embed :  torch.Size([1, 768, 12, 99])
 self.time_new_pos_embed.shape torch.Size([1, 768, 1, 99])
 self.freq_new_pos_embed.shape torch.Size([1, 768, 12, 1])
X flattened torch.Size([1, 1188, 768])
 self.new_pos_embed.shape torch.Size([1, 2, 768])
 self.cls_tokens.shape torch.Size([1, 1, 768])
 self.dist_token.shape torch.Size([1, 1, 768])
 final sequence x torch.Size([1, 1190, 768])
 after 12 atten blocks x torch.Size([1, 1190, 768])
forward_features torch.Size([1, 768])
head torch.Size([1, 527])
Embedding Shape: torch.Size([1, 768])


In [5]:
# Embed a single clip: load -> add a batch dimension -> move to the model's
# device -> forward pass.
waveform = load_real_audio("./audio_files/36105__erh__roswell.wav")
audio = waveform.unsqueeze(0)  # (samples,) -> (1, samples)

# Follow whatever device the model lives on instead of hard-coding CUDA.
audio_wave = audio.to(next(model.parameters()).device)

# One no_grad scope is enough; nesting a second one adds nothing.
with torch.no_grad():
    embedding = model(audio_wave)
print("Embedding Shape:", embedding.shape)

Embedding Shape: torch.Size([1, 768])


In [6]:
# Embed a single clip: load -> add a batch dimension -> move to the model's
# device -> forward pass.
waveform = load_real_audio("./audio_files/222993__zyrytsounds__people-talking.wav")
audio = waveform.unsqueeze(0)  # (samples,) -> (1, samples)

# Follow whatever device the model lives on instead of hard-coding CUDA.
audio_wave = audio.to(next(model.parameters()).device)

# One no_grad scope is enough; nesting a second one adds nothing.
with torch.no_grad():
    embedding = model(audio_wave)
print("Embedding Shape:", embedding.shape)

Embedding Shape: torch.Size([1, 768])


In [7]:
# Batch inference: embed several clips in a single forward pass.
filepaths = [
    "./audio_files/33711__acclivity__excessiveexposure.wav",
    "./audio_files/222993__zyrytsounds__people-talking.wav",
    "./audio_files/36105__erh__roswell.wav",
]

# load_real_audio pads/truncates every clip to the same length, which is
# what lets torch.stack combine them into one (batch, samples) tensor.
batch_waveforms = [load_real_audio(fp) for fp in filepaths]

# Follow whatever device the model lives on instead of hard-coding CUDA.
batch_audio = torch.stack(batch_waveforms).to(next(model.parameters()).device)

with torch.no_grad():
    embeddings = model(batch_audio)

# Sanity check: exactly one embedding row per input file.
assert embeddings.shape[0] == len(filepaths), "expected one embedding per input file"

print("Batch Embedding Shape:", embeddings.shape)

Batch Embedding Shape: torch.Size([3, 768])
