# Baseline implementation (no domain adaptation) on the DCASE 2020 TAU Urban Acoustic Scenes dataset

In [1]:
import torch
import torch.nn as nn
import numpy as np
import datetime
import os
import librosa
import pandas as pd
from hear21passt.base import get_basic_model
from torch.utils.data import Dataset, DataLoader
import gc


In [2]:
# Identifier string for this model.
MODEL_NAME = 'Simplified_Audio_Classifier'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The chosen backend name is echoed so it shows up in the cell output.
if torch.cuda.is_available():
    print("cuda")
    DEVICE = torch.device("cuda")
else:
    print("cpu")
    DEVICE = torch.device("cpu")


cuda


In [3]:
class PaSSTFeatureExtractor(torch.nn.Module):
    """Wrap the pretrained PaSST model so it can be used as a feature extractor.

    The wrapped model is created with ``get_basic_model(mode="embed_only")``,
    moved to ``device`` and put into eval mode. Callers may switch it back to
    train mode (``.train()``) if they want to fine-tune it.

    Args:
        device: Device (string or ``torch.device``) for the wrapped model.
            Defaults to ``"cuda"`` when available, else ``"cpu"``.
    """

    def __init__(self, device=None):
        super().__init__()
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = get_basic_model(mode="embed_only")
        self.model.to(self.device)
        self.model.eval()

    def forward(self, audio_waveform, sample_rate=32000):
        """Return the wrapped model's output for a batch of raw waveforms.

        Args:
            audio_waveform: Tensor of audio samples. A 1-D tensor is treated
                as a single clip and gets a leading batch dimension.
            sample_rate: Accepted for interface compatibility; it is not used
                by this method.

        Returns:
            Whatever ``self.model`` returns for the batch (presumably
            ``(batch, 768)`` embeddings, matching the classifier's default
            input size - TODO confirm against the hear21passt docs).
        """
        if audio_waveform.dim() == 1:
            audio_waveform = audio_waveform.unsqueeze(0)

        # ``self.device`` is only a snapshot taken in ``__init__``; it goes
        # stale as soon as the module is moved with ``.to(...)``/``.cuda()``.
        # Follow the device the weights actually live on instead, and fall
        # back to the snapshot only for a parameter-less model.
        first_param = next(self.model.parameters(), None)
        target_device = first_param.device if first_param is not None else self.device

        audio_waveform = audio_waveform.to(target_device)
        return self.model(audio_waveform)
  

In [4]:
class Classifier(nn.Module):
    """Small MLP head: Linear -> ReLU -> Dropout(0.2) -> Linear.

    Args:
        input_size: Width of the incoming feature vectors.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size=768, num_classes=10):
        super().__init__()
        hidden_units = 128
        stages = [
            nn.Linear(input_size, hidden_units),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(hidden_units, num_classes),
        ]
        self.layer = nn.Sequential(*stages)

    def forward(self, h):
        """Map features ``h`` to unnormalised class scores."""
        logits = self.layer(h)
        return logits


In [5]:
class AudioDataset(Dataset):
    """Dataset of ``.wav`` files in one directory, optionally labelled from a TSV.

    When ``csv_file`` exists, it is read as a tab-separated table. A filename
    column and a label column are located by keyword (falling back to the
    first two columns), every distinct label in the table gets an integer
    index in sorted order, and only ``.wav`` files that are both present in
    ``root_dir`` and listed in the table are kept. Without a usable
    ``csv_file`` every ``.wav`` file in ``root_dir`` is kept with label ``0``.

    Args:
        root_dir: Directory containing the audio files.
        csv_file: Optional path to the tab-separated metadata table.
        sample_rate: Rate the audio is resampled to on load.
        max_length: Clip length in seconds. Longer clips are truncated and
            shorter clips are zero-padded to this length. A falsy value
            disables both.

    Raises:
        ValueError: If the columns cannot be identified and the table has
            fewer than two columns.
    """

    _LABEL_KEYWORDS = ('label', 'scene', 'class')
    _FILENAME_KEYWORDS = ('filename', 'file', 'name')

    def __init__(self, root_dir, csv_file=None, sample_rate=32000, max_length=5):
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.max_length = max_length
        self.files = []
        self.labels = []
        self.label_to_idx = {}
        # Defined up front so the attribute exists even when no CSV is used.
        self.idx_to_label = {}

        if csv_file and os.path.exists(csv_file):
            self._load_from_metadata(csv_file)
        else:
            for fname in sorted(os.listdir(root_dir)):
                if fname.endswith('.wav'):
                    self.files.append(os.path.join(root_dir, fname))
                    self.labels.append(0)

        print(f"Dataset {root_dir}: {len(self.files)} audio files")
        if self.label_to_idx:
            print(f"Label mapping: {self.label_to_idx}")

    @staticmethod
    def _find_column(columns, keywords):
        """Return the first column whose lower-cased name contains a keyword, else None."""
        for col in columns:
            lowered = col.lower()
            if any(keyword in lowered for keyword in keywords):
                return col
        return None

    def _load_from_metadata(self, csv_file):
        """Fill ``files``/``labels`` and the label maps from the metadata table."""
        df = pd.read_csv(csv_file, delimiter='\t')
        print(f"CSV columns: {list(df.columns)}")

        label_col = self._find_column(df.columns, self._LABEL_KEYWORDS)
        filename_col = self._find_column(df.columns, self._FILENAME_KEYWORDS)

        if label_col is None or filename_col is None:
            if len(df.columns) >= 2:
                filename_col = df.columns[0]
                label_col = df.columns[1]
                print(f"Using columns: filename='{filename_col}', label='{label_col}'")
            else:
                raise ValueError("CSV file must have at least 2 columns")

        # The mapping is built from the whole table, not just the files found
        # here, so datasets sharing one table agree on the label indices.
        unique_labels = sorted(df[label_col].unique())
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}

        root_dir = self.root_dir
        existing_files = set(os.listdir(root_dir))
        filename_to_label = {}
        for raw_name, scene_label in zip(df[filename_col], df[label_col]):
            # Table entries may carry a directory prefix; compare base names.
            # str() keeps a non-string cell (e.g. NaN) from raising here.
            base_name = str(raw_name).split('/')[-1]
            if base_name in existing_files:
                filename_to_label[base_name] = scene_label

        print(f"Found {len(filename_to_label)} matching files in {root_dir}")

        matched_files = 0
        # Every key is known to exist in root_dir, so iterating the sorted
        # keys gives the same order as filtering a sorted directory listing.
        for fname in sorted(filename_to_label):
            if fname.endswith('.wav'):
                self.files.append(os.path.join(root_dir, fname))
                self.labels.append(self.label_to_idx[filename_to_label[fname]])
                matched_files += 1

        print(f"Successfully loaded {matched_files} files with labels")

    def __len__(self):
        """Return the number of audio files kept."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(waveform, label)``.

        The waveform is a 1-D float32 tensor loaded as mono at
        ``self.sample_rate``; the label is the integer class index.
        """
        audio_path = self.files[idx]
        label = self.labels[idx]

        waveform, _ = librosa.load(audio_path, sr=self.sample_rate, mono=True)

        if self.max_length:
            max_samples = int(self.max_length * self.sample_rate)
            if len(waveform) > max_samples:
                waveform = waveform[:max_samples]
            elif len(waveform) < max_samples:
                # Zero-pad short clips so every item has the same length;
                # DataLoader's default collation cannot stack ragged tensors.
                waveform = np.pad(
                    waveform, (0, max_samples - len(waveform)), mode='constant'
                )

        waveform = torch.tensor(waveform, dtype=torch.float32)
        return waveform, label

    def get_num_classes(self):
        """Return the number of known labels, or 1 when no label map was built."""
        return len(self.label_to_idx) if self.label_to_idx else 1


In [6]:
def clear_memory():
    """Run the garbage collector, then empty PyTorch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

In [7]:
# Number of clips per mini-batch.
batch_size = 16
# Upper bound on clip length, in seconds (passed to AudioDataset as max_length).
max_audio_length = 5

In [8]:
# A single metadata table serves both splits; each dataset keeps only the
# rows whose audio file is present in its own directory.
meta_csv = '/home/teaching/interng1/datasets/dcase/meta.csv'

train_dataset = AudioDataset(
    '/home/teaching/interng1/datasets/dcase/train',
    csv_file=meta_csv,
    max_length=max_audio_length,
)
test_dataset = AudioDataset(
    '/home/teaching/interng1/datasets/dcase/test',
    csv_file=meta_csv,
    max_length=max_audio_length,
)

# Use the larger class count reported by the two splits.
train_class_count = train_dataset.get_num_classes()
test_class_count = test_dataset.get_num_classes()
num_classes = max(train_class_count, test_class_count)

print(f"Number of classes: {num_classes}")
print(f"Train samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")


CSV columns: ['filename', 'scene_label', 'identifier', 'source_label']
Found 13962 matching files in /home/teaching/interng1/grl/datasets/dcase/train
Successfully loaded 13962 files with labels
Dataset /home/teaching/interng1/grl/datasets/dcase/train: 13962 audio files
Label mapping: {'airport': 0, 'bus': 1, 'metro': 2, 'metro_station': 3, 'park': 4, 'public_square': 5, 'shopping_mall': 6, 'street_pedestrian': 7, 'street_traffic': 8, 'tram': 9}
CSV columns: ['filename', 'scene_label', 'identifier', 'source_label']
Found 2968 matching files in /home/teaching/interng1/grl/datasets/dcase/test
Successfully loaded 2968 files with labels
Dataset /home/teaching/interng1/grl/datasets/dcase/test: 2968 audio files
Label mapping: {'airport': 0, 'bus': 1, 'metro': 2, 'metro_station': 3, 'park': 4, 'public_square': 5, 'shopping_mall': 6, 'street_pedestrian': 7, 'street_traffic': 8, 'tram': 9}
Number of classes: 10
Train samples: 13962
Test samples: 2968


In [9]:
# Options common to all three loaders.
loader_kwargs = dict(batch_size=batch_size, num_workers=1, pin_memory=False)

# Training batches: shuffled, and the last incomplete batch is dropped.
source_loader = DataLoader(train_dataset, shuffle=True, drop_last=True, **loader_kwargs)
# Ordered passes over the training and test sets for accuracy measurement.
eval_loader = DataLoader(train_dataset, shuffle=False, drop_last=False, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, drop_last=False, **loader_kwargs)








In [10]:
# F: PaSST-based feature extractor, C: classification head on top of it.
F = PaSSTFeatureExtractor().to(DEVICE)
C = Classifier(num_classes=num_classes).to(DEVICE)

# Report model sizes: only gradient-enabled weights for F, all weights for C.
feature_params_trainable = sum(
    param.numel() for param in F.parameters() if param.requires_grad
)
classifier_params = sum(param.numel() for param in C.parameters())
print(f"Feature extractor trainable parameters: {feature_params_trainable}")
print(f"Classifier parameters: {classifier_params}")




 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

In [11]:
# Training schedule and global step counter.
max_epoch, step = 50, 0

# History buffers: classification loss (logged every 100 steps) and
# test accuracy (logged every 500 steps).
ll_c, acc_lst = [], []

# Classification objective.
xe = nn.CrossEntropyLoss()

# Separate optimisers so the feature extractor is updated with a much
# smaller learning rate than the classifier head.
C_opt = torch.optim.Adam(C.parameters(), lr=1e-3)
F_opt = torch.optim.Adam(F.parameters(), lr=1e-5)

clear_memory()

In [12]:

def _accuracy(loader):
    """Return the fraction of samples in ``loader`` that ``C(F(x))`` classifies correctly.

    Runs without gradient tracking. The caller is expected to have put ``F``
    and ``C`` into eval mode beforehand.
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for waveforms, targets in loader:
            waveforms = waveforms.to(DEVICE)
            targets = targets.to(DEVICE)
            logits = C(F(waveforms))
            _, predicted = torch.max(logits, 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    return correct / total


for epoch in range(1, max_epoch + 1):
    print(f"\nEpoch {epoch}/{max_epoch}")
    epoch_start_time = datetime.datetime.now()

    F.train()
    C.train()

    for src_waveforms, labels in source_loader:
        src = src_waveforms.to(DEVICE)
        labels = labels.to(DEVICE)

        # Forward pass: waveform -> features -> class scores -> loss.
        src_features = F(src)
        class_outputs = C(src_features)
        Lc = xe(class_outputs, labels)

        # Clear stale gradients, back-propagate, then update both networks.
        F.zero_grad()
        C.zero_grad()

        Lc.backward()
        C_opt.step()
        F_opt.step()

        # Log (and record) the training loss every 100 steps.
        if step % 100 == 0:
            dt = datetime.datetime.now().strftime('%H:%M:%S')
            print(f'Epoch: {epoch}/{max_epoch}, Step: {step}, C Loss: {Lc.item():.4f} ---- {dt}')
            ll_c.append(Lc.item())

        # Every 500 steps measure accuracy on the full train and test sets.
        if step % 500 == 0:
            F.eval()
            C.eval()

            acc_src = _accuracy(eval_loader)
            print(f'***** Eval Result (Source): {acc_src:.4f}, Step: {step}')
            acc_tgt = _accuracy(test_loader)
            print(f'***** Test Result (Target): {acc_tgt:.4f}, Step: {step}')
            acc_lst.append(acc_tgt)

            # Back to training mode for the remaining batches.
            F.train()
            C.train()

        step += 1

        if step % 50 == 0:
            clear_memory()

    epoch_time = datetime.datetime.now() - epoch_start_time
    print(f"Epoch {epoch} completed in {epoch_time}")
    print("-" * 50)

    clear_memory()

print("\n" + "="*60)
print("FINAL EVALUATION")
print("="*60)

F.eval()
C.eval()
final_src_acc = _accuracy(eval_loader)
print(f'Source Accuracy: {final_src_acc:.4f}')
final_test_acc = _accuracy(test_loader)
print(f'Test Accuracy: {final_test_acc:.4f}')
clear_memory()


Epoch 1/50


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:875.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):


x torch.Size([16, 1, 128, 500])
self.norm(x) torch.Size([16, 768, 12, 49])
 patch_embed :  torch.Size([16, 768, 12, 49])
 self.time_new_pos_embed.shape torch.Size([1, 768, 1, 99])
 CUT time_new_pos_embed.shape torch.Size([1, 768, 1, 49])
 self.freq_new_pos_embed.shape torch.Size([1, 768, 12, 1])
X flattened torch.Size([16, 588, 768])
 self.new_pos_embed.shape torch.Size([1, 2, 768])
 self.cls_tokens.shape torch.Size([16, 1, 768])
 self.dist_token.shape torch.Size([16, 1, 768])
 final sequence x torch.Size([16, 590, 768])
 after 12 atten blocks x torch.Size([16, 590, 768])
forward_features torch.Size([16, 768])
head torch.Size([16, 527])
Epoch: 1/50, Step: 0, C Loss: 2.4955 ---- 16:38:27
***** Eval Result (Source): 0.1116, Step: 0
***** Test Result (Target): 0.1071, Step: 0
Epoch: 1/50, Step: 100, C Loss: 1.0185 ---- 16:42:55
Epoch: 1/50, Step: 200, C Loss: 1.1287 ---- 16:43:53
Epoch: 1/50, Step: 300, C Loss: 0.7912 ---- 16:44:50
Epoch: 1/50, Step: 400, C Loss: 0.6669 ---- 16:45:46
Epoc