# Source-only baseline for unsupervised domain adaptation (UDA) on DCASE TAU Urban 2020

In [1]:
import torch
import torch.nn as nn
import numpy as np
import datetime
import os
import librosa
import pandas as pd
from hear21passt.base import get_basic_model
from torch.utils.data import Dataset, DataLoader
import gc


In [2]:
# Run-wide configuration: checkpoint tag, RNG seed and compute device.
MODEL_NAME = 'Audio_Classifier'

# Seed the random number generators of the libraries this notebook already
# imports, so that classifier initialisation, dropout masks and shuffling start
# from the same state after every "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE.type)


cuda


In [3]:
class PaSSTFeatureExtractor(torch.nn.Module):
    """Thin wrapper around the model returned by ``get_basic_model(mode="embed_only")``.

    The wrapped model is moved to ``device`` and put in eval mode when the
    wrapper is built. Gradients are never disabled here, so the backbone can be
    fine-tuned by whoever owns the optimizer.
    """

    def __init__(self, device=None):
        """Build the backbone.

        Args:
            device: Target device; when falsy, "cuda" is used if available,
                otherwise "cpu".
        """
        super().__init__()
        if not device:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        backbone = get_basic_model(mode="embed_only")
        backbone.to(self.device)
        backbone.eval()
        self.model = backbone

    def forward(self, audio_waveform, sample_rate=32000):
        """Run the backbone on a waveform tensor and return its output.

        Args:
            audio_waveform: Waveform tensor; a 1-D tensor is treated as a single
                clip and gets a leading batch axis.
            sample_rate: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``self.model`` returns for the (batched) input.
        """
        is_single_clip = audio_waveform.dim() == 1
        batch = audio_waveform.unsqueeze(0) if is_single_clip else audio_waveform

        # Deliberately no torch.no_grad(): gradients must reach the backbone
        # so that it can be fine-tuned.
        return self.model(batch.to(self.device))
  

In [4]:
class Classifier(nn.Module):
    """Small MLP head: Linear -> ReLU -> Dropout(0.2) -> Linear producing class logits."""

    def __init__(self, input_size=768, num_classes=10):
        """Create the head.

        Args:
            input_size: Size of the last dimension of the incoming features.
            num_classes: Number of output logits.
        """
        super().__init__()
        hidden_size = 128
        # The position of each module inside the Sequential becomes part of the
        # state_dict keys ("layer.0.*", "layer.3.*"), so the order must not change.
        stages = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, num_classes),
        ]
        self.layer = nn.Sequential(*stages)

    def forward(self, h):
        """Return the logits for feature tensor ``h``."""
        logits = self.layer(h)
        return logits


In [5]:
class AudioDataset(Dataset):
    """Dataset over the ``.wav`` files of one directory, optionally labelled from a CSV.

    When ``csv_file`` exists it is read as a tab-separated table. The label and
    file-name columns are guessed from the column names (falling back to the
    first two columns), the label vocabulary is built from *all* rows of the
    table, and only ``.wav`` files that are both present in ``root_dir`` and
    listed in the table are kept. Without a usable CSV every ``.wav`` file in
    ``root_dir`` is kept with the placeholder label ``0``.

    Attributes:
        files: Paths of the selected audio files, sorted by file name.
        labels: Integer label per entry of ``files``.
        label_to_idx: Label value -> integer index (empty when unlabelled).
        idx_to_label: Inverse of ``label_to_idx`` (empty when unlabelled).
    """

    # Substrings used to guess which CSV column holds what.
    _LABEL_HINTS = ('label', 'scene', 'class')
    _FILENAME_HINTS = ('filename', 'file', 'name')

    def __init__(self, root_dir, csv_file=None, sample_rate=32000, max_length=10):
        """Index the audio files of ``root_dir``.

        Args:
            root_dir: Directory that contains the ``.wav`` files.
            csv_file: Optional path of a tab-separated metadata file.
            sample_rate: Sampling rate passed to ``librosa.load``.
            max_length: Maximum clip length in seconds; longer clips are
                truncated. A falsy value disables truncation.

        Raises:
            ValueError: If the columns cannot be guessed and the CSV has fewer
                than two columns.
        """
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.max_length = max_length
        self.files = []
        self.labels = []
        self.label_to_idx = {}
        # Defined on every code path so that unlabelled datasets expose the
        # same attributes as labelled ones.
        self.idx_to_label = {}

        # The directory is listed once and the result shared by all steps.
        dir_entries = sorted(os.listdir(root_dir))

        if csv_file and os.path.exists(csv_file):
            self._load_labelled(csv_file, dir_entries)
        else:
            if csv_file:
                # A path was given but is missing: say so instead of silently
                # handing out placeholder labels.
                print(f"Warning: CSV file '{csv_file}' not found; using placeholder label 0 for every file")
            self._load_unlabelled(dir_entries)

        print(f"Dataset {root_dir}: {len(self.files)} audio files")
        if self.label_to_idx:
            print(f"Label mapping: {self.label_to_idx}")

    @staticmethod
    def _pick_column(columns, hints):
        """Return the first column whose lower-cased name contains any hint, else None."""
        for col in columns:
            lowered = col.lower()
            if any(hint in lowered for hint in hints):
                return col
        return None

    def _load_labelled(self, csv_file, dir_entries):
        """Fill ``files``/``labels`` and the label maps from the metadata table."""
        df = pd.read_csv(csv_file, delimiter='\t')
        print(f"CSV columns: {list(df.columns)}")

        label_col = self._pick_column(df.columns, self._LABEL_HINTS)
        filename_col = self._pick_column(df.columns, self._FILENAME_HINTS)
        if label_col is None or filename_col is None:
            if len(df.columns) < 2:
                raise ValueError("CSV file must have at least 2 columns")
            filename_col, label_col = df.columns[0], df.columns[1]
            print(f"Using columns: filename='{filename_col}', label='{label_col}'")

        # The vocabulary comes from the whole table, so datasets built from the
        # same CSV share one label -> index mapping.
        unique_labels = sorted(df[label_col].unique())
        self.label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        self.idx_to_label = {idx: label for label, idx in self.label_to_idx.items()}

        # Map the base name of each listed file to its label, keeping only
        # names that exist in the directory.
        existing_files = set(dir_entries)
        filename_to_label = {}
        for raw_name, scene_label in zip(df[filename_col], df[label_col]):
            # str() keeps a missing (NaN) cell from raising; it simply won't match.
            base_name = os.path.basename(str(raw_name))
            if base_name in existing_files:
                filename_to_label[base_name] = scene_label

        print(f"Found {len(filename_to_label)} matching files in {self.root_dir}")

        for fname in dir_entries:
            if fname.endswith('.wav') and fname in filename_to_label:
                self.files.append(os.path.join(self.root_dir, fname))
                self.labels.append(self.label_to_idx[filename_to_label[fname]])

        print(f"Successfully loaded {len(self.files)} files with labels")

    def _load_unlabelled(self, dir_entries):
        """Keep every ``.wav`` file and give it the placeholder label 0."""
        for fname in dir_entries:
            if fname.endswith('.wav'):
                self.files.append(os.path.join(self.root_dir, fname))
                self.labels.append(0)

    def __len__(self):
        """Number of indexed audio files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            ``(waveform, label)`` where ``waveform`` is a float32 tensor built
            from the signal returned by ``librosa.load(..., mono=True)`` and
            ``label`` is the integer stored for this file.

        Note:
            Clips are only truncated, never padded, so clips shorter than
            ``max_length`` keep their own length.
        """
        audio_path = self.files[idx]
        label = self.labels[idx]

        waveform, sr = librosa.load(audio_path, sr=self.sample_rate, mono=True)

        if self.max_length:
            max_samples = int(self.max_length * self.sample_rate)
            if len(waveform) > max_samples:
                waveform = waveform[:max_samples]

        waveform = torch.tensor(waveform, dtype=torch.float32)
        return waveform, label

    def get_num_classes(self):
        """Size of the label vocabulary, or 1 for an unlabelled dataset."""
        return len(self.label_to_idx) if self.label_to_idx else 1


In [6]:
def clear_memory():
    """Run the Python garbage collector and, when CUDA is available, empty the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

In [7]:
# Clips per batch, and the per-clip duration cap in seconds.
batch_size, max_audio_length = 8, 10

In [8]:
# One metadata table describes every split; each split lives in its own folder.
meta_csv = './dcase/meta.csv'


def _load_split(split_dir):
    """Build an AudioDataset for one split folder using the shared metadata and length cap."""
    return AudioDataset(
        root_dir=split_dir,
        csv_file=meta_csv,
        max_length=max_audio_length,
    )


source_train = _load_split('./dcase/train/source')
target_train = _load_split('./dcase/train/target')
source_test = _load_split('./dcase/test/source')
target_test = _load_split('./dcase/test/target')

# The classifier is sized for the largest label vocabulary seen in any split.
_all_splits = (source_train, target_train, source_test, target_test)
num_classes = max(split.get_num_classes() for split in _all_splits)

print(f"Number of classes: {num_classes}")
print(f"Source train samples: {len(source_train)}")
print(f"Target train samples: {len(target_train)}")
print(f"Source test samples: {len(source_test)}")
print(f"Target test samples: {len(target_test)}")


CSV columns: ['filename', 'scene_label', 'identifier', 'source_label']
Found 10215 matching files in ./dcase/train/source
Successfully loaded 10215 files with labels
Dataset ./dcase/train/source: 10215 audio files
Label mapping: {'airport': 0, 'bus': 1, 'metro': 2, 'metro_station': 3, 'park': 4, 'public_square': 5, 'shopping_mall': 6, 'street_pedestrian': 7, 'street_traffic': 8, 'tram': 9}
CSV columns: ['filename', 'scene_label', 'identifier', 'source_label']
Found 3747 matching files in ./dcase/train/target
Successfully loaded 3747 files with labels
Dataset ./dcase/train/target: 3747 audio files
Label mapping: {'airport': 0, 'bus': 1, 'metro': 2, 'metro_station': 3, 'park': 4, 'public_square': 5, 'shopping_mall': 6, 'street_pedestrian': 7, 'street_traffic': 8, 'tram': 9}
CSV columns: ['filename', 'scene_label', 'identifier', 'source_label']
Found 330 matching files in ./dcase/test/source
Successfully loaded 330 files with labels
Dataset ./dcase/test/source: 330 audio files
Label mappi

In [9]:
# Options common to every loader in this notebook.
_loader_kwargs = dict(batch_size=batch_size, num_workers=1, pin_memory=False)

# Training loader: shuffled, and the last incomplete batch is dropped.
source_loader = DataLoader(source_train, shuffle=True, drop_last=True, **_loader_kwargs)

# Evaluation loaders: fixed order, every sample kept.
# eval_loader iterates the training split (source_train) without shuffling.
eval_loader = DataLoader(source_train, shuffle=False, drop_last=False, **_loader_kwargs)
test_source_loader = DataLoader(source_test, shuffle=False, drop_last=False, **_loader_kwargs)
test_target_loader = DataLoader(target_test, shuffle=False, drop_last=False, **_loader_kwargs)







In [10]:
# F: PaSST backbone producing features; C: classification head on top of them.
F = PaSSTFeatureExtractor()
F = F.to(DEVICE)
C = Classifier(num_classes=num_classes)
C = C.to(DEVICE)

_trainable_in_F = sum(p.numel() for p in F.parameters() if p.requires_grad)
_total_in_C = sum(p.numel() for p in C.parameters())
print(f"Feature extractor trainable parameters: {_trainable_in_F}")
print(f"Classifier parameters: {_total_in_C}")




 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

In [11]:
# Training bookkeeping.
max_epoch = 50
step = 0
ll_c = []      # classification loss samples recorded during training
acc_lst = []   # target-test accuracies recorded at evaluation points

# Classification loss on the source labels.
xe = nn.CrossEntropyLoss()

# Separate optimizers: a small learning rate for the backbone, a larger one
# for the classifier head.
F_opt = torch.optim.Adam(F.parameters(), lr=1e-5)
C_opt = torch.optim.Adam(C.parameters(), lr=1e-3)

print(f"Starting training for {max_epoch} epochs...")
print("="*60)

clear_memory()

Starting training for 50 epochs...


In [12]:

# Cadences (in optimisation steps) used by the training loop below.
LOG_EVERY = 100     # print and record the classification loss
EVAL_EVERY = 5000   # run the three accuracy evaluations
CLEAR_EVERY = 50    # call clear_memory()


def evaluate_accuracy(loader):
    """Return the top-1 accuracy of ``C(F(x))`` over every batch of ``loader``.

    Both networks are switched to eval mode and run under ``torch.no_grad()``;
    their previous train/eval mode is restored before returning.

    Args:
        loader: Iterable yielding ``(waveforms, labels)`` batches.

    Returns:
        Fraction of correctly classified samples, or ``nan`` when the loader
        yields no samples (instead of raising ZeroDivisionError).
    """
    previous_modes = (F.training, C.training)
    F.eval()
    C.eval()

    correct, total = 0, 0
    with torch.no_grad():
        for waveforms, targets in loader:
            waveforms = waveforms.to(DEVICE)
            targets = targets.to(DEVICE)
            _, predicted = torch.max(C(F(waveforms)), 1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

    F.train(previous_modes[0])
    C.train(previous_modes[1])
    return correct / total if total else float("nan")


for epoch in range(1, max_epoch + 1):
    print(f"\nEpoch {epoch}/{max_epoch}")
    epoch_start_time = datetime.datetime.now()

    F.train()
    C.train()

    for src_waveforms, labels in source_loader:
        src_waveforms = src_waveforms.to(DEVICE)
        labels = labels.to(DEVICE)

        # Clear the gradients of both networks before the new backward pass.
        F_opt.zero_grad()
        C_opt.zero_grad()

        class_outputs = C(F(src_waveforms))
        Lc = xe(class_outputs, labels)

        Lc.backward()
        C_opt.step()
        F_opt.step()

        if step % LOG_EVERY == 0:
            dt = datetime.datetime.now().strftime('%H:%M:%S')
            print(f'Epoch: {epoch}/{max_epoch}, Step: {step}, C Loss: {Lc.item():.4f} ---- {dt}')
            ll_c.append(Lc.item())

        if step % EVAL_EVERY == 0:
            # eval_loader iterates source_train, so this first figure is the
            # accuracy on the training data itself.
            acc_src = evaluate_accuracy(eval_loader)
            print(f'***** Eval Result (Source): {acc_src:.4f}, Step: {step}')

            acc_src_test = evaluate_accuracy(test_source_loader)
            print(f'***** Test Result (Source Test): {acc_src_test:.4f}, Step: {step}')

            acc_tgt = evaluate_accuracy(test_target_loader)
            print(f'***** Test Result (Target Test): {acc_tgt:.4f}, Step: {step}')
            acc_lst.append(acc_tgt)  # only the target accuracy is tracked over time

        step += 1

        if step % CLEAR_EVERY == 0:
            clear_memory()

    epoch_time = datetime.datetime.now() - epoch_start_time
    print(f"Epoch {epoch} completed in {epoch_time}")
    print("-" * 50)

    clear_memory()

print("\n" + "="*60)
print("FINAL EVALUATION")
print("="*60)

F.eval()
C.eval()

final_src_eval_acc = evaluate_accuracy(eval_loader)
final_src_test_acc = evaluate_accuracy(test_source_loader)
final_tgt_test_acc = evaluate_accuracy(test_target_loader)

# Print neatly formatted results
print("✔ Final Evaluation Summary")
print(f"    • Source Eval Accuracy      : {final_src_eval_acc:.4f}")
print(f"    • Source Test Accuracy      : {final_src_test_acc:.4f}")
print(f"    • Target Test Accuracy      : {final_tgt_test_acc:.4f}")
print("="*60)

# Additional training statistics
print("✔ Training Statistics")
print(f"    • Total steps               : {step}")
print(f"    • Classification loss logs  : {len(ll_c)}")
print(f"    • Target acc logs           : {len(acc_lst)}")
if acc_lst:
    print(f"    • Best Target Accuracy      : {max(acc_lst):.4f}")
    print(f"    • Final Target Accuracy     : {acc_lst[-1]:.4f}")


print(" Training completed!")


clear_memory()


Epoch 1/50


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:875.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):


x torch.Size([8, 1, 128, 1000])
self.norm(x) torch.Size([8, 768, 12, 99])
 patch_embed :  torch.Size([8, 768, 12, 99])
 self.time_new_pos_embed.shape torch.Size([1, 768, 1, 99])
 self.freq_new_pos_embed.shape torch.Size([1, 768, 12, 1])
X flattened torch.Size([8, 1188, 768])
 self.new_pos_embed.shape torch.Size([1, 2, 768])
 self.cls_tokens.shape torch.Size([8, 1, 768])
 self.dist_token.shape torch.Size([8, 1, 768])
 final sequence x torch.Size([8, 1190, 768])
 after 12 atten blocks x torch.Size([8, 1190, 768])
forward_features torch.Size([8, 768])
head torch.Size([8, 527])
Epoch: 1/50, Step: 0, C Loss: 2.3472 ---- 23:42:57
***** Eval Result (Source): 0.1477, Step: 0
***** Test Result (Source Test): 0.1182, Step: 0
***** Test Result (Target Test): 0.1391, Step: 0
Epoch: 1/50, Step: 100, C Loss: 1.2903 ---- 23:50:58
Epoch: 1/50, Step: 200, C Loss: 0.7332 ---- 23:52:09
Epoch: 1/50, Step: 300, C Loss: 0.9930 ---- 23:53:21
Epoch: 1/50, Step: 400, C Loss: 0.6562 ---- 23:54:33
Epoch: 1/50, S

In [None]:
# Where the final checkpoint goes; the folder is created on demand.
save_dir = './saved_models/source_only'
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, f"{MODEL_NAME}_final.pth")

# Bundle both networks, both optimizers, the training position and the label
# mapping into a single checkpoint file.
checkpoint = dict(
    feature_extractor_state_dict=F.state_dict(),
    classifier_state_dict=C.state_dict(),
    optimizer_F_state_dict=F_opt.state_dict(),
    optimizer_C_state_dict=C_opt.state_dict(),
    epoch=epoch,
    step=step,
    label_to_idx=source_train.label_to_idx,
)
torch.save(checkpoint, model_path)

print(f"✔ Model saved to: {model_path}")
