### Import bibliotek

In [59]:
import os
import shutil
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
import kagglehub
from sklearn.model_selection import train_test_split
import random
from pathlib import Path
import torchmetrics
import torch.optim as optim
import openl3
import tensorflow as tf
import numpy as np
from IPython.display import Audio

### Pobranie zbioru danych

In [60]:
# Local directory the dataset is copied into.
target_dir = "dataset"

# Download the dataset through kagglehub; the returned location is printed
# below as the KaggleHub cache path.
path = kagglehub.dataset_download("junewookim/mad-dataset-military-audio-dataset")
print("Cache KaggleHub:", path)

# Copy the downloaded tree into target_dir. exist_ok / dirs_exist_ok let the
# cell be re-run when the directory already exists.
os.makedirs(target_dir, exist_ok=True)
shutil.copytree(path, target_dir, dirs_exist_ok=True)

print("Zapisano do:", target_dir)

# Folder later scanned for *.wav noise recordings by the data module.
# It is only created here; this cell does not put any files into it.
noise_folder = "dataset/noises"
os.makedirs(noise_folder, exist_ok=True)

Cache KaggleHub: /home/remek2go/.cache/kagglehub/datasets/junewookim/mad-dataset-military-audio-dataset/versions/1
Zapisano do: dataset
Zapisano do: dataset


### Funkcja dodająca szum
Funkcja pomocnicza do augmentacji danych poprzez dodawanie szumu do nagrań.

In [61]:
def add_noise_from_folder(waveform, noise_files, noise_std=0.01):
    """
    Mix a randomly chosen noise recording into ``waveform``.

    The noise is tiled when it is shorter than the signal, randomly cropped
    when it is longer, rescaled so that its standard deviation equals
    ``noise_std`` and finally added to the signal.

    Args:
        waveform: audio tensor of shape [channels, N] (typically [1, N]).
        noise_files: sequence of paths to noise recordings readable by
            ``torchaudio.load``.
        noise_std: standard deviation of the noise after rescaling.

    Returns:
        A new tensor with the same shape as ``waveform``.

    Raises:
        ValueError: if the selected noise file contains no samples.
    """
    noise_path = random.choice(noise_files)
    # NOTE(review): the sample rate of the noise file is ignored, so the noise
    # is assumed to be recorded at the same rate as ``waveform`` - confirm.
    noise_waveform, _ = torchaudio.load(noise_path)

    if noise_waveform.shape[1] == 0:
        # Without this guard the tiling below divides by zero.
        raise ValueError(f"Noise file contains no samples: {noise_path}")

    # A noise file with a different channel layout (e.g. stereo noise for a
    # mono signal) would silently broadcast the sum to the wrong shape, so it
    # is downmixed to a single channel that broadcasts over the signal.
    if noise_waveform.shape[0] != waveform.shape[0]:
        noise_waveform = noise_waveform.mean(dim=0, keepdim=True)

    L = waveform.shape[1]

    # Noise shorter than the signal -> tile it until it is long enough.
    if noise_waveform.shape[1] < L:
        repeats = L // noise_waveform.shape[1] + 1
        noise_waveform = noise_waveform.repeat(1, repeats)
    # Noise longer than the signal -> take a random fragment of length L.
    if noise_waveform.shape[1] > L:
        start = random.randint(0, noise_waveform.shape[1] - L)
        noise_waveform = noise_waveform[:, start:start + L]

    # Normalise to unit standard deviation (epsilon avoids division by zero
    # for silent noise), then scale to the requested level.
    noise_waveform = noise_waveform / (noise_waveform.std() + 1e-9) * noise_std
    return waveform + noise_waveform

### Klasa Dataset na surowym audio
Podstawowa klasa Dataset zwracająca pary nagrań (surowe waveformy), informację, czy należą do tej samej klasy, oraz — opcjonalnie — cechy OpenL3 obu nagrań.

In [62]:
class SiameseAudioDataset(Dataset):
    """
    Dataset of recording pairs for a siamese comparison task.

    Item ``idx`` pairs the recording in row ``idx`` with a different, randomly
    chosen row. Both recordings are downmixed to mono and padded/trimmed to
    ``max_len`` samples; the second one optionally gets background noise.

    Args:
        df: DataFrame with a ``path`` column (relative to ``root_dir``) and a
            ``label`` column convertible to ``int``.
        root_dir: directory that the ``path`` values are relative to.
        feature_extractor: optional object exposing
            ``extract_features(tensor)``; if it also has a ``sample_rate``
            attribute, audio is resampled to that rate before extraction.
        noise_files: optional list of noise recordings mixed into the second
            recording of every pair.
        max_len: number of samples every returned waveform has.
    """

    def __init__(self, df, root_dir, feature_extractor=None, noise_files=None, max_len=160000):
        self.df = df.reset_index(drop=True)
        self.root_dir = root_dir
        self.feature_extractor = feature_extractor
        self.noise_files = noise_files
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """
        Returns the tuple ``(waveform_a, label_a, waveform_b, label_b,
        same_label, sample_rate, features_a, features_b)``. ``sample_rate``
        is the rate of the first recording; both feature entries are ``None``
        when no feature extractor was given.
        """
        row_a = self.df.iloc[idx]
        label_a = int(row_a['label'])
        waveform_a, sample_rate = self._load_waveform(row_a['path'])

        # Random choice of the second recording.
        row_b = self.df.iloc[self._pick_partner(idx)]
        label_b = int(row_b['label'])
        waveform_b, sample_rate_b = self._load_waveform(row_b['path'])

        # Add noise when noise files are available.
        if self.noise_files:
            waveform_b = add_noise_from_folder(waveform_b, self.noise_files, noise_std=0.01)

        same_label = 1 if label_a == label_b else 0

        features_a = None
        features_b = None
        if self.feature_extractor:
            features_a = self._extract_features(waveform_a, sample_rate)
            features_b = self._extract_features(waveform_b, sample_rate_b)

        return waveform_a, label_a, waveform_b, label_b, same_label, sample_rate, features_a, features_b

    def _pick_partner(self, idx):
        """Return a random row index different from ``idx`` (``idx`` itself if there is only one row)."""
        n = len(self.df)
        if n < 2:
            # A rejection loop would never terminate here; pair the only
            # recording with itself instead.
            return idx
        # A non-zero offset modulo n is uniform over all other rows and needs
        # no retry loop.
        return (idx + random.randint(1, n - 1)) % n

    def _load_waveform(self, relative_path):
        """Load one recording as a mono ``[1, max_len]`` tensor and return it with its sample rate."""
        waveform, sample_rate = torchaudio.load(os.path.join(self.root_dir, relative_path))
        if waveform.shape[0] > 1:
            # The rest of the pipeline (collation, feature extraction)
            # assumes a single channel.
            waveform = waveform.mean(dim=0, keepdim=True)
        return self._pad_or_trim(waveform), sample_rate

    def _extract_features(self, waveform, sample_rate):
        """Run the feature extractor on a single ``[1, samples]`` waveform."""
        # The extractor interprets its input at its own fixed sample rate, so
        # audio recorded at a different rate has to be resampled first.
        target_rate = getattr(self.feature_extractor, "sample_rate", None)
        if target_rate and target_rate != sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
        with torch.no_grad():
            # unsqueeze -> [1, 1, samples]; squeeze drops the batch dimension
            # of the extractor output again.
            return self.feature_extractor.extract_features(waveform.unsqueeze(0)).squeeze(0)

    def _pad_or_trim(self, waveform):
        """Cut or zero-pad ``waveform`` along the time axis to exactly ``max_len`` samples."""
        L = waveform.shape[1]
        if L > self.max_len:
            waveform = waveform[:, :self.max_len]
        elif L < self.max_len:
            waveform = torch.nn.functional.pad(waveform, (0, self.max_len - L))
        return waveform

In [63]:
def siamese_collate(batch):
    """
    Collate a list of dataset items into one batch tuple.

    Waveforms (positions 0 and 2) are stacked, the labels and the same-class
    flag (positions 1, 3, 4) become ``torch.long`` tensors, the sample rate is
    taken from the first item, and the feature entries (positions 6 and 7)
    are stacked unless the first item carries ``None`` there.
    """
    def column(position):
        # All values found at one tuple position across the batch.
        return [sample[position] for sample in batch]

    def as_long(position):
        return torch.tensor(column(position), dtype=torch.long)

    def stack_if_present(position):
        if batch[0][position] is None:
            return None
        return torch.stack(column(position))

    a = torch.stack(column(0))
    label_a = as_long(1)
    b = torch.stack(column(2))
    label_b = as_long(3)
    same_label = as_long(4)
    sample_rate = batch[0][5]
    features_a = stack_if_present(6)
    features_b = stack_if_present(7)

    return a, label_a, b, label_b, same_label, sample_rate, features_a, features_b

### DataModule dla surowego audio
Klasa LightningDataModule zarządzająca datasetami treningowymi i walidacyjnymi dla surowego audio.

In [64]:
class SiameseAudioDataModule(LightningDataModule):
    """
    Lightning data module that splits a DataFrame of recordings into training
    and validation ``SiameseAudioDataset`` instances and serves them through
    DataLoaders using ``siamese_collate``.
    """

    def __init__(self, df, root_dir, feature_extractor=None, noise_folder=None, batch_size=4, num_workers=0, max_len=160000):
        super().__init__()
        self.df = df
        self.root_dir = root_dir
        self.feature_extractor = feature_extractor
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.max_len = max_len

        # Collect every *.wav file directly inside the noise folder, if one
        # was given.
        if noise_folder:
            self.noise_files = list(Path(noise_folder).glob("*.wav"))
        else:
            self.noise_files = None

    def setup(self, stage=None):
        """Create the stratified 80/20 train/validation datasets (fixed random_state)."""
        train_df, val_df = train_test_split(
            self.df,
            test_size=0.2,
            random_state=42,
            stratify=self.df['label'],
        )
        self.train_dataset = self._make_dataset(train_df)
        self.val_dataset = self._make_dataset(val_df)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Loader over the validation split, in fixed order."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def _make_dataset(self, frame):
        # Both splits share the same root, extractor, noise files and length.
        return SiameseAudioDataset(
            frame,
            self.root_dir,
            self.feature_extractor,
            self.noise_files,
            max_len=self.max_len,
        )

    def _make_loader(self, dataset, shuffle):
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            collate_fn=siamese_collate,
        )

In [65]:
# Table of training recordings; the dataset class reads its 'path' and
# 'label' columns.
df = pd.read_csv("dataset/MAD_dataset/training.csv")

# Data module without a feature extractor: batches carry raw audio only.
dm = SiameseAudioDataModule(df, root_dir="dataset/MAD_dataset", noise_folder=noise_folder,
                            batch_size=8, max_len=160000)
dm.setup()

# Fetch a single training batch as a smoke test. The last two entries (the
# feature tensors) are None here and are discarded.
batch = next(iter(dm.train_dataloader()))
a, label_a, b, label_b, same_label, sample_rate, _, _ = batch

print("Batch shapes:", a.shape, b.shape)
print("Labely czy takie same:", same_label)

Batch shapes: torch.Size([8, 1, 160000]) torch.Size([8, 1, 160000])
Labely czy takie same: tensor([0, 0, 0, 0, 1, 1, 0, 0])


In [66]:
#Audio(a[0].squeeze().numpy(), rate=sample_rate)  # clean recording
Audio(b[0].squeeze().numpy(), rate=sample_rate)  # second recording of the pair (noise is added only if noise files exist)

### Klasa ekstraktora cech OpenL3
Klasa wrapper dla modelu OpenL3 służąca do ekstrakcji embeddingów z plików audio.

In [67]:
class OpenL3FeatureExtractor:
    """
    Audio feature extractor backed by a frozen OpenL3 embedding model.
    """
    def __init__(self, input_repr="mel128", content_type="music", embedding_size=512, hop_size=0.1):
        """
        input_repr: "mel128" or "mel256"
        content_type: "music" or "environmental"
        embedding_size: 512 or 6144
        hop_size: hop between consecutive embedding frames, passed to
            ``openl3.get_audio_embedding``
        """
        print(f"Ładowanie modelu OpenL3: {input_repr}, {content_type}, embedding_size={embedding_size}")
        self.model = openl3.models.load_audio_embedding_model(
            input_repr=input_repr,
            content_type=content_type,
            embedding_size=embedding_size
        )

        # Freeze the model and every layer: it is used for inference only.
        self.model.trainable = False
        for layer in self.model.layers:
            layer.trainable = False

        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        # Sample rate assumed for input audio when the caller gives none.
        self.sample_rate = 48000
        self.hop_size = hop_size

    def extract_features(self, waveform, sample_rate=None):
        """
        Compute one time-averaged OpenL3 embedding per recording.

        Args:
            waveform: torch tensor or numpy array shaped [samples],
                [batch, samples] or [batch, channels, samples]. Multi-channel
                input is averaged over the channel axis.
            sample_rate: sample rate of ``waveform`` in Hz; defaults to
                ``self.sample_rate``.

        Returns:
            float32 tensor with one row per recording.

        Raises:
            ValueError: if ``waveform`` has more than three dimensions or
                none at all.
        """
        if sample_rate is None:
            sample_rate = self.sample_rate

        # Convert first, so numpy input is accepted as well and tensors that
        # live on another device or are attached to a graph do not fail.
        if isinstance(waveform, torch.Tensor):
            waveform_np = waveform.detach().cpu().numpy()
        else:
            waveform_np = np.asarray(waveform)

        if waveform_np.ndim == 1:
            # A single recording: add the batch axis so the loop below
            # iterates over recordings rather than over samples.
            waveform_np = waveform_np[np.newaxis, :]
        elif waveform_np.ndim == 3:
            # [batch, channels, samples] -> [batch, samples]
            waveform_np = waveform_np.mean(axis=1)
        elif waveform_np.ndim != 2:
            raise ValueError(
                f"Expected waveform with 1 to 3 dimensions, got shape {waveform_np.shape}"
            )

        features_batch = []
        for wav in waveform_np:
            embedding, _ = openl3.get_audio_embedding(
                wav,
                sr=sample_rate,
                model=self.model,
                hop_size=self.hop_size,
                verbose=False
            )
            # Average the per-frame embeddings over time.
            features_batch.append(np.mean(embedding, axis=0))

        return torch.tensor(np.array(features_batch), dtype=torch.float32)

# Build the shared extractor instance used by the following cells.
# NOTE(review): content_type="music" is used although the class docstring
# also lists "environmental" - confirm which one suits this dataset.
print("Inicjalizacja OpenL3 Feature Extractor...")
feature_extractor = OpenL3FeatureExtractor(
    input_repr="mel128",
    content_type="music",
    embedding_size=512
)
print("OpenL3 ekstraktor załadowany")

Inicjalizacja OpenL3 Feature Extractor...
Ładowanie modelu OpenL3: mel128, music, embedding_size=512
OpenL3 ekstraktor załadowany
OpenL3 ekstraktor załadowany


In [68]:
# Smoke test: run the extractor directly on the raw-audio batch `a` fetched
# in an earlier cell and report the shape and value range of the result.
print("Testowanie ekstraktora cech na batchu...")
print(f"Wymiary audio wejściowego: {a.shape}")

features = feature_extractor.extract_features(a)
print(f"Wymiary cech OpenL3: {features.shape}")
print(f"Zakres wartości: min={features.min():.4f}, max={features.max():.4f}")
print("Ekstrakcja cech powiodła się")

Testowanie ekstraktora cech na batchu...
Wymiary audio wejściowego: torch.Size([8, 1, 160000])
Wymiary cech OpenL3: torch.Size([8, 512])
Zakres wartości: min=-0.3359, max=5.2183
Ekstrakcja cech powiodła się
Wymiary cech OpenL3: torch.Size([8, 512])
Zakres wartości: min=-0.3359, max=5.2183
Ekstrakcja cech powiodła się


In [69]:
# Rebuild the data module, this time with the OpenL3 extractor, so every
# dataset item also carries the features of both recordings.
# Note: this rebinds `dm`; the training cell uses this instance.
print("Inicjalizacja DataModule z OpenL3 ekstraktorem...")
dm = SiameseAudioDataModule(
    df, 
    root_dir="dataset/MAD_dataset", 
    feature_extractor=feature_extractor,
    noise_folder=noise_folder,
    batch_size=4,
    max_len=160000
)
dm.setup()

# Fetch one training batch and unpack it in the order produced by
# siamese_collate.
print("Pobieranie batcha z cechami...")
batch_with_features = next(iter(dm.train_dataloader()))
a_batch, label_a_batch, b_batch, label_b_batch, same_label_batch, sr, features_a, features_b = batch_with_features

# Report shapes, the pair labels and a preview of the features.
print("\n=== Wyniki ===")
print(f"Audio a wymiary: {a_batch.shape}")
print(f"Audio b wymiary: {b_batch.shape}")
print(f"Cechy OpenL3 a wymiary: {features_a.shape}")
print(f"Cechy OpenL3 b wymiary: {features_b.shape}")
print(f"Etykiety (same_label): {same_label_batch}")
print(f"\nUśrednione cechy OpenL3 dla pierwszej pary:")
print(f"  Features A: {features_a[0][:10]}... (pierwsze 10 wartości)")
print(f"  Features B: {features_b[0][:10]}... (pierwsze 10 wartości)")
print(f"\nZakres wartości cech A: min={features_a.min():.4f}, max={features_a.max():.4f}")
print(f"Zakres wartości cech B: min={features_b.min():.4f}, max={features_b.max():.4f}")
print("\nEkstrakcja cech z DataModule powiodła się!")

Inicjalizacja DataModule z OpenL3 ekstraktorem...
Pobieranie batcha z cechami...

=== Wyniki ===
Audio a wymiary: torch.Size([4, 1, 160000])
Audio b wymiary: torch.Size([4, 1, 160000])
Cechy OpenL3 a wymiary: torch.Size([4, 512])
Cechy OpenL3 b wymiary: torch.Size([4, 512])
Etykiety (same_label): tensor([0, 0, 0, 0])

Uśrednione cechy OpenL3 dla pierwszej pary:
  Features A: tensor([2.2075, 0.7346, 2.4403, 3.4444, 0.8512, 1.8949, 0.9153, 0.5021, 3.3765,
        1.6570])... (pierwsze 10 wartości)
  Features B: tensor([2.0363, 1.5574, 2.4478, 3.7002, 1.1426, 1.9358, 0.8230, 0.5359, 3.8668,
        1.3570])... (pierwsze 10 wartości)

Zakres wartości cech A: min=-0.3038, max=5.1407
Zakres wartości cech B: min=-0.3990, max=5.6558

Ekstrakcja cech z DataModule powiodła się!

=== Wyniki ===
Audio a wymiary: torch.Size([4, 1, 160000])
Audio b wymiary: torch.Size([4, 1, 160000])
Cechy OpenL3 a wymiary: torch.Size([4, 512])
Cechy OpenL3 b wymiary: torch.Size([4, 512])
Etykiety (same_label): tens

In [70]:
# Compare the feature vectors of at most the first two pairs of the batch
# fetched in the previous cell.
print("\n=== Analiza cech dla pary nagrań ===")
for i in range(min(2, batch_with_features[0].shape[0])):
    print(f"\nPara {i+1} (same_label={same_label_batch[i].item()}):")
    feat_a = features_a[i]
    feat_b = features_b[i]
    
    # cosine_similarity works on batches, hence the temporary leading axis.
    cos_sim = torch.nn.functional.cosine_similarity(
        feat_a.unsqueeze(0), feat_b.unsqueeze(0)
    ).item()
    
    # Euclidean (L2) distance between the two feature vectors.
    eucl_dist = torch.norm(feat_a - feat_b).item()
    
    print(f"  Kosinus podobieństwa: {cos_sim:.4f}")
    print(f"  Dystans euklidesowy: {eucl_dist:.4f}")
    print(f"  Średnia cecha A: {feat_a.mean():.4f}")
    print(f"  Średnia cecha B: {feat_b.mean():.4f}")
    
print("\n✓ Analiza kompletna!")


=== Analiza cech dla pary nagrań ===

Para 1 (same_label=0):
  Kosinus podobieństwa: 0.9815
  Dystans euklidesowy: 10.1594
  Średnia cecha A: 1.7359
  Średnia cecha B: 1.9247

Para 2 (same_label=0):
  Kosinus podobieństwa: 0.9691
  Dystans euklidesowy: 11.2659
  Średnia cecha A: 1.7674
  Średnia cecha B: 1.7945

✓ Analiza kompletna!


### Model Syjamski (LightningModule)
Definicja modelu sieci neuronowej (klasyfikatora), który przyjmuje wartość bezwzględną różnicy cech dwóch nagrań i decyduje, czy należą one do tej samej klasy.

In [71]:
class SiameseComparator(pl.LightningModule):
    """
    Binary classifier that decides whether two recordings share a class.

    The input to the network is the element-wise absolute difference of the
    two feature vectors; the output is a probability in [0, 1].

    Args:
        input_dim: size of each feature vector.
        hidden_dim: width of the first hidden layer (the second has half).
        learning_rate: initial learning rate of the Adam optimizer.
    """

    def __init__(self, input_dim=512, hidden_dim=256, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()

        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(hidden_dim // 2, 1),
            nn.Sigmoid()
        )

        self.loss_fn = nn.BCELoss()

        # Separate metric objects per stage, so training batches never leak
        # into validation statistics. `accuracy` tracks training,
        # `val_accuracy` and `f1_score` track validation.
        self.accuracy = torchmetrics.Accuracy(task="binary")
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.f1_score = torchmetrics.F1Score(task="binary")

    def forward(self, feat_a, feat_b):
        """Return same-class probabilities of shape [batch, 1]."""
        diff = torch.abs(feat_a - feat_b)

        return self.classifier(diff)

    def _shared_step(self, batch):
        """Compute loss, hard predictions and targets for one batch."""
        _, _, _, _, same_label, _, features_a, features_b = batch
        if features_a is None or features_b is None:
            raise ValueError(
                "Batch contains no features; the dataset must be built with a feature extractor."
            )

        # squeeze(-1) removes only the output dimension; a bare squeeze()
        # would also collapse a batch of one to a 0-d tensor, which no longer
        # matches the shape of the target.
        probs = self(features_a, features_b).squeeze(-1)
        loss = self.loss_fn(probs, same_label.float())
        preds = (probs > 0.5).long()
        return loss, preds, same_label

    def training_step(self, batch, batch_idx):
        loss, preds, same_label = self._shared_step(batch)

        self.accuracy(preds, same_label)
        self.log("train_loss", loss, prog_bar=True)
        # Logging the metric object lets Lightning compute and reset it.
        self.log("train_acc", self.accuracy, prog_bar=True)

        return loss

    def validation_step(self, batch, batch_idx):
        loss, preds, same_label = self._shared_step(batch)

        # Accumulate over the whole validation epoch: averaging per-batch F1
        # values is not the F1 score of the validation set.
        self.val_accuracy.update(preds, same_label)
        self.f1_score.update(preds, same_label)

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1", self.f1_score, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Adam plus a scheduler that halves the learning rate when ``val_loss`` stops improving."""
        optimizer = optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss"
            }
        }

### Trening modelu

In [72]:
# input_dim matches the 512-dimensional embeddings requested from the
# extractor (embedding_size=512).
model = SiameseComparator(input_dim=512, hidden_dim=256, learning_rate=0.001)

# accelerator="auto" lets Lightning pick the hardware; metrics are logged
# every 5 training steps.
trainer = pl.Trainer(
    max_epochs=10,
    accelerator="auto",
    devices=1,
    log_every_n_steps=5
)

# Train on the data module created with the feature extractor, so batches
# contain the features the model reads.
print("Rozpoczynam trening...")
trainer.fit(model, datamodule=dm)
print("Trening zakończony!")

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Rozpoczynam trening...



Detected KeyboardInterrupt, attempting graceful shutdown ...


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
