Import bibliotek

In [None]:
!pip install pandas
!pip install torch
!pip install torchaudio
!pip install lightning
!pip install kagglehub
!pip install scikit-learn
!pip install ipython
!pip install soundfile
!pip install wandb
!pip install onnx onnxscript onnxruntime

!pip install "resampy>=0.4.0"
!pip install numpy scipy tqdm requests julius
!pip install torchopenl3 --no-deps
!pip install torchcodec



In [None]:
import os
import shutil
import pandas as pd
import torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning import LightningDataModule
from pytorch_lightning.loggers import WandbLogger
import kagglehub
from sklearn.model_selection import train_test_split
import random
from pathlib import Path
import torchmetrics
import torch.optim as optim
import torchopenl3
import numpy as np
import wandb
from IPython.display import Audio
from tqdm.auto import tqdm
import gdown
import glob

wandb.login()

True

Download dataset

In [None]:
# Directory where the notebook keeps its local copy of the dataset.
target_dir = "dataset"

# Download only when the directory is missing or empty; otherwise reuse its content.
if not os.path.exists(target_dir) or len(os.listdir(target_dir)) == 0:
    print("Dataset nie znaleziony. Rozpoczynam pobieranie...")
    path = kagglehub.dataset_download("junewookim/mad-dataset-military-audio-dataset")
    print("Cache KaggleHub:", path)

    # Copy the downloaded files into the working directory.
    os.makedirs(target_dir, exist_ok=True)
    shutil.copytree(path, target_dir, dirs_exist_ok=True)
    print("Pobrano i zapisano do:", target_dir)
else:
    print(f"Dataset ju≈º istnieje w folderze '{target_dir}'. Pomijam pobieranie.")

# Make sure the folder for noise recordings exists.
noise_folder = "dataset/noises"
os.makedirs(noise_folder, exist_ok=True)

Dataset ju≈º istnieje w folderze 'dataset'. Pomijam pobieranie.


### Pobieranie szumów z dysku Google
Ten fragment pobiera szumy z dysku Google (np. na Colaba), jeżeli w docelowym folderze nie ma jeszcze żadnego pliku szumu (`*.wav`). Aby dodać inne szumy, należy umieścić je w folderze pod tym adresem URL: https://drive.google.com/drive/folders/14Q_0KNDXACkFQ2oTF1T-gnjIaNbNuaKL?usp=sharing

In [None]:
# Shared Google Drive folder with the noise recordings and the local folder they go to.
url = "https://drive.google.com/drive/folders/14Q_0KNDXACkFQ2oTF1T-gnjIaNbNuaKL?usp=sharing"
output_folder = "dataset/noises"

os.makedirs(output_folder, exist_ok=True)

existing_wavs = list(Path(output_folder).glob("*.wav"))

if len(existing_wavs) > 0:
    # Any .wav already present counts as "downloaded"; nothing is fetched again.
    print(f"Folder {output_folder} zawiera ju≈º {len(existing_wavs)} plik√≥w. Pomijam pobieranie.")
else:
    print("Folder pusty. Rozpoczynam pobieranie szum√≥w z Google Drive...")
    try:
        downloaded = gdown.download_folder(url, output=output_folder, quiet=False, use_cookies=False)
        # gdown reports a failed folder download through a falsy return value rather
        # than an exception, so the result has to be checked before claiming success.
        if not downloaded:
            raise RuntimeError("gdown.download_folder returned no files")
        print("Pobieranie zako≈Ñczone sukcesem.")
    except Exception as e:
        # Best effort: report the problem and continue; the file count is printed below.
        print(f"WystƒÖpi≈Ç b≈ÇƒÖd podczas pobierania: {e}")
        print("Upewnij siƒô, ≈ºe link na Google Drive jest ustawiony jako 'Ka≈ºdy majƒÖcy link' (Anyone with the link).")

noise_files_list = list(Path(output_folder).glob("*.wav"))
print(f"Gotowe. Dostƒôpnych plik√≥w szumu do treningu: {len(noise_files_list)}")

Folder dataset/noises zawiera ju≈º 5 plik√≥w. Pomijam pobieranie.
Gotowe. Dostƒôpnych plik√≥w szumu do treningu: 5


### Funkcja dodająca szum
Funkcja pomocnicza do augmentacji danych: losowe wzmocnienie, dodanie szumu o losowym SNR oraz opcjonalne przycięcie amplitudy nagrania.

In [None]:
def aggressive_augment(waveform, noise_files, sr=48000):
    """
    Augment a waveform with a random gain, background noise at a random SNR and
    occasional amplitude clipping.

    Args:
        waveform: Audio tensor indexed as [channels, samples]; its second
            dimension is used as the signal length.
        noise_files: Sequence of paths to noise recordings. When it is empty or
            None, no noise is mixed in.
        sr: Sample rate of `waveform`; a noise file with a different rate is
            resampled to it.

    Returns:
        A new tensor with the same shape as `waveform`; the input is not modified.
    """
    # 1. Random gain in [0.5, 1.5] (simulates different distances from the microphone).
    gain = random.uniform(0.5, 1.5)
    aug_wav = waveform * gain

    # 2. Additive noise taken from a randomly chosen file.
    if noise_files:
        noise_path = random.choice(noise_files)
        noise_wav, noise_sr = torchaudio.load(noise_path)

        if noise_sr != sr:
            noise_wav = torchaudio.transforms.Resample(noise_sr, sr)(noise_wav)

        # Down-mix multi-channel noise to mono so it broadcasts over the signal.
        if noise_wav.shape[0] > 1:
            noise_wav = noise_wav.mean(dim=0, keepdim=True)

        # Keep the noise on the same device as the signal before mixing.
        noise_wav = noise_wav.to(aug_wav.device)

        signal_len = aug_wav.shape[1]
        noise_len = noise_wav.shape[1]

        # An empty noise file cannot be tiled (it used to raise ZeroDivisionError);
        # in that case the noise step is skipped.
        if noise_len > 0:
            if noise_len < signal_len:
                # Tile the noise until it covers the signal, then cut it to length.
                repeats = -(-signal_len // noise_len)  # integer ceil division
                noise_wav = noise_wav.repeat(1, repeats)[:, :signal_len]
            elif noise_len > signal_len:
                # Take a random excerpt of the noise with the length of the signal.
                start = random.randint(0, noise_len - signal_len)
                noise_wav = noise_wav[:, start : start + signal_len]

            # Random SNR between 5 dB (loud noise) and 25 dB (quiet noise).
            snr_db = random.uniform(5.0, 25.0)

            # These are L2 norms (amplitudes), hence the /20 in the dB conversion.
            signal_norm = aug_wav.norm(p=2)
            noise_norm = noise_wav.norm(p=2)

            if noise_norm > 0:
                amplitude_ratio = 10 ** (snr_db / 20)
                scale = signal_norm / (noise_norm * amplitude_ratio + 1e-9)
                aug_wav = aug_wav + (noise_wav * scale)

    # 3. In roughly 70% of the calls clamp to [-0.95, 0.95] (simulated clipping).
    if random.random() > 0.3:
        aug_wav = torch.clamp(aug_wav, -0.95, 0.95)

    return aug_wav

### Klasa ekstraktora cech OpenL3
Klasa opakowująca (wrapper) model OpenL3, służąca do ekstrakcji embeddingów z nagrań audio.

In [None]:
class OpenL3FeatureExtractor:
    """
    Audio feature extractor built on a torchopenl3 (PyTorch) embedding model.
    """
    def __init__(self, input_repr="mel128", content_type="music", embedding_size=512, sample_rate=48000):
        """
        Load the embedding model and move it to the GPU when one is available.

        input_repr: "mel128" or "mel256"
        content_type: "music" or "env" ("environmental" is accepted as an alias of "env")
        embedding_size: 512 or 6144
        sample_rate: sample rate (Hz) of the waveforms later passed to `extract_features`
        """
        if content_type == "environmental":
            content_type = "env"

        print(f"≈Åadowanie modelu TorchOpenL3: {input_repr}, {content_type}, embedding_size={embedding_size}")

        self.model = torchopenl3.models.load_audio_embedding_model(
            input_repr=input_repr,
            content_type=content_type,
            embedding_size=embedding_size
        )

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

        self.input_repr = input_repr
        self.content_type = content_type
        self.embedding_size = embedding_size
        # Previously fixed to 8000 while extraction always used sr=48000; the attribute
        # is now the single source of truth for the rate handed to torchopenl3.
        self.sample_rate = sample_rate

    def extract_features(self, waveform):
        """
        Compute one embedding per clip by averaging the frame embeddings over time.

        waveform: PyTorch tensor [batch, channels, samples] or [channels, samples]
        Returns a CPU tensor with one row per clip.
        """
        if waveform.dim() == 2:
            waveform = waveform.unsqueeze(0)

        # Waveforms arrive channels-first, while torchopenl3 documents batched input
        # as (batch, samples, channels). Without this swap the sample axis would be
        # treated as the channel axis and averaged away.
        if waveform.dim() == 3:
            waveform = waveform.transpose(1, 2).contiguous()

        waveform = waveform.to(self.device)

        with torch.no_grad():
            embeddings, _ = torchopenl3.get_audio_embedding(
                waveform,
                sr=self.sample_rate,
                model=self.model,
                hop_size=0.1,
                verbose=False
            )

        # Average over the frame (time) dimension.
        aggregated = embeddings.mean(dim=1)

        return aggregated.cpu()

# Build the shared extractor once: mel128 input, "music" content type, 512-dim embeddings.
print("Inicjalizacja TorchOpenL3 Feature Extractor...")
feature_extractor = OpenL3FeatureExtractor("mel128", "music", 512)
print(f"OpenL3 ekstraktor za≈Çadowany na urzƒÖdzeniu: {feature_extractor.device}")

Inicjalizacja TorchOpenL3 Feature Extractor...
≈Åadowanie modelu TorchOpenL3: mel128, music, embedding_size=512
OpenL3 ekstraktor za≈Çadowany na urzƒÖdzeniu: cuda


In [None]:
def _load_fixed_length_audio(path, sample_rate, target_len):
    """Load `path`, resample it to `sample_rate` and zero-pad or trim it to `target_len` samples."""
    wav, sr = torchaudio.load(path)
    if sr != sample_rate:
        wav = torchaudio.transforms.Resample(sr, sample_rate)(wav)
    if wav.shape[1] > target_len:
        return wav[:, :target_len]
    return F.pad(wav, (0, target_len - wav.shape[1]))


def generate_augmented_dataset(
    df,
    root_dir,
    output_base_dir,
    feature_extractor,
    noise_files,
    augmentations_per_file=5,
    sample_rate=48000,
    target_len=160000,
):
    """
    Create augmented copies of every recording in `df` and cache their embeddings.

    For each row, `augmentations_per_file` noisy variants are written to
    `<output_base_dir>/audio/<name>_aug_<i>.wav` and their feature tensors to
    `<output_base_dir>/features/<name>_aug_<i>.pt`. Variants whose two files
    already exist are reused without recomputation.

    Args:
        df: DataFrame with 'path' (relative to `root_dir`) and 'label' columns.
        root_dir: Directory the 'path' values are relative to.
        output_base_dir: Directory that receives the `audio/` and `features/` folders.
        feature_extractor: Object exposing `.model`, `.device` and `.extract_features(waveform)`.
        noise_files: Noise file paths forwarded to `aggressive_augment`.
        augmentations_per_file: Number of variants generated per source recording.
        sample_rate: Rate (Hz) the audio is resampled to and saved with.
        target_len: Number of samples every clip is padded or trimmed to
            (160000 samples is about 3.3 s at 48 kHz).

    Returns:
        DataFrame with 'path', 'feature_path' and 'label' columns, one row per variant.
        A source file that raises an error is reported and its remaining variants skipped.
    """
    # NOTE(review): output names are derived from the basename only, so two source
    # files with the same name in different folders would share cache entries —
    # confirm basenames are unique in the dataset.
    audio_out_dir = os.path.join(output_base_dir, "audio")
    features_out_dir = os.path.join(output_base_dir, "features")

    os.makedirs(audio_out_dir, exist_ok=True)
    os.makedirs(features_out_dir, exist_ok=True)

    new_data = []

    print(f"Generowanie datasetu. Wersji na plik: {augmentations_per_file}")
    print(f"Folder docelowy: {output_base_dir}")

    # Make sure the model sits on the extractor's device.
    feature_extractor.model.to(feature_extractor.device)

    for _, row in tqdm(df.iterrows(), total=len(df)):
        original_path = str(row['path'])
        label = row['label']
        full_path = os.path.join(root_dir, original_path)

        # Base name without its extension (splitext removes only the real suffix).
        filename_base = os.path.splitext(os.path.basename(original_path))[0]

        # The source audio is decoded lazily, only if some variant is not cached yet.
        wav = None

        try:
            for i in range(augmentations_per_file):
                new_filename = f"{filename_base}_aug_{i}"
                wav_save_path = os.path.join(audio_out_dir, f"{new_filename}.wav")
                feat_save_path = os.path.join(features_out_dir, f"{new_filename}.pt")
                record = {
                    "path": wav_save_path,
                    "feature_path": feat_save_path,
                    "label": label
                }

                # Cache hit: both artefacts exist, so only register them.
                if os.path.exists(wav_save_path) and os.path.exists(feat_save_path):
                    new_data.append(record)
                    continue

                if wav is None:
                    wav = _load_fixed_length_audio(full_path, sample_rate, target_len)

                # A. Augmentation (on a copy, so every variant starts from the clean clip)
                noisy_wav = aggressive_augment(wav.clone(), noise_files, sr=sample_rate)

                # B. Feature extraction
                with torch.no_grad():
                    features = feature_extractor.extract_features(noisy_wav)
                    if features.dim() > 1:
                        features = features.squeeze(0)  # drop the batch dimension

                # C. Persist both artefacts
                torchaudio.save(wav_save_path, noisy_wav, sample_rate)
                torch.save(features.cpu(), feat_save_path)

                new_data.append(record)

        except Exception as e:
            # Best effort: report the failing source file and continue with the next one.
            print(f"B≈ÇƒÖd przy pliku {original_path}: {e}")

    return pd.DataFrame(new_data)

# Setup
features_cache_path = "dataset/processed_augmented"  # folder for the processed (augmented) data

# Noise recordings. The original location is tried first; when it holds no .wav
# files, fall back to "dataset/noises", the folder the download cell writes to.
noise_folder_path = "dataset/MAD_dataset/noise"
noise_files_list = list(Path(noise_folder_path).glob("*.wav"))
if not noise_files_list:
    noise_folder_path = "dataset/noises"
    noise_files_list = list(Path(noise_folder_path).glob("*.wav"))
if not noise_files_list:
    # Without noise files the augmentation only applies gain and clipping.
    print("UWAGA: nie znaleziono plików szumu (*.wav) - augmentacja nie doda szumu.")

# NOTE(review): `df_full` is not defined in any cell visible in this notebook; it
# must be a DataFrame with 'path' and 'label' columns — confirm where it is created.
df_augmented = generate_augmented_dataset(
    df=df_full,
    root_dir="dataset/MAD_dataset",
    output_base_dir=features_cache_path,
    feature_extractor=feature_extractor,
    noise_files=noise_files_list,
    augmentations_per_file=5  # 5 variants per file -> 5x more data
)

print(f"Nowy rozmiar datasetu: {len(df_augmented)}")

Generowanie datasetu. Wersji na plik: 5
Folder docelowy: dataset/processed_augmented


  0%|          | 0/6429 [00:00<?, ?it/s]

Nowy rozmiar datasetu: 32145


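### Klasa Dataset dla par nagrań
Klasa Dataset zwracająca pary nagrań w postaci wcześniej wyliczonych wektorów cech (embeddingów OpenL3) oraz informację, czy oba nagrania należą do tej samej klasy. Surowe waveformy nie są wczytywane – w ich miejscu zwracana jest atrapa.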

In [None]:
class SiameseAudioDataset(Dataset):
    """
    Dataset of feature pairs for siamese training.

    Each item pairs the anchor row `idx` with a randomly drawn partner: about half
    of the time a row with the same label (positive), otherwise a row with a
    different label (negative). Features are read from the files listed in the
    'feature_path' column; labels come from the 'label' column.
    """

    def __init__(self, df):
        self.df = df.reset_index(drop=True)

        # Labels, paths and a label -> row positions index are built once, so
        # __getitem__ does not have to scan the whole DataFrame for every sample.
        self._labels = [int(label) for label in self.df['label']]
        self._feature_paths = list(self.df['feature_path'])
        self._indices_by_label = {}
        for position, label in enumerate(self._labels):
            self._indices_by_label.setdefault(label, []).append(position)

    def __len__(self):
        return len(self.df)

    def _sample_positive(self, idx, label):
        """Return a random row of class `label`, other than `idx` whenever the class has one."""
        candidates = self._indices_by_label[label]
        if len(candidates) == 1:
            return candidates[0]  # only the anchor itself is available
        # Draw among the first n-1 candidates; if the anchor comes up, use the last
        # one instead, which makes the draw uniform over all rows except the anchor.
        pick = candidates[random.randrange(len(candidates) - 1)]
        return candidates[-1] if pick == idx else pick

    def _sample_negative(self, label):
        """Return a random row whose label differs from `label` (uniform over those rows)."""
        if len(self._indices_by_label[label]) == len(self._labels):
            raise ValueError("Cannot draw a negative pair: all rows share one label.")
        while True:
            candidate = random.randrange(len(self._labels))
            if self._labels[candidate] != label:
                return candidate

    def __getitem__(self, idx):
        if idx < 0:
            idx += len(self._labels)

        # 1. Anchor: the stored feature tensor is loaded, no audio processing needed.
        label_a = self._labels[idx]
        feat_a = torch.load(self._feature_paths[idx])

        # 2. Partner: positive (same class) or negative (other class).
        if random.random() > 0.5:
            idx_b = self._sample_positive(idx, label_a)
            same_label = 1
        else:
            idx_b = self._sample_negative(label_a)
            same_label = 0

        label_b = self._labels[idx_b]
        feat_b = torch.load(self._feature_paths[idx_b])

        # Placeholder for the raw audio slots, kept so the tuple layout stays the
        # same; zeros instead of uninitialised memory.
        dummy = torch.zeros(1)

        return dummy, label_a, dummy, label_b, same_label, 48000, feat_a, feat_b

In [None]:
def siamese_collate(batch):
    """
    Merge a list of dataset items into batched tensors.

    Every item is the 8-tuple (a, label_a, b, label_b, same_label, sample_rate,
    features_a, features_b). Tensor fields are stacked, integer fields become
    long tensors, and the sample rate is taken from the first item. A feature
    field is returned as None when the first item carries None in that slot.
    """
    def column(position):
        # All values stored at one tuple position, in batch order.
        return [item[position] for item in batch]

    def stacked_or_none(position):
        # Whether the field is present is decided by the first item only.
        if batch[0][position] is None:
            return None
        return torch.stack(column(position))

    a = torch.stack(column(0))
    label_a = torch.tensor(column(1), dtype=torch.long)
    b = torch.stack(column(2))
    label_b = torch.tensor(column(3), dtype=torch.long)
    same_label = torch.tensor(column(4), dtype=torch.long)
    sample_rate = batch[0][5]

    features_a = stacked_or_none(6)
    features_b = stacked_or_none(7)

    return a, label_a, b, label_b, same_label, sample_rate, features_a, features_b

### DataModule dla par nagrań
Klasa LightningDataModule zarządzająca datasetami treningowym i walidacyjnym (podział 80/20 ze stratyfikacją po etykiecie) oraz ich DataLoaderami.

In [None]:

class SiameseAudioDataModule(LightningDataModule):
    """
    Lightning data module that splits a DataFrame 80/20 into train and validation
    siamese datasets and wraps them in DataLoaders.

    Args:
        df: DataFrame with 'label' and 'feature_path' columns; an optional 'path'
            column is used to keep augmentations of one recording together.
        batch_size: Batch size of both loaders.
        num_workers: Number of DataLoader worker processes.
    """

    # Suffix "_aug_<n>" placed right before the file extension of an augmented copy.
    _AUG_SUFFIX = r"_aug_\d+(?=\.[^./\\]+$)"

    def __init__(self, df, batch_size=32, num_workers=4):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        df = self.df.reset_index(drop=True)

        # Split by source recording, not by row: a row-level split would put
        # augmentations of the same recording into both train and validation,
        # which leaks information into the validation metrics.
        if "path" in df.columns:
            source_ids = df["path"].astype(str).str.replace(self._AUG_SUFFIX, "", regex=True)
        else:
            # No path information: every row is treated as its own source.
            source_ids = pd.Series(df.index.astype(str), index=df.index)

        sources = pd.DataFrame({"source": source_ids, "label": df["label"]}).drop_duplicates("source")
        _, val_sources = train_test_split(
            sources["source"], test_size=0.2, random_state=42, stratify=sources["label"]
        )

        is_val = source_ids.isin(set(val_sources))
        self.train_ds = SiameseAudioDataset(df[~is_val])
        self.val_ds = SiameseAudioDataset(df[is_val])

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers, collate_fn=siamese_collate)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers, collate_fn=siamese_collate)

In [None]:
# Smoke test: build the data module from the augmented DataFrame (the class takes
# no features_dir argument) and pull a single training batch.
dm_test = SiameseAudioDataModule(df=df_augmented, batch_size=32, num_workers=4)
dm_test.setup()
batch = next(iter(dm_test.train_dataloader()))
# Index 6 of the batch holds features_a; with batch_size=32 this should be [32, 512].
print("Features shape:", batch[6].shape)
print("Dzia≈Ça!")

Features shape: torch.Size([32, 512])
Dzia≈Ça!


### Model Syjamski (LightningModule)
Definicja modelu sieci neuronowej (klasyfikatora), który przyjmuje wektory cech dwóch nagrań, łączy je w wektor [u, v, |u − v|, u · v] i decyduje, czy oba nagrania należą do tej samej klasy.

In [None]:
class ResidualBlock(nn.Module):
    """
    Residual block for an MLP.

    Computes ReLU(block(x) + x), where `block` is
    Linear -> BatchNorm -> ReLU -> Dropout -> Linear -> BatchNorm
    and every layer keeps the width `hidden_dim`.
    """
    def __init__(self, hidden_dim, dropout_rate=0.3):
        super().__init__()
        layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
        ]
        self.block = nn.Sequential(*layers)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Skip connection: add the input back before the final activation.
        transformed = self.block(x)
        return self.relu(transformed + x)

class SiameseComparator(pl.LightningModule):
    """
    Binary classifier that decides whether two feature vectors belong to the same class.

    Both inputs are L2-normalised and combined into [u, v, |u - v|, u * v]; the
    result passes through an input projection, four residual blocks and a
    classification head that outputs a single logit per pair.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the projection and of the residual blocks.
        learning_rate: Learning rate for AdamW.
    """
    def __init__(self, input_dim=512, hidden_dim=1024, learning_rate=5e-4):
        super().__init__()
        self.save_hyperparameters()

        # Input: [u, v, |u-v|, u*v] -> 4 * input_dim
        concat_dim = input_dim * 4

        # 1. Projection of the concatenated input into the hidden space
        self.input_projection = nn.Sequential(
            nn.Linear(concat_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU()
        )

        # 2. Deep part of the network: a stack of residual blocks
        self.res_blocks = nn.Sequential(
            ResidualBlock(hidden_dim, dropout_rate=0.4),
            ResidualBlock(hidden_dim, dropout_rate=0.4),
            ResidualBlock(hidden_dim, dropout_rate=0.4),
            ResidualBlock(hidden_dim, dropout_rate=0.3)
        )

        # 3. Classification head: hidden_dim -> hidden_dim // 2 -> 256 -> 1 logit
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(hidden_dim // 2, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),

            nn.Linear(256, 1)
        )

        self.loss_fn = nn.BCEWithLogitsLoss()
        # Separate metric objects per stage, so training batches never leak into the
        # validation state. `accuracy` serves training; `val_accuracy` and `f1_score`
        # serve validation.
        self.accuracy = torchmetrics.Accuracy(task="binary")
        self.val_accuracy = torchmetrics.Accuracy(task="binary")
        self.f1_score = torchmetrics.F1Score(task="binary")

    def forward(self, feat_a, feat_b):
        """Return one logit per pair, shape [batch, 1]."""
        # L2-normalise both embeddings before combining them.
        u = F.normalize(feat_a, p=2, dim=1)
        v = F.normalize(feat_b, p=2, dim=1)

        features = torch.cat([
            u,
            v,
            torch.abs(u - v),
            u * v
        ], dim=1)

        x = self.input_projection(features)
        x = self.res_blocks(x)
        return self.head(x)

    def _shared_step(self, batch):
        """Run the model on a collated batch and return (loss, hard predictions, targets)."""
        _, _, _, _, same_label, _, features_a, features_b = batch

        logits = self(features_a, features_b).squeeze(1)
        loss = self.loss_fn(logits, same_label.float())

        preds = (torch.sigmoid(logits) > 0.5).long()
        return loss, preds, same_label

    def training_step(self, batch, batch_idx):
        loss, preds, target = self._shared_step(batch)

        self.accuracy(preds, target)
        self.log("train_loss", loss, prog_bar=True)
        # Logging the metric object lets Lightning handle compute/reset itself.
        self.log("train_acc", self.accuracy, on_step=True, on_epoch=False, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, preds, target = self._shared_step(batch)

        # Accumulate over the whole validation epoch: averaging per-batch F1 values
        # is not the F1 score of the epoch.
        self.val_accuracy.update(preds, target)
        self.f1_score.update(preds, target)

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val_f1", self.f1_score, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """AdamW with a ReduceLROnPlateau schedule driven by `val_loss`."""
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=1e-3)

        # Halve the learning rate after 3 epochs without val_loss improvement.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=3
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"}
        }

### Trening modelu

In [None]:
# Seed Python, NumPy and PyTorch (including DataLoader workers) so that weight
# initialisation and pair sampling are repeatable between runs.
pl.seed_everything(42, workers=True)

dm = SiameseAudioDataModule(
    df=df_augmented,
    batch_size=32,
    num_workers=4
)

model = SiameseComparator(input_dim=512, hidden_dim=256, learning_rate=0.001)

# log_model="all" asks the logger to upload every checkpoint to W&B.
wandb_logger = WandbLogger(
    project="siamese-audio-classifier",
    name="Kamil_Maj_2",
    log_model="all"
)

trainer = pl.Trainer(
    max_epochs=20,
    accelerator="auto",
    devices=1,
    logger=wandb_logger,
    log_every_n_steps=5
)

print("Rozpoczynam trening z logowaniem do W&B...")
try:
    trainer.fit(model, datamodule=dm)
    print("Trening zako≈Ñczony!")
finally:
    # Always close the W&B run, also when training fails or is interrupted;
    # otherwise the next WandbLogger silently reuses the unfinished run.
    wandb.finish()

INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/loggers/wandb.py:400: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Rozpoczynam trening z logowaniem do W&B...


Output()

**Zapisywanie do ONNX**

In [None]:
# Find every Lightning checkpoint below the project folder.
search_pattern = "siamese-audio-classifier/**/*.ckpt"
list_of_files = glob.glob(search_pattern, recursive=True)

if not list_of_files:
    print("Nie znaleziono checkpointu .ckpt do eksportu!")
else:
    # Export the most recently created checkpoint.
    latest_checkpoint = max(list_of_files, key=os.path.getctime)
    print(f"Eksportujƒô model z pliku: {latest_checkpoint}")

    device = torch.device("cpu")  # the export is done on CPU
    # map_location makes a checkpoint saved on a GPU loadable on a CPU-only machine.
    model_export = SiameseComparator.load_from_checkpoint(latest_checkpoint, map_location=device)
    model_export.to(device)
    model_export.eval()

    # Dummy inputs follow the feature size stored in the checkpoint's hyperparameters.
    export_input_dim = model_export.hparams.input_dim
    dummy_input_a = torch.randn(1, export_input_dim, device=device)
    dummy_input_b = torch.randn(1, export_input_dim, device=device)

    onnx_path = "siamese_audio_comparator.onnx"

    try:
        # NOTE: the exported output named 'similarity_score' is the raw logit of
        # forward(); apply a sigmoid downstream to obtain a probability.
        torch.onnx.export(
            model_export,
            (dummy_input_a, dummy_input_b),
            onnx_path,
            export_params=True,
            opset_version=12,
            do_constant_folding=True,
            input_names=['feature_vector_a', 'feature_vector_b'],
            output_names=['similarity_score'],
            dynamic_axes={
                'feature_vector_a': {0: 'batch_size'},
                'feature_vector_b': {0: 'batch_size'},
                'similarity_score': {0: 'batch_size'}
            }
        )
        print(f"Sukces! Model zapisany jako: {onnx_path}")

    except Exception as e:
        print(f"B≈ÇƒÖd podczas eksportu do ONNX: {e}")