In [2]:
from datasets import load_from_disk, DatasetDict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


import torch
from torch import optim
from torch import nn
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
from tqdm import tqdm

import torch.nn.functional as F
import torchmetrics


SEED=42
SAMPLE_RATE=16000
BATCH_SIZE=16
PIN_MEMORY=False
NUM_WORKERS = 24



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds = load_from_disk("../../data/datasets/ds_462700")
LABELS = ds.features["label"]

# When the dataset is not a datrasetDict, we split manually
if not isinstance(ds, DatasetDict):
    ds = ds.train_test_split(test_size=0.3, seed=SEED)
    test_and_valid = ds["test"].train_test_split(test_size=0.5, seed=SEED)

    ds = DatasetDict({
        "train": ds["train"],
        "valid": test_and_valid["train"],
        "test": test_and_valid["test"],
    })


ds["train"].shape

(323890, 3)

In [4]:
def collate_fn(batch):
    audios = [torch.tensor(x["audio"], dtype=torch.float32) for x in batch]
    labels = torch.tensor([x["label"] for x in batch], dtype=torch.long)

    max_len = max(a.shape[0] for a in audios)
    audios = torch.stack([F.pad(a, (0, max_len - a.shape[0])) for a in audios])
    return audios, labels

In [5]:
train_loader = DataLoader(dataset=ds["train"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)
test_loader = DataLoader(dataset=ds["test"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)
valid_loader = DataLoader(dataset=ds["valid"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)

In [6]:
class AudioCNN(nn.Module):
    def __init__(self, n_classes: int = len(LABELS.names)):
        super().__init__()

        self.net = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=9, stride=2, padding=4),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=9, stride=2, padding=4),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=9, stride=2, padding=4),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        # PRELU, sigmoid, reduire nb couche
        self.fc = nn.Linear(64, n_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.net(x).squeeze(-1)

        return self.fc(x)

model = AudioCNN()

In [23]:
class AudioCNN(nn.Module):
    def __init__(self, n_classes: int = len(LABELS.names)):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=9, stride=2, padding=4),
            nn.BatchNorm1d(16),
            nn.PReLU(),
        )
        self.fc = nn.Linear(16, n_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.net(x)
        x = x.mean(dim=-1)
        return self.fc(x)


model = AudioCNN()

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001)

In [25]:
from tqdm import tqdm

acc_metric = Accuracy(task='multiclass', num_classes=len(LABELS.names)).to(device)

EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    acc_metric.reset()

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [train]", leave=False)
    for x, y in pbar:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()
        out = model(x)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * x.size(0)
        acc_metric.update(out, y)

        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    train_loss /= len(ds["train"])
    train_acc = acc_metric.compute()

    # Validation
    model.eval()
    val_loss = 0.0
    acc_metric.reset()
    with torch.no_grad():
        pbar = tqdm(valid_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [valid]", leave=False)
        for x, y in pbar:
            x, y = x.to(device), y.to(device)
            out = model(x)
            loss = criterion(out, y)
            val_loss += loss.item() * x.size(0)
            acc_metric.update(out, y)

    val_loss /= len(ds["valid"])
    val_acc = acc_metric.compute()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss   : {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


                                                                                       

Epoch 1/10 | Train Loss   : 0.3137, Train Acc: 0.8857, Val Loss: 0.2975, Val Acc: 0.8885


                                                                                       

Epoch 2/10 | Train Loss   : 0.2814, Train Acc: 0.8932, Val Loss: 0.2584, Val Acc: 0.9014


                                                                                       

Epoch 3/10 | Train Loss   : 0.2725, Train Acc: 0.8955, Val Loss: 0.2764, Val Acc: 0.8914


                                                                                       

Epoch 4/10 | Train Loss   : 0.2699, Train Acc: 0.8961, Val Loss: 0.2654, Val Acc: 0.8962


                                                                                       

Epoch 5/10 | Train Loss   : 0.2681, Train Acc: 0.8970, Val Loss: 0.2605, Val Acc: 0.9001


                                                                                       

Epoch 6/10 | Train Loss   : 0.2672, Train Acc: 0.8972, Val Loss: 0.2994, Val Acc: 0.8906


                                                                                       

Epoch 7/10 | Train Loss   : 0.2664, Train Acc: 0.8975, Val Loss: 0.2678, Val Acc: 0.9005


                                                                                       

Epoch 8/10 | Train Loss   : 0.2668, Train Acc: 0.8971, Val Loss: 0.3047, Val Acc: 0.8892


                                                                                       

Epoch 9/10 | Train Loss   : 0.2666, Train Acc: 0.8972, Val Loss: 0.2540, Val Acc: 0.8986


                                                                                        

Epoch 10/10 | Train Loss   : 0.2664, Train Acc: 0.8975, Val Loss: 0.3019, Val Acc: 0.9047




In [26]:
model.eval()
acc_metric.reset()
test_loss = 0.0

with torch.no_grad():
    pbar = tqdm(test_loader, desc="Testing", leave=False)
    for x, y in pbar:
        x, y = x.to(device), y.to(device)
        out = model(x)
        loss = criterion(out, y)
        test_loss += loss.item() * x.size(0)
        acc_metric.update(out, y)

test_loss /= len(ds["test"])
test_acc = acc_metric.compute()

print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

                                                             

Test Loss: 0.2973 | Test Acc: 0.9063




In [27]:
torch.save(model.state_dict(), "../checkpoints/audio_cnn.pth")

In [29]:
import torchaudio
import torch
import torch.nn.functional as F

def predict_wav(path, model, labels=LABELS, sample_rate=SAMPLE_RATE, duration=0.5, device='cpu'):
    waveform, sr = torchaudio.load(path)  # [channels, T]

    # Convert to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    #
    # # Resample if needed
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # normalize
    # waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)

    waveform = waveform.squeeze(0)  # now [T]
    num_samples = int(sample_rate * duration)
    # Truncate or pad
    if waveform.shape[0] > num_samples:
        waveform = waveform[:num_samples]
    elif waveform.shape[0] < num_samples:
        waveform = F.pad(waveform, (0, num_samples - waveform.shape[0]))

    waveform = waveform.unsqueeze(0).to(device)

    # Predict
    model.eval()
    with torch.no_grad():
        out = model(waveform)
        probs = torch.softmax(out, dim=1).cpu().numpy()[0]
        pred_idx = out.argmax(dim=1).item()
        pred_label = labels.names[pred_idx]

    return pred_label, probs


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)
# predict_wav("/home/pierre/Downloads/B_S2_D1_092-bebop_000_.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/10.wav", model, device=device)
# print("Predicted label:", label)
# print("Probabilities:", probs)

# label, probs = predict_wav("/home/pierre/Downloads/audio.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/1.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/2.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/3.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_whitenoise/DREGON_clean_recordings_whitenoise/60_-15_1.2.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_whitenoise/DREGON_clean_recordings_whitenoise/75_-15_2.4.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_speech/DREGON_clean_recordings_speech/45_0_1.2__3.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/other/1.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/hibou_dataset/drone/2997-2997.wav", model, device=device)
# label, probs = predict_wav("/home/pierre/Documents/Projects/PST4/AI/data/raw/hibou_dataset/drone/2997-2997.wav", model, device=device)


recordings = [
    ("/home/pierre/Downloads/B_S2_D1_092-bebop_000_.wav", "drone"),
    ("/home/pierre/Downloads/B_S2_D1_067-bebop_000_.wav", "drone"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/1.wav", "drone"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/2.wav", "drone"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/drone/3.wav", "drone"),
    ("/home/pierre/Downloads/audio.wav", "drone"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_whitenoise/DREGON_clean_recordings_whitenoise/60_-15_1.2.wav", "other"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_whitenoise/DREGON_clean_recordings_whitenoise/75_-15_2.4.wav", "other"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/archive(1)/DREGON_clean_recordings_speech/DREGON_clean_recordings_speech/45_0_1.2__3.wav", "other"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/test/other/1.wav", "other"),
    ("/home/pierre/Documents/Projects/PST4/AI/data/raw/hibou_dataset/drone/2997-2997.wav", "drone"),
]

for path, label in recordings:
    pred_label, probs = predict_wav(path, model, device=device)
    print(f"BON={label == pred_label} - Expected Label: {label}, Predicted Label: {pred_label}, Probabilities: {probs}")



BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.2394755 0.7605245]
BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.23611061 0.7638893 ]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9468438  0.05315627]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9256596 0.0743404]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9361964  0.06380361]
BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.00782319 0.99217683]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9597323 0.0402677]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9580672  0.04193284]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9669583  0.03304169]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.95130193 0.04869804]
BON=False - Expected Label: drone

In [30]:
drone_dir = "/data/raw/test/drone"
import os
# List all .wav files in the directory
wav_files = [f for f in os.listdir(drone_dir) if f.endswith('.wav')]

# Predict for each file
for wav_file in wav_files:
    path = os.path.join(drone_dir, wav_file)
    label, probs = predict_wav(path, model, device=device)
    print(f"File: {wav_file} {label}")

File: 1_1.wav other
File: 1_2.wav other
File: 1_3.wav other
File: 0.wav other
File: 1.wav other
File: 2.wav other
File: 3.wav other
File: 4.wav other
File: 5.wav other
File: 6.wav other
File: 7.wav other
File: 8.wav other
File: 9.wav drone
File: 10.wav other
File: 11.wav other
File: 12.wav other
File: 13.wav other
File: 14.wav other
File: 15.wav other
File: 16.wav other
File: 17.wav other
File: 18.wav other
File: 19.wav other
File: 20.wav other
File: 21.wav other
File: 22.wav other
File: 23.wav other
File: 24.wav other
File: 25.wav other
File: 26.wav other
File: 27.wav other
File: 28.wav other
File: 29.wav other
File: 30.wav other
File: 31.wav other
File: 32.wav other
File: 33.wav other
File: 34.wav other
File: 35.wav other
File: 36.wav other
File: 37.wav drone
File: 38.wav drone
File: 39.wav drone
File: 40.wav other
File: 41.wav other
File: 42.wav drone
File: 43.wav other
File: 44.wav other
File: 45.wav other
File: 46.wav other
File: 47.wav other
File: 48.wav other
File: 49.wav other


In [31]:
model = AudioCNN()
model.load_state_dict(torch.load("../checkpoints/audio_cnn.pth"))

<All keys matched successfully>

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

AudioCNN(
  (net): Sequential(
    (0): Conv1d(1, 16, kernel_size=(9,), stride=(2,), padding=(4,))
    (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): PReLU(num_parameters=1)
  )
  (fc): Linear(in_features=16, out_features=2, bias=True)
)

In [34]:
for path, label in recordings:
    pred_label, probs = predict_wav(path, model, device=device)
    print(f"BON={label == pred_label} - Expected Label: {label}, Predicted Label: {pred_label}, Probabilities: {probs}")

BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.2394755 0.7605245]
BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.23611061 0.7638893 ]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9468438  0.05315627]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9256596 0.0743404]
BON=False - Expected Label: drone, Predicted Label: other, Probabilities: [0.9361964  0.06380361]
BON=True - Expected Label: drone, Predicted Label: drone, Probabilities: [0.00782319 0.99217683]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9597323 0.0402677]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9580672  0.04193284]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.9669583  0.03304169]
BON=True - Expected Label: other, Predicted Label: other, Probabilities: [0.95130193 0.04869804]
BON=False - Expected Label: drone