In [26]:
# ===========================
# 1️⃣ Imports et setup
# ===========================
import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [28]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

In [30]:
dataset = load_dataset("Usernameeeeee/drone_test")  # dataset test drone

In [31]:
# Fonction de prétraitement
def preprocess_audio(example):
    # "example['audio']" est un Audio ou AudioDecoder
    waveform = example["audio"]["array"]  # on récupère le numpy array directement
    sample_rate = example["audio"]["sampling_rate"]  # la fréquence d'échantillonnage

    # Resample si nécessaire
    if sample_rate != processor.feature_extractor.sampling_rate:
        waveform = torchaudio.transforms.Resample(
            orig_freq=sample_rate,
            new_freq=processor.feature_extractor.sampling_rate
        )(torch.tensor(waveform))

    # Appliquer Wav2Vec2Processor
    inputs = processor(waveform.squeeze(),
                       sampling_rate=processor.feature_extractor.sampling_rate,
                       return_tensors="pt", padding=True)
    return {"input_values": inputs.input_values.squeeze(0)}

# Appliquer le prétraitement
dataset = dataset.map(preprocess_audio)
dataset = dataset.remove_columns(["audio"])


In [33]:
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=2,
    problem_type="single_label_classification"
).to(DEVICE)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
from torch.utils.data import DataLoader

# Convertir les données en DataLoader

ds = dataset["test"].train_test_split(test_size=0.2, seed=SEED, shuffle=False)

train_dataset = ds["train"].with_format(type="torch", columns=["input_values", "label"])
test_dataset = ds["test"].with_format(type="torch", columns=["input_values", "label"])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)


In [59]:
optimizer = AdamW(model.parameters(), lr=3e-5)

In [60]:
def train_epoch(model, dataloader, optimizer):
    model.train()
    losses = []
    for batch in tqdm(dataloader, desc="Train"):
        inputs = batch["input_values"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)

In [61]:
def eval_model(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Eval"):
            inputs = batch["input_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            outputs = model(inputs)
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    return acc

In [62]:
EPOCHS = 5
best_val_acc = 0

for epoch in range(EPOCHS):
    train_loss = train_epoch(model, train_dataloader, optimizer)
    val_acc = eval_model(model, test_dataloader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Acc: {val_acc*100:.2f}%")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_wav2vec2_drone.pt")
        print("✅ Saved new best model!")


Train: 100%|██████████| 45/45 [00:01<00:00, 24.29it/s]
Eval: 100%|██████████| 12/12 [00:00<00:00, 89.32it/s]


Epoch 1/5 | Train Loss: 0.6943 | Val Acc: 0.00%


Train: 100%|██████████| 45/45 [00:01<00:00, 24.83it/s]
Eval: 100%|██████████| 12/12 [00:00<00:00, 89.75it/s]


Epoch 2/5 | Train Loss: 0.6917 | Val Acc: 0.00%


Train: 100%|██████████| 45/45 [00:01<00:00, 24.68it/s]
Eval: 100%|██████████| 12/12 [00:00<00:00, 87.47it/s]


Epoch 3/5 | Train Loss: 0.6915 | Val Acc: 0.00%


Train: 100%|██████████| 45/45 [00:01<00:00, 24.68it/s]
Eval: 100%|██████████| 12/12 [00:00<00:00, 89.38it/s]


Epoch 4/5 | Train Loss: 0.6915 | Val Acc: 0.00%


Train: 100%|██████████| 45/45 [00:01<00:00, 24.33it/s]
Eval: 100%|██████████| 12/12 [00:00<00:00, 89.52it/s]

Epoch 5/5 | Train Loss: 0.6914 | Val Acc: 0.00%





In [None]:
test_acc = eval_model(model, test_dataloader)
print(f"✅ Test Accuracy: {test_acc*100:.2f}%")

In [57]:
import pandas as pd
from IPython.display import display

def show_predictions(model, dataloader, label_names=None, max_examples=20):
    """
    Display a table of true labels vs predicted labels for a batch from the dataloader.

    Args:
        model: trained Wav2Vec2 model
        dataloader: DataLoader with 'input_values' and 'labels'
        label_names: list of label names (e.g., ["other", "drone"])
        max_examples: max number of examples to display
    """
    model.eval()
    results = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["input_values"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            outputs = model(inputs)
            preds = torch.argmax(outputs.logits, dim=-1)

            for t, p in zip(labels.cpu().numpy(), preds.cpu().numpy()):
                true_label = label_names[t] if label_names else t
                pred_label = label_names[p] if label_names else p
                results.append({"True": true_label, "Predicted": pred_label})

            if len(results) >= max_examples:
                break

    df = pd.DataFrame(results[:max_examples])
    display(df)

label_names = ["other", "drone"]
show_predictions(model, train_dataloader, label_names=label_names, max_examples=20)



Unnamed: 0,True,Predicted
0,other,drone
1,drone,drone
2,other,drone
3,other,drone
4,other,drone
5,other,drone
6,drone,drone
7,drone,drone
8,other,drone
9,other,drone
