**Wav2Vec2 → MLP Classifier**

In [None]:
# 📌 IMPORTS
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import torchaudio
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from google.colab import drive
from tqdm import tqdm

# 📂 MOUNT GOOGLE DRIVE
# Colab-only: exposes the user's Drive under /content/drive.
drive.mount('/content/drive')
# Root folder; load_wav2vec_data() expects one sub-folder per class label in it.
DATA_PATH = "/content/drive/MyDrive"

# 🎧 AUDIO PARAMETERS
SAMPLE_RATE = 16000  # target sampling rate in Hz; clips are resampled to it
DURATION = 3  # seconds
MAX_LENGTH = SAMPLE_RATE * DURATION  # number of samples kept per clip

# 🧠 LOAD THE Wav2Vec2 MODEL
# The processor prepares raw waveforms; the model produces the hidden states
# that are later mean-pooled into one embedding per clip.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model_wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
# Evaluation mode: the model is only used as a frozen feature extractor here.
model_wav2vec.eval()

# 🎵 FONCTION D'EXTRACTION DE FEATURES
def extract_wav2vec_features(file_path, max_length=MAX_LENGTH):
    """Return a mean-pooled Wav2Vec2 embedding for one audio file.

    The clip is mixed down to mono, resampled to ``SAMPLE_RATE``, truncated or
    zero-padded to exactly ``max_length`` samples, passed through
    ``model_wav2vec`` and averaged over the time axis.

    Args:
        file_path: Path of the audio file, read with ``torchaudio.load``.
        max_length: Number of samples kept per clip.

    Returns:
        A 1-D NumPy array holding the pooled embedding ((768,) for the base
        model), or ``None`` when the file could not be processed. In that
        case the error is printed and the caller is expected to skip the file.
    """
    try:
        waveform, sr = torchaudio.load(file_path)

        # Mono: average the channels when the file is multi-channel.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample only when the file is not already at the target rate.
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            waveform = resampler(waveform)

        # Truncate, then right-pad with zeros up to the fixed length.
        waveform = waveform[:, :max_length]
        missing = max_length - waveform.shape[1]
        if missing > 0:
            # Match the waveform dtype so the concatenation never mixes dtypes.
            pad = torch.zeros((1, missing), dtype=waveform.dtype)
            waveform = torch.cat((waveform, pad), dim=1)

        # Prepare the input for Wav2Vec2.
        inputs = processor(waveform.squeeze(0), sampling_rate=SAMPLE_RATE, return_tensors="pt").input_values

        # Follow whatever device the model currently lives on, so the function
        # keeps working if the model is moved to a GPU.
        device = next(model_wav2vec.parameters()).device

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model_wav2vec(inputs.to(device))
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze()

        # .numpy() only works on CPU tensors, hence the explicit .cpu().
        return pooled.cpu().numpy()  # (768,)
    except Exception as e:
        # Deliberate best-effort: report the unreadable file and let the caller skip it.
        print(f"❌ Erreur avec {file_path} : {e}")
        return None

# 📅 CHARGEMENT DES DONNÉES
def load_wav2vec_data(base_path=DATA_PATH):
    """Build the feature matrix and label vector from the class folders.

    Each ``.wav`` file found in ``<base_path>/non_belle_voix`` (label 0) and
    ``<base_path>/belle_voix`` (label 1) is passed to
    ``extract_wav2vec_features``. Files for which extraction returns ``None``
    are skipped.

    Args:
        base_path: Directory containing one sub-folder per class.

    Returns:
        ``(X, y)`` as NumPy arrays: one embedding per row in ``X`` and the
        matching integer labels in ``y``.
    """
    X, y = [], []
    label_map = {'non_belle_voix': 0, 'belle_voix': 1}

    for label_name, label in label_map.items():
        folder = os.path.join(base_path, label_name)
        print(f"🔍 Lecture de : {folder}")

        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # sample order, and therefore any seeded split done later,
        # reproducible. The extension test is case-insensitive so ".WAV"
        # files are not silently dropped.
        wav_names = sorted(
            fname for fname in os.listdir(folder)
            if fname.lower().endswith(".wav")
        )

        for fname in tqdm(wav_names, desc=f"Traitement {label_name}"):
            feat = extract_wav2vec_features(os.path.join(folder, fname))
            if feat is not None:
                X.append(feat)
                y.append(label)

    return np.array(X), np.array(y)

# ⚖️ ÉQUILIBRAGE AVEC OVERSAMPLING
def balance(X, y):
    """Equalise class counts by randomly duplicating minority-class samples.

    Prints the label distribution before and after resampling.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels aligned with ``X``.

    Returns:
        ``(X_resampled, y_resampled)`` as NumPy arrays.
    """
    sampler = RandomOverSampler(random_state=42)

    print(f"Avant équilibrage : {Counter(y)}")
    resampled_X, resampled_y = sampler.fit_resample(X, y)
    print(f"Après équilibrage : {Counter(resampled_y)}")

    return np.array(resampled_X), np.array(resampled_y)

# 🧼 PRÉTRAITEMENT ET SPLIT
def preprocess(X, y):
    """Make a stratified 80/20 train/test split and compute class weights.

    Args:
        X: Feature matrix, one sample per row.
        y: Labels aligned with ``X``.

    Returns:
        ``(X_train, X_test, y_train, y_test, class_weights)`` where
        ``class_weights`` is a dict keyed by the position of each class in
        ``np.unique(y_train)`` and holding its 'balanced' weight.
    """
    split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Weights come from the training labels only.
    train_classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(
        'balanced', classes=train_classes, y=y_train)
    weight_by_index = {index: weight for index, weight in enumerate(weights)}

    return X_train, X_test, y_train, y_test, weight_by_index

# 🏗️ MLP CLASSIFIER
def build_classifier(input_dim):
    """Build and compile the binary MLP that sits on top of the embeddings.

    Architecture: Input -> Dense(256, relu) -> Dropout(0.3)
    -> Dense(128, relu) -> Dropout(0.3) -> Dense(1, sigmoid).

    Args:
        input_dim: Size of one input feature vector.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    stack = [layers.Input(shape=(input_dim,))]

    # Two hidden stages, each a ReLU dense layer followed by dropout.
    for units in (256, 128):
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.Dropout(0.3))

    # Single sigmoid unit for the binary decision.
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = models.Sequential(stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 🚀 FULL PIPELINE
X, y = load_wav2vec_data()

# Hold out the test set BEFORE any resampling. Oversampling the whole dataset
# first would put copies of the same minority clips in both the training and
# the test set, which leaks information and inflates the reported score.
X_train, X_test, y_train, y_test, class_weights = preprocess(X, y)

# Carve an explicit validation fold out of the original (non-duplicated)
# training samples. `validation_split` is not used because it takes the LAST
# rows of the array, which is exactly where the oversampler appends its
# duplicated samples.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Oversample only the data the model is fitted on.
X_fit, y_fit = balance(X_fit, y_fit)

model = build_classifier(X_fit.shape[1])
early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

# NOTE: `class_weights` (computed on the imbalanced training labels) is kept
# available as an alternative to oversampling but is not passed to fit():
# the fitting fold is already balanced, so applying both would over-correct.
history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32,
    callbacks=[early_stop, lr_scheduler],
    shuffle=True
)

# 📊 EVALUATION
# The test set keeps the natural class distribution and contains no
# resampled duplicates.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\n🎯 Accuracy sur test : {test_acc:.4f}")


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 Lecture de : /content/drive/MyDrive/non_belle_voix


Traitement non_belle_voix: 100%|██████████| 7005/7005 [2:25:29<00:00,  1.25s/it]


🔍 Lecture de : /content/drive/MyDrive/belle_voix


Traitement belle_voix: 100%|██████████| 3612/3612 [1:13:05<00:00,  1.21s/it]


Avant équilibrage : Counter({np.int64(0): 7005, np.int64(1): 3612})
Après équilibrage : Counter({np.int64(0): 7005, np.int64(1): 7005})
Epoch 1/30
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.6919 - loss: 0.5752 - val_accuracy: 0.7453 - val_loss: 0.5113 - learning_rate: 0.0010
Epoch 2/30
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7331 - loss: 0.5139 - val_accuracy: 0.7489 - val_loss: 0.4947 - learning_rate: 0.0010
Epoch 3/30
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7221 - loss: 0.5229 - val_accuracy: 0.7520 - val_loss: 0.4906 - learning_rate: 0.0010
Epoch 4/30
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.7289 - loss: 0.5087 - val_accuracy: 0.7547 - val_loss: 0.4838 - learning_rate: 0.0010
Epoch 5/30
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7361 - loss: 0.5043