Données Multimédia - Projet non-alternant - Module AUDIO
==============
---

# 0.a Imports et connexion à Google Drive

In [13]:
import os
import json
import torch
import librosa

import numpy as np
import pandas as pd

import torch.nn as nn
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm import tqdm
from tensorflow import keras
from google.colab import drive
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
# Mount Google Drive under /content/drive so the corpus and output folders
# configured in this notebook are reachable; force_remount=True requests a
# fresh mount even if one is already present.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [64]:
# Corpus layout on the mounted Drive: every input folder hangs off corpus_dir.
corpus_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/"
csv_dir = corpus_dir + "csv/"
json_dir = corpus_dir + "json/"
audio_dir = corpus_dir + "train_val_videos/TrainValAudio/"

# Folder that receives the cached arrays and the saved models.
working_dir = "/content/drive/MyDrive/Projet non-alternant/Audio/CNN/"

# 0.b Fonctions de reproductibilité

In [16]:
def get_device():
    """Return 'cuda' when PyTorch reports a usable CUDA device, else 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

device = get_device()
device

'cpu'

In [17]:
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy,
            PyTorch (CPU and, when available, CUDA) and TensorFlow.

    Also switches cuDNN to its deterministic, non-benchmarking mode for
    PyTorch.
    """
    import random  # local import: keeps this cell self-contained

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # The CNN in this notebook is built and trained with Keras/TensorFlow, so
    # its weight initialisation, dropout and shuffling are only reproducible
    # if TensorFlow's global seed is set as well.
    tf.random.set_seed(seed)

same_seeds(42)

# 1. Test : classification audio par CNN sur mél-spectrogrammes

In [None]:
# Audio processing parameters
# -- waveform loading
SAMPLE_RATE = 22050  # sampling rate requested from librosa.load
DURATION = 5  # seconds
# -- mel-spectrogram extraction
N_FFT = 2048  # n_fft passed to librosa.feature.melspectrogram
HOP_LENGTH = 512  # hop_length passed to librosa.feature.melspectrogram
N_MELS = 128  # number of mel bands (n_mels)

In [None]:
def load_and_preprocess_audio(audio_id, duration=DURATION):
    """Load one audio clip and return its mel-spectrogram in decibels.

    Args:
        audio_id: File name without extension; the clip is read from
            ``<audio_dir>/<audio_id>.wav``.
        duration: Clip length in seconds. Shorter clips are zero-padded and
            longer ones truncated, so every result has the same frame count.

    Returns:
        A 2-D array of shape ``(N_MELS, n_frames)`` expressed in dB relative
        to the clip's maximum power, or ``None`` when the file could not be
        loaded or processed (the error is printed).
    """
    # Use the audio folder defined in the configuration cell. The previous
    # name, `chemin_audios`, is not defined anywhere in this notebook and
    # raised NameError on a fresh kernel.
    audio_path = os.path.join(audio_dir, f"{audio_id}.wav")

    try:
        # Load audio
        audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=duration)

        # Force a fixed number of samples so all spectrograms can be stacked
        # into a single array downstream.
        target_len = int(SAMPLE_RATE * duration)
        if len(audio) < target_len:
            audio = np.pad(audio, (0, target_len - len(audio)))
        else:
            audio = audio[:target_len]

        # Convert to mel-spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sr,
            n_mels=N_MELS,
            hop_length=HOP_LENGTH,
            n_fft=N_FFT
        )

        # Convert to dB scale
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        return mel_spec_db

    except Exception as e:
        # Deliberate best-effort: report the failing clip and let the caller
        # skip it instead of aborting a long preprocessing run.
        print(f"Error loading {audio_id}: {e}")
        return None

In [None]:
def prepare_dataset(df, label_encoder=None, fit_encoder=False):
    """Turn a labelled DataFrame into CNN-ready arrays.

    Args:
        df: DataFrame with a ``video_id`` column (audio file stem) and a
            ``label`` column.
        label_encoder: Already fitted ``LabelEncoder``; required unless
            ``fit_encoder`` is True.
        fit_encoder: When True, a new encoder is fitted on ``df['label']``
            and any ``label_encoder`` argument is ignored.

    Returns:
        ``(X, y, label_encoder)`` where ``X`` stacks the spectrograms with a
        trailing channel axis, ``y`` holds the encoded labels of the clips
        that loaded successfully, and ``label_encoder`` is the encoder used.

    Raises:
        ValueError: If ``fit_encoder`` is False and no encoder is supplied.
    """
    # Encode labels
    if fit_encoder:
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(df['label'])
    elif label_encoder is None:
        raise ValueError("label_encoder must be provided when fit_encoder is False")
    else:
        labels = label_encoder.transform(df['label'])

    # Load and process audio files
    spectrograms = []
    valid_labels = []

    print(f"Processing {len(df)} audio files...")
    # Pair ids and labels by position. Indexing `labels` with the DataFrame
    # index label (as iterrows yields) picks the wrong label or raises
    # IndexError as soon as the frame is filtered, shuffled or re-indexed.
    for audio_id, label in tqdm(zip(df['video_id'], labels), total=len(df)):
        spec = load_and_preprocess_audio(audio_id)
        if spec is not None:
            spectrograms.append(spec)
            valid_labels.append(label)

    # Convert to numpy arrays
    X = np.array(spectrograms)
    y = np.array(valid_labels)

    # Add channel dimension for CNN
    X = X[..., np.newaxis]

    return X, y, label_encoder

In [None]:
# File names of the label tables, relative to csv_dir.
train_csv = "train_audio_labels.csv"
val_csv = "val_audio_labels.csv"

# Read both splits the same way.
train_df, val_df = (pd.read_csv(csv_dir + name) for name in (train_csv, val_csv))

len(train_df), len(val_df)

In [None]:
# Keep only the first 800 training rows and the first 200 validation rows.
train_df, val_df = train_df[:800], val_df[:200]

len(train_df), len(val_df)

In [None]:
# Load datasets: the training split also fits the label encoder
print("Loading training data...")
X_train, y_train, label_encoder = prepare_dataset(df=train_df, fit_encoder=True)

In [None]:
# The validation split reuses the encoder fitted on the training labels.
print("\nLoading validation data...")
X_val, y_val, _ = prepare_dataset(df=val_df, label_encoder=label_encoder)

In [60]:
# Summarise array shapes and the number of encoded classes in one print.
print("\n".join([
    "\nDataset shapes:",
    f"X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"X_val: {X_val.shape}, y_val: {y_val.shape}",
    f"Number of classes: {len(label_encoder.classes_)}",
]))


Dataset shapes:
X_train: (800, 128, 431, 1), y_train: (800,)
X_val: (200, 128, 431, 1), y_val: (200,)
Number of classes: 20


In [65]:
# Save the extracted features so the slow audio preprocessing does not have
# to be repeated on the next session.
# np.save does not create missing folders, so make sure the target exists.
os.makedirs(working_dir, exist_ok=True)

np.save(working_dir + "X_train.npy", X_train)
np.save(working_dir + "y_train.npy", y_train)
np.save(working_dir + "X_val.npy", X_val)
np.save(working_dir + "y_val.npy", y_val)

# Later, load them again with e.g.:
# X_train = np.load(working_dir + "X_train.npy")

In [67]:
# Build CNN model
def build_cnn_model(input_shape, num_classes):
    """Build the CNN used for audio classification.

    Args:
        input_shape: Shape of one sample without the batch axis, e.g.
            ``X_train.shape[1:]``.
        num_classes: Size of the final softmax layer.

    Returns:
        An uncompiled ``keras.Sequential`` model: three convolutional blocks
        with 32, 64 and 128 filters, followed by two dense layers and a
        softmax output.
    """
    def conv_block(filters):
        """Two same-padded 3x3 conv + batch-norm pairs, 2x2 max-pool, dropout."""
        return [
            layers.Conv2D(filters, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.Conv2D(filters, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ]

    model = keras.Sequential([
        # Declare the input explicitly: passing `input_shape=` to the first
        # layer of a Sequential model is deprecated in Keras 3.
        layers.Input(shape=input_shape),

        # Convolutional feature extractor
        *conv_block(32),
        *conv_block(64),
        *conv_block(128),

        # Dense layers
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    return model

In [69]:
# Create model: one output unit per encoded class, input sized from the data
num_classes = len(label_encoder.classes_)
input_shape = X_train.shape[1:]

model = build_cnn_model(input_shape=input_shape, num_classes=num_classes)

In [70]:
# Compile model
# The sparse loss is used because the targets are integer class ids from the
# label encoder rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
)

model.summary()

In [71]:
# Callbacks
callbacks = [
    # Stop once val_loss has not improved for 10 epochs and roll back to the
    # best weights seen.
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    ),
    # Halve the learning rate after 5 epochs without val_loss improvement.
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7
    ),
    # Write the checkpoint into working_dir, like every other artefact of
    # this notebook; a bare file name would land in the current working
    # directory instead of the project folder on Drive.
    keras.callbacks.ModelCheckpoint(
        os.path.join(working_dir, 'best_audio_model.h5'),
        monitor='val_accuracy',
        save_best_only=True
    )
]

In [None]:
# Train model (the callbacks may end training before the last epoch)
print("\nTraining model...")
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)


Training model...
Epoch 1/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - accuracy: 0.0633 - loss: 4.0843 



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m464s[0m 18s/step - accuracy: 0.0633 - loss: 4.0823 - val_accuracy: 0.0150 - val_loss: 42.1329 - learning_rate: 0.0010
Epoch 2/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18s/step - accuracy: 0.0868 - loss: 3.7472 



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m514s[0m 19s/step - accuracy: 0.0871 - loss: 3.7492 - val_accuracy: 0.0400 - val_loss: 95.6025 - learning_rate: 0.0010
Epoch 3/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - accuracy: 0.0742 - loss: 3.7424 



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 18s/step - accuracy: 0.0742 - loss: 3.7425 - val_accuracy: 0.0450 - val_loss: 11.8359 - learning_rate: 0.0010
Epoch 4/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 17s/step - accuracy: 0.0867 - loss: 3.5595 - val_accuracy: 0.0150 - val_loss: 13.2528 - learning_rate: 0.0010
Epoch 5/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16s/step - accuracy: 0.1357 - loss: 3.3696 



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 17s/step - accuracy: 0.1357 - loss: 3.3674 - val_accuracy: 0.0650 - val_loss: 3.2929 - learning_rate: 0.0010
Epoch 6/50
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 17s/step - accuracy: 0.1340 - loss: 3.3799 - val_accuracy: 0.0300 - val_loss: 13.2851 - learning_rate: 0.0010
Epoch 7/50
[1m24/25[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m15s[0m 16s/step - accuracy: 0.1190 - loss: 3.3407

In [None]:
# Evaluate model on the validation split; evaluate() yields the loss first,
# then the compiled accuracy metric.
print("\nEvaluating model...")
val_loss, val_accuracy = model.evaluate(x=X_val, y=y_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")

In [None]:
# Predictions: take the most probable class for every validation clip
y_pred = model.predict(X_val)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
# `labels` lists every encoder class explicitly. Without it,
# classification_report raises ValueError whenever the validation subset and
# the predictions do not cover all classes, because the number of observed
# labels no longer matches `target_names`.
# zero_division=0 reports 0 for classes that are never predicted or never
# present, instead of emitting warnings.
print("\nClassification Report:")
print(classification_report(
    y_val,
    y_pred_classes,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=[str(name) for name in label_encoder.classes_],
    zero_division=0
))

In [None]:
# Save model and label encoder
# Only the encoder's class array is stored; it gives the class name for each
# predicted index.
model.save(f"{working_dir}audio_classifier_model.h5")
np.save(f"{working_dir}label_encoder_classes.npy", label_encoder.classes_)
print("\nModel and label encoder saved!")

In [None]:
# Function to predict on new audio
def predict_audio(audio_id, model, label_encoder):
    """Classify a single audio clip.

    Returns a ``(class_name, confidence)`` pair, where confidence is the
    model's output probability for the chosen class, or ``None`` when the
    clip's spectrogram could not be produced.
    """
    mel_db = load_and_preprocess_audio(audio_id)
    if mel_db is not None:
        # Add the batch and channel axes expected by the CNN.
        batch = mel_db[np.newaxis, ..., np.newaxis]
        probabilities = model.predict(batch, verbose=0)
        best_index = np.argmax(probabilities)
        return label_encoder.classes_[best_index], probabilities[0][best_index]

    return None

Using strategy: global_pooling
Loading training data...
Processing 5728 audio files...


 27%|██▋       | 1549/5728 [24:53<1:07:10,  1.04it/s]


KeyboardInterrupt: 

In [None]:
# Example prediction
print("\nExample prediction:")
sample_id = "video0"  # Change to an actual audio_id from your data
result = predict_audio(sample_id, model, label_encoder)
print(f"Audio: {sample_id}")
# predict_audio returns None when the clip cannot be loaded; unpacking that
# directly would raise TypeError, so handle it explicitly.
if result is None:
    print("Prediction unavailable: the audio file could not be loaded.")
else:
    pred_label, confidence = result
    print(f"Predicted Label: {pred_label}")
    print(f"Confidence: {confidence:.4f}")