Données Multimedia - Projet non-alternant - Module AUDIO
==============
---

# 0.a Imports et connection google drive

In [1]:
import os
import json
import torch
import librosa
import librosa.display

import numpy as np
import pandas as pd
import torch.nn as nn
import tensorflow as tf
import matplotlib.pyplot as plt

from tqdm import tqdm
from tensorflow import keras
from google.colab import drive
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.utils import to_categorical, Sequence
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [2]:
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [3]:
corpus_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/"
csv_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/csv/"
json_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/json/"
working_dir = "/content/drive/MyDrive/Projet non-alternant/Audio/CNN/"
audio_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/train_val_videos/TrainValAudio/"

# 0.b Fonctions de reproductibilité

In [4]:
def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'

device = get_device()
device

'cuda'

In [5]:
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

same_seeds(42)

# test

In [6]:
working_dir = "/content/drive/MyDrive/Projet non-alternant/Audio/CNN/"
audio_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/train_val_videos/TrainValAudio/"
csv_dir = "/content/drive/MyDrive/Projet non-alternant/Corpus/csv/"

train_csv = os.path.join(csv_dir, "train_audio_labels.csv")
val_csv = os.path.join(csv_dir, "val_audio_labels.csv")

In [None]:
SAMPLE_RATE = 22050
DURATION = 10  # seconds
N_MELS = 64
HOP_LENGTH = 512
N_FFT = 1024
INPUT_SHAPE = (N_MELS, 128, 1)  # (freq_bins, time_frames, channels)
BATCH_SIZE = 32

In [None]:
def load_audio(file_path, duration=DURATION, sr=SAMPLE_RATE):
    y, sr = librosa.load(file_path, sr=sr, duration=duration)
    if len(y) < sr * duration:
        y = np.pad(y, (0, sr * duration - len(y)))
    return y, sr

def audio_to_mel(y, sr):
    mel = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    return mel_db

def preprocess_audio(file_path):
    y, sr = load_audio(file_path)
    mel_db = audio_to_mel(y, sr)
    # Ensure consistent shape (N_MELS x 128)
    if mel_db.shape[1] < 128:
        mel_db = np.pad(mel_db, ((0, 0), (0, 128 - mel_db.shape[1])))
    else:
        mel_db = mel_db[:, :128]
    mel_db = np.expand_dims(mel_db, axis=-1)
    return mel_db

In [None]:
class AudioDataGenerator(Sequence):
    def super().__init__(self, df, audio_dir, labels, batch_size=BATCH_SIZE, shuffle=True):
        self.df = df
        self.audio_dir = audio_dir
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.df))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, idx):
        batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_df = self.df.iloc[batch_idx]

        X, y = [], []
        for _, row in batch_df.iterrows():
            file_path = os.path.join(self.audio_dir, f"{row['video_id']}.wav")
            mel_db = preprocess_audio(file_path)
            X.append(mel_db)
            y.append(self.labels.transform([row['label']])[0])

        X = np.array(X)
        y = to_categorical(y, num_classes=len(self.labels.classes_))
        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)

label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])

train_gen = AudioDataGenerator(train_df, audio_dir, label_encoder)
val_gen = AudioDataGenerator(val_df, audio_dir, label_encoder, shuffle=False)

In [None]:
def build_cnn(input_shape=INPUT_SHAPE, num_classes=len(label_encoder.classes_)):
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D((2, 2)),
        BatchNormalization(),

        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        BatchNormalization(),

        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        BatchNormalization(),

        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
model = build_cnn()
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
checkpoint_path = os.path.join(working_dir, "best_model.keras")

callbacks = [
    ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, verbose=1),
    EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
]

history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=30,
    callbacks=callbacks
)

  self._warn_if_super_not_called()


Epoch 1/30
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33s/step - accuracy: 0.0800 - loss: 3.9611 

  self._warn_if_super_not_called()



Epoch 1: val_accuracy improved from -inf to 0.12277, saving model to /content/drive/MyDrive/Projet non-alternant/Audio/CNN/best_model.keras
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6342s[0m 35s/step - accuracy: 0.0800 - loss: 3.9585 - val_accuracy: 0.1228 - val_loss: 3.0516
Epoch 2/30
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.0988 - loss: 3.0472
Epoch 2: val_accuracy improved from 0.12277 to 0.15179, saving model to /content/drive/MyDrive/Projet non-alternant/Audio/CNN/best_model.keras
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m875s[0m 5s/step - accuracy: 0.0988 - loss: 3.0471 - val_accuracy: 0.1518 - val_loss: 3.0698
Epoch 3/30
[1m173/179[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m25s[0m 4s/step - accuracy: 0.1216 - loss: 2.9349

In [None]:
model.save(os.path.join(working_dir, "final_model.keras"))
np.save(os.path.join(working_dir, "label_encoder.npy"), label_encoder.classes_)

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(history.history['accuracy'], label='Train Acc')
plt.plot(history.history['val_accuracy'], label='Val Acc')
plt.title('Model Accuracy')
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(working_dir, "training_accuracy.png"))
plt.show()