In [1]:
import librosa
import numpy as np
import tensorflow as tf
import cv2
import moviepy.editor as mp
import os
import json


In [2]:
# Audio preprocessing
# max_length: number of MFCC frames returned per clip (longer sequences are
# truncated, shorter ones are zero-padded). The original note states that this
# corresponds to 2 seconds of audio.
# NOTE(review): with librosa's default sample rate and hop length, 2 s of audio
# presumably yields far fewer than 173 frames — confirm against the training pipeline.
def preprocess_audio(video, n_mfcc=40, max_length=173):
    """Extract a fixed-length MFCC matrix from the first 2 seconds of a video's audio.

    Args:
        video: a clip object exposing an ``audio`` attribute whose value provides
            ``duration``, ``set_duration``, ``subclip`` and ``write_audiofile``
            (e.g. a MoviePy ``VideoFileClip``).
        n_mfcc: number of MFCC coefficients computed per frame.
        max_length: number of time steps in the returned matrix.

    Returns:
        A NumPy array of shape ``(max_length, n_mfcc)`` laid out as
        (timesteps, features). No batch axis is added here; the caller is
        expected to add it before feeding the model.

    Raises:
        ValueError: if the video has no audio track.
    """
    import tempfile

    audio = video.audio
    if audio is None:
        raise ValueError("The video has no audio track; cannot extract MFCCs")

    if audio.duration < 2:
        # Too short: stretch the declared duration to 2 s (intended to fill the
        # remainder with silence — exact behavior depends on MoviePy).
        audio = audio.set_duration(2)
    else:
        # Long enough: keep only the first 2 seconds.
        audio = audio.subclip(0, 2)

    # librosa needs a file on disk, so round-trip through a temporary .wav.
    # A unique temp file avoids clashes between concurrent runs, and the
    # try/finally guarantees it is removed even when writing or loading fails.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # release our handle so the writer can reopen the path (required on Windows)
    try:
        audio.write_audiofile(wav_path, logger=None)
        y, sr = librosa.load(wav_path)
    finally:
        if os.path.exists(wav_path):
            os.remove(wav_path)

    # Compute the MFCCs and transpose to (timesteps, features)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).T

    # Truncate or zero-pad along the time axis so every clip has max_length steps
    n_frames = mfccs.shape[0]
    if n_frames > max_length:
        return mfccs[:max_length, :]
    return np.pad(mfccs, ((0, max_length - n_frames), (0, 0)), mode='constant')

In [3]:
# Frame preprocessing
def preprocess_frame(video, target_size=(224, 224)):
    """Grab a single frame from the video and prepare it for the ResNet image input.

    Args:
        video: a clip object exposing ``duration`` and ``get_frame(t)``
            (e.g. a MoviePy ``VideoFileClip``).
        target_size: size passed to ``cv2.resize`` (OpenCV interprets it as
            (width, height)).

    Returns:
        The resized frame after ``tf.keras.applications.resnet.preprocess_input``.
        No batch axis is added here.
    """
    # Clips shorter than 1 second have no frame at t=1 s, so use the first frame instead
    timestamp = 0 if video.duration < 1 else 1
    frame = video.get_frame(timestamp)

    # No channel reordering is done here. The previous code applied
    # cv2.COLOR_BGR2RGB twice in a row; the two swaps cancelled each other out,
    # so the frame already reached the next step in the channel order returned
    # by get_frame (RGB for MoviePy, per its documentation). Dropping both calls
    # keeps the output identical while removing the wasted work.
    frame = cv2.resize(frame, target_size)

    # ResNet-specific input normalization
    return tf.keras.applications.resnet.preprocess_input(frame)

In [4]:
# Load the trained model (architecture + weights) from disk.
# The path is built with os.path.join instead of a literal containing backslashes:
# sequences such as "\C" and "\i" are invalid string escapes (a SyntaxWarning on
# recent Python versions) and a hard-coded backslash only works on Windows.
model_path = os.path.join('weights and id', 'CompleteModel.keras')
model = tf.keras.models.load_model(model_path)
model.summary()

# Load the dictionary that maps a predicted class index (stored as a string key)
# to the corresponding user name
dict_path = os.path.join('weights and id', 'id_to_user.json')
with open(dict_path, encoding='utf-8') as f:
    id_to_user = json.load(f)
print("id_to_user:", id_to_user)

id_to_user: {'0': 'Jean_Bon', '1': 'Yann_Zurbrugg'}


In [5]:
# Load the video.
# os.path.join avoids the invalid "\J" / "\W" escape sequences of the previous
# backslash literal and keeps the path portable across operating systems.
video_path = os.path.join('data', 'Jean_Bon', 'WhatsApp Video 2024-12-02 at 11.46.35.mp4')
video = mp.VideoFileClip(video_path)

# Extract both model inputs, then always release the clip's underlying
# readers, even if preprocessing fails.
try:
    # MFCC matrix from the audio track
    mfccs = preprocess_audio(video)
    # Single preprocessed frame from the video track
    frame = preprocess_frame(video)
finally:
    video.close()

# Make the prediction: add a leading batch axis of size 1 to each input
mfccs = mfccs[np.newaxis, ...]
frame = frame[np.newaxis, ...]
prediction = model.predict([mfccs, frame])
print(prediction)
# Map the index of the highest-scoring output to the user name
# (presumably the outputs are per-user class scores — confirm against the model definition)
user = id_to_user[str(np.argmax(prediction))]
print("User:", user)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[[0.5 0.5]]
User: Jean_Bon
