In [None]:
import os

!apt-get install aria2 -y

# Download and unpack one "Video_Speech_Actor_XX.zip" archive per actor id.
# Actor ids are the zero-padded strings "01" .. "24".
actor_ids = [f"{i:02d}" for i in range(1, 25)]

base_url = "https://zenodo.org/records/1188976/files/Video_Speech_Actor_"

save_dir = "ravdess_video_speech"
os.makedirs(save_dir, exist_ok=True)

for actor_id in actor_ids:
    file_url = f"{base_url}{actor_id}.zip?download=1"
    zip_path = f"{save_dir}/Video_Speech_Actor_{actor_id}.zip"

    print(f"Downloading: {file_url}")
    # os.system returns a non-zero status when the command fails; stop right
    # away instead of trying to unzip a missing or partial archive.
    download_status = os.system(f"aria2c -x 16 -s 16 '{file_url}' -o {zip_path}")
    if download_status != 0:
        raise RuntimeError(
            f"Download failed for {file_url} (exit status {download_status})"
        )

    print(f"Extracting: Video_Speech_Actor_{actor_id}.zip")
    # -o overwrites already-extracted files without prompting, so re-running
    # this cell cannot stall on an interactive "replace?" question.
    unzip_status = os.system(f"unzip -q -o {zip_path} -d {save_dir}")
    if unzip_status != 0:
        raise RuntimeError(
            f"Extraction failed for {zip_path} (exit status {unzip_status})"
        )

# Only reached when every download and extraction reported success.
print("All video speech files downloaded and extracted successfully!")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaria2-0 libc-ares2
The following NEW packages will be installed:
  aria2 libaria2-0 libc-ares2
0 upgraded, 3 newly installed, 0 to remove and 30 not upgraded.
Need to get 1,513 kB of archives.
After this operation, 5,441 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libc-ares2 amd64 1.18.1-1ubuntu0.22.04.3 [45.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libaria2-0 amd64 1.36.0-1 [1,086 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 aria2 amd64 1.36.0-1 [381 kB]
Fetched 1,513 kB in 2s (700 kB/s)
Selecting previously unselected package libc-ares2:amd64.
(Reading database ... 126315 files and directories currently installed.)
Preparing to unpack .../libc-ares2_1.18.1-1ubuntu0.22.04.3_amd64.deb ...
Unpacking libc-ares2:amd64 (1.18.1-1ubuntu

In [None]:
import os
import subprocess

import cv2
import librosa
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image

# Load the two saved Keras models (paths are relative to the working
# directory). `image_model` is later applied to video frames and
# `audio_model` to the audio feature tensor.
# NOTE: the spelling 'Classifiaction' must match the file name on disk.
image_model = load_model('Image-Classifiaction.keras')
audio_model = load_model('my_model.h5')

# Ground-truth emotion names keyed by the two-digit code ("01" .. "08")
# that is read from the third dash-separated field of a video file name.
emotion_map = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            "Neutral",
            "Calm",
            "Happy",
            "Sad",
            "Angry",
            "Fearful",
            "Disgust",
            "Surprised",
        ),
        start=1,
    )
}

# Encoder used to turn a predicted class index back into an emotion name.
# NOTE(review): this assumes both models were trained with labels encoded
# from this same set of class names, so their output indices line up with
# the encoder's class order -- confirm against the training notebooks.
label_encoder = LabelEncoder().fit(
    ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]
)

def extract_features(y, sr):
    """Build a stacked spectral feature matrix for one audio signal.

    Computes 40 MFCCs, chroma (STFT-based) and spectral contrast with
    librosa and concatenates the three matrices along axis 0.

    Args:
        y: Audio samples, passed to librosa as ``y``.
        sr: Sampling rate of ``y``.

    Returns:
        A single array holding the three feature matrices stacked row-wise.
    """
    feature_blocks = (
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40),
        librosa.feature.chroma_stft(y=y, sr=sr),
        librosa.feature.spectral_contrast(y=y, sr=sr),
    )
    return np.concatenate(feature_blocks, axis=0)

def extract_audio_features(video_path, max_length=254):
    """Extract a video's audio track and convert it to an audio-model input.

    The audio is written by ffmpeg as a mono, 48 kHz ``.wav`` file next to
    the video (same path, extension swapped), loaded with librosa at
    22050 Hz, converted with ``extract_features`` and then padded with zeros
    or truncated to exactly ``max_length`` columns.

    Args:
        video_path: Path of the video file to read.
        max_length: Number of feature columns to keep. Defaults to 254.
            NOTE(review): presumably the input length the audio model was
            trained with -- confirm before changing.

    Returns:
        Array of shape ``(1, max_length, n_features, 1)`` where
        ``n_features`` is the number of rows returned by
        ``extract_features``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    """
    # splitext swaps only the final extension. str.replace would rewrite
    # every ".mp4" in the path and leave a non-.mp4 path unchanged, making
    # the output file identical to the input.
    audio_file = os.path.splitext(video_path)[0] + ".wav"

    # Argument list instead of a shell string: paths with spaces or quotes
    # are passed through untouched. -y overwrites a .wav left by an earlier
    # run instead of prompting; check=True surfaces ffmpeg failures here
    # rather than as a confusing load error (or a stale file) below.
    subprocess.run(
        [
            "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
            "-i", video_path,
            "-ac", "1", "-ar", "48000",
            audio_file,
        ],
        check=True,
    )

    y, sr = librosa.load(audio_file, sr=22050)
    features = extract_features(y, sr)

    # Force a fixed number of columns: zero-pad short clips, cut long ones.
    n_columns = features.shape[1]
    if n_columns < max_length:
        features = np.pad(features, ((0, 0), (0, max_length - n_columns)), mode='constant')
    else:
        features = features[:, :max_length]

    # (n_features, max_length) -> (max_length, n_features, 1) -> add batch axis.
    features = np.expand_dims(np.transpose(features), axis=-1)
    features = np.expand_dims(features, axis=0)
    return features

def extract_frames(video_path, frame_skip=5, img_size=(64, 64)):
    """Sample every ``frame_skip``-th frame of a video as a scaled image.

    Frames 0, ``frame_skip``, ``2 * frame_skip``, ... are resized to
    ``img_size`` and divided by 255.0.

    Args:
        video_path: Path of the video file to open.
        frame_skip: Keep one frame out of every ``frame_skip`` decoded
            frames. Must be at least 1.
        img_size: Target size passed to ``cv2.resize``.

    Returns:
        Array with one entry per kept frame. It is an empty array when no
        frame could be read.
        NOTE(review): frames are used in the channel order OpenCV delivers
        them -- confirm this matches what the image model was trained on.

    Raises:
        ValueError: If ``frame_skip`` is smaller than 1.
    """
    if frame_skip < 1:
        raise ValueError("frame_skip must be a positive integer")

    cap = cv2.VideoCapture(video_path)
    frames = []
    # Count decoded frames separately from kept frames. Testing
    # len(frames) % frame_skip gets stuck at 1 after the first frame is
    # appended, so no later frame would ever be selected.
    frame_index = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_index % frame_skip == 0:
                frame = cv2.resize(frame, img_size)
                frames.append(frame / 255.0)
            frame_index += 1
    finally:
        # Release the capture even if resizing raises.
        cap.release()
    return np.array(frames)

def predict_emotion(video_path):
    """Predict the emotion of a video from its frames and from its audio.

    The image model scores every sampled frame; the scores are averaged
    over frames and the highest averaged score selects the frame-based
    label. The audio model scores the single audio feature tensor.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        Tuple ``(frame_pred_emotion, audio_pred_emotion, frame_confidence,
        audio_confidence)``. Each confidence is the model score of the label
        returned for that modality.
        NOTE(review): scores are presumably softmax probabilities -- confirm
        against the model definitions.

    Raises:
        ValueError: If no frame could be read from the video.
    """
    frames = extract_frames(video_path)
    if len(frames) == 0:
        # Fail with a clear message instead of handing an empty array to the model.
        raise ValueError(f"No frames could be read from video: {video_path}")

    frame_predictions = image_model.predict(frames)
    mean_frame_scores = np.mean(frame_predictions, axis=0)
    frame_pred_label = np.argmax(mean_frame_scores)
    frame_pred_emotion = label_encoder.inverse_transform([frame_pred_label])[0]
    # Report the averaged score of the label that was actually chosen. The
    # mean of per-frame maxima can include votes for other labels and so
    # overstate confidence in this one; with one frame both are identical.
    frame_confidence = mean_frame_scores[frame_pred_label]

    audio_features = extract_audio_features(video_path)
    audio_predictions = audio_model.predict(audio_features)
    audio_pred_label = np.argmax(audio_predictions, axis=1)[0]
    audio_pred_emotion = label_encoder.inverse_transform([audio_pred_label])[0]
    audio_confidence = np.max(audio_predictions)

    return frame_pred_emotion, audio_pred_emotion, frame_confidence, audio_confidence

def get_actual_emotion_from_filename(filename):
    """Look up the labelled emotion encoded in a video file name.

    The emotion code is the third dash-separated field of the file name
    and is translated through ``emotion_map``.

    Args:
        filename: File name, or a path whose last component is the file name.

    Returns:
        The emotion name from ``emotion_map``, or ``"Unknown"`` when the
        code is not in the map or the name has fewer than three fields.
    """
    # Drop directory components so dashes in folder names cannot shift the fields.
    fields = os.path.basename(filename).split("-")
    if len(fields) < 3:
        # Malformed name: use the same fallback as an unrecognised code.
        return "Unknown"
    return emotion_map.get(fields[2], "Unknown")

# Evaluate every video: predict from frames and from audio, keep the more
# confident of the two as the final answer, and tally how often each of the
# three predictions matches the emotion encoded in the file name.
frame_correct = 0
audio_correct = 0
final_correct = 0
total = 0

video_dir = "/content/ravdess_video_speech"
for actor_folder in sorted(os.listdir(video_dir)):
    actor_path = os.path.join(video_dir, actor_folder)
    if not os.path.isdir(actor_path):
        continue

    for video_file in sorted(os.listdir(actor_path)):
        if not video_file.endswith(".mp4"):
            continue
        # NOTE(review): files starting with "02" are skipped -- presumably
        # the video-only recordings that carry no audio track; confirm.
        if video_file.startswith("02"):
            continue
        video_path = os.path.join(actor_path, video_file)

        frame_emotion, audio_emotion, frame_confidence, audio_confidence = predict_emotion(video_path)
        actual_emotion = get_actual_emotion_from_filename(video_file)

        print(f"Video: {video_file}")
        print(f"Predicted Emotion from Frames: {frame_emotion} (Confidence: {frame_confidence:.2f})")
        print(f"Predicted Emotion from Audio: {audio_emotion} (Confidence: {audio_confidence:.2f})")

        # Late fusion: trust whichever modality is more confident (ties go to audio).
        if frame_confidence > audio_confidence:
            final_predicted_emotion = frame_emotion
        else:
            final_predicted_emotion = audio_emotion

        # Predicted labels are lower-case while emotion_map values are
        # capitalised, so compare case-insensitively -- once per prediction.
        expected = actual_emotion.lower()
        frame_match = frame_emotion.lower() == expected
        audio_match = audio_emotion.lower() == expected
        final_match = final_predicted_emotion.lower() == expected

        print(f"Final Predicted Emotion: {final_predicted_emotion}")
        print(f"Actual Emotion: {actual_emotion}")
        print(f"Match: {final_match}")
        print("-" * 50)

        frame_correct += int(frame_match)
        audio_correct += int(audio_match)
        final_correct += int(final_match)
        total += 1

if total == 0:
    # Without this guard the divisions below fail with a bare ZeroDivisionError.
    raise RuntimeError(f"No videos were evaluated under {video_dir}; cannot compute accuracy")

frame_accuracy = frame_correct / total * 100
audio_accuracy = audio_correct / total * 100
final_accuracy = final_correct / total * 100

print(f"Frame Emotion Accuracy: {frame_accuracy:.2f}%")
print(f"Audio Emotion Accuracy: {audio_accuracy:.2f}%")
print(f"Final Predicted Emotion Accuracy: {final_accuracy:.2f}%")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Match: True
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Video: 01-01-07-01-01-02-15.mp4
Predicted Emotion from Frames: disgust (Confidence: 0.99)
Predicted Emotion from Audio: disgust (Confidence: 1.00)
Final Predicted Emotion: disgust
Actual Emotion: Disgust
Match: True
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
Video: 01-01-07-01-02-01-15.mp4
Predicted Emotion from Frames: disgust (Confidence: 0.98)
Predicted Emotion from Audio: disgust (Confidence: 1.00)
Final Predicted Emotion: disgust
Actual Emotion: Disgust
Match: True
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m