*Importing the required dependencies*

In [1]:
import cv2
import numpy as np
import soundfile as sf
from tensorflow.keras.models import load_model

In [4]:
import librosa
import pyaudio

*Loading both the face-emotion and voice-emotion models*

In [5]:
# Face emotion detection model.
# The architecture JSON is read into `model_json_face`, but the model object is
# built by `load_model` from the HDF5 file below; the JSON string is not used
# anywhere else in this cell.
# NOTE(review): if "model_weights.h5" holds weights only, the model would have
# to be rebuilt from `model_json_face` first — confirm what the file contains.
# A `with` block guarantees the file handle is closed even if `read()` raises.
with open("model_architecture.json", "r") as json_file_face:
    model_json_face = json_file_face.read()
model_face = load_model("model_weights.h5")

# Voice emotion detection model.
model_voice = load_model('voice_model.h5')

In [6]:
# --- Face detector -----------------------------------------------------------
# Frontal-face Haar cascade taken from the data directory that OpenCV exposes
# as `cv2.data.haarcascades`.
haar_file = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(haar_file)

# --- Microphone capture ------------------------------------------------------
# Mono, 16-bit input read in blocks of `chunk_size` frames.
# NOTE: `format` shadows the Python builtin of the same name; the name is kept
# because other cells may refer to it.
chunk_size, channels, sample_rate_voice = 1024, 1, 44100
format = pyaudio.paInt16
p = pyaudio.PyAudio()
stream_voice = p.open(
    rate=sample_rate_voice,
    channels=channels,
    format=format,
    frames_per_buffer=chunk_size,
    input=True,
)

# --- Class-index -> label lookups -------------------------------------------
# Each dict maps the argmax index of a model's prediction to an emotion name.

# Labels for face emotion detection
labels_face = dict(enumerate(
    ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
))

# Labels for voice emotion detection
labels_voice = dict(enumerate(
    ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
))

*Real Time Detection*

In [7]:
# Real-time loop: for every webcam frame, detect faces and label each one with
# the face model; in parallel, collect microphone audio and run the voice model
# once `buffer_duration` seconds have been gathered. Press 'q' in the video
# window to stop.
#
# Requires, from earlier cells: `face_cascade`, `model_face`, `model_voice`,
# `labels_face`, `labels_voice`, `stream_voice`, `p` (the PyAudio instance),
# `chunk_size` and `sample_rate_voice`.
# NOTE(review): `extract_realtime_features` is not defined in the visible cells
# (execution counts jump from In [1] to In [4]); it must be defined before this
# cell runs, otherwise the voice branch raises NameError.
camera = cv2.VideoCapture(0)
buffer_duration = 5  # seconds of audio gathered before each voice prediction
window_samples = buffer_duration * sample_rate_voice

# Start with an empty buffer and let it grow. The previous fixed-size rolling
# buffer always stayed at `chunk_size` samples, so the length check below was
# never satisfied and the voice model was never invoked.
buffer_voice = np.zeros(0, dtype=np.int16)

try:
    while True:
        ret, frame = camera.read()
        if not ret:
            print("Failed to capture frame.")
            break

        # Face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)

        # Descriptive loop names on purpose: the old `(p, q, r, s)` overwrote
        # the module-level PyAudio instance `p`, making it impossible to
        # terminate during cleanup.
        for (face_x, face_y, face_w, face_h) in faces:
            face_roi = gray[face_y:face_y + face_h, face_x:face_x + face_w]
            cv2.rectangle(frame, (face_x, face_y),
                          (face_x + face_w, face_y + face_h), (255, 0, 0), 2)
            # Resize to 48x48 and scale to [0, 1] before feeding the model.
            face_roi = cv2.resize(face_roi, (48, 48))
            img_face = np.array(face_roi).reshape(1, 48, 48, 1) / 255.0
            # verbose=0 keeps per-frame progress bars out of the cell output.
            pred_face = model_face.predict(img_face, verbose=0)
            emotion_label_face = labels_face[pred_face.argmax()]
            cv2.putText(frame, emotion_label_face, (face_x - 10, face_y - 10),
                        cv2.FONT_HERSHEY_COMPLEX_SMALL, 2, (0, 0, 255))

        # Voice detection
        # Read everything that has queued up since the last frame (at least one
        # chunk) so audio is not dropped when the video loop is slower than the
        # microphone; do not raise if the input buffer overflowed meanwhile.
        frames_ready = max(chunk_size, stream_voice.get_read_available())
        data_voice = np.frombuffer(
            stream_voice.read(frames_ready, exception_on_overflow=False),
            dtype=np.int16,
        )
        buffer_voice = np.concatenate((buffer_voice, data_voice))

        if len(buffer_voice) >= window_samples:
            # Process the most recent `buffer_duration` seconds of audio.
            features_voice = extract_realtime_features(
                buffer_voice[-window_samples:], sample_rate_voice
            )
            # NOTE(review): the axis=2 expansion presumes the helper returns a
            # 2-D array — confirm against its definition.
            features_voice = np.expand_dims(features_voice, axis=2)

            # Make prediction
            pred_voice = model_voice.predict(features_voice, verbose=0)
            emotion_label_voice = labels_voice[pred_voice.argmax()]

            print("Predicted Emotion (Voice): ", emotion_label_voice)

            # Begin collecting a fresh window.
            buffer_voice = np.zeros(0, dtype=np.int16)

        # Display the combined result
        cv2.imshow("Combined Emotion Recognition ", frame)

        # Mask to the low byte so the comparison only sees the key code.
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):  # Press 'q' to exit
            break
finally:
    # Cleanup runs even on an exception or a kernel interrupt, so the camera
    # and microphone are always released. The audio stream is closed here, so
    # the setup cell must be re-run before running this cell again.
    stream_voice.stop_stream()
    stream_voice.close()
    p.terminate()
    camera.release()
    cv2.destroyAllWindows()


